author      Pawel Jakub Dawidek <pjd@FreeBSD.org>    2007-04-06 01:09:06 +0000
committer   Pawel Jakub Dawidek <pjd@FreeBSD.org>    2007-04-06 01:09:06 +0000
commit      f0a75d274af375d15b97b830966b99a02b7db911 (patch)
tree        3061c8734d9ce560165e672836837a0f411a83c9 /sys/contrib
parent      c8c0ba192e3ac07c3797a1bbca52e5575e019890 (diff)
download    src-f0a75d274af375d15b97b830966b99a02b7db911.tar.gz
            src-f0a75d274af375d15b97b830966b99a02b7db911.zip
Please welcome ZFS - The last word in file systems.
The ZFS file system was ported from the OpenSolaris operating system. The code is under the CDDL license. I'd like to thank all Sun developers that created this great piece of software.

Supported by:	Wheel LTD (http://www.wheel.pl/)
Supported by:	The FreeBSD Foundation (http://www.freebsdfoundation.org/)
Supported by:	Sentex (http://www.sentex.net/)
Notes:
    svn path=/head/; revision=168404
Diffstat (limited to 'sys/contrib')
-rw-r--r--  sys/contrib/opensolaris/common/acl/acl_common.c  217
-rw-r--r--  sys/contrib/opensolaris/common/acl/acl_common.h  56
-rw-r--r--  sys/contrib/opensolaris/common/avl/avl.c  968
-rw-r--r--  sys/contrib/opensolaris/common/nvpair/nvpair.c  2953
-rw-r--r--  sys/contrib/opensolaris/common/nvpair/nvpair_alloc_fixed.c  118
-rw-r--r--  sys/contrib/opensolaris/common/zfs/zfs_namecheck.c  287
-rw-r--r--  sys/contrib/opensolaris/common/zfs/zfs_namecheck.h  56
-rw-r--r--  sys/contrib/opensolaris/common/zfs/zfs_prop.c  657
-rw-r--r--  sys/contrib/opensolaris/common/zfs/zfs_prop.h  56
-rw-r--r--  sys/contrib/opensolaris/uts/common/Makefile.files  101
-rw-r--r--  sys/contrib/opensolaris/uts/common/arch/amd64/atomic.S  564
-rw-r--r--  sys/contrib/opensolaris/uts/common/arch/i386/atomic.S  659
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/dnlc.c  824
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/gfs.c  882
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/arc.c  2829
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c  312
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c  2240
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c  1029
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c  160
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c  1034
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c  1009
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c  888
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c  992
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c  655
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c  1370
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c  621
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c  1889
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c  1192
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c  255
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c  501
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c  196
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c  145
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c  69
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c  129
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c  1023
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c  194
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c  131
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/spa.c  3265
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c  361
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c  440
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c  354
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c  1126
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c  501
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h  109
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h  89
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h  334
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h  586
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h  237
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h  125
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h  120
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h  134
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h  75
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h  267
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h  185
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h  142
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h  82
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h  77
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h  77
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h  69
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h  81
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h  103
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h  491
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h  168
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h  162
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h  120
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h  77
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h  50
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h  63
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h  56
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h  132
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h  52
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h  46
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h  298
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h  359
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h  204
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h  234
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h  115
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h  122
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h  71
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h  75
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h  71
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h  162
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h  89
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h  100
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h  298
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h  276
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h  111
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h  366
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h  75
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h  82
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h  205
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h  68
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/txg.c  611
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c  63
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/unique.c  107
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c  1905
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c  394
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c  363
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c  225
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c  432
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c  1011
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c  495
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c  89
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c  323
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c  1223
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c  118
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zap.c  1070
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c  741
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c  855
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf  28
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c  1607
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c  99
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c  1120
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c  796
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c  336
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c  1811
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c  348
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c  424
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c  594
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c  986
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c  3227
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c  1061
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zil.c  1607
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zio.c  1853
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c  172
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c  148
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c  315
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c  796
-rw-r--r--  sys/contrib/opensolaris/uts/common/os/callb.c  363
-rw-r--r--  sys/contrib/opensolaris/uts/common/os/list.c  193
-rw-r--r--  sys/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c  63
-rw-r--r--  sys/contrib/opensolaris/uts/common/os/taskq.c  1020
-rw-r--r--  sys/contrib/opensolaris/uts/common/rpc/xdr.c  671
-rw-r--r--  sys/contrib/opensolaris/uts/common/rpc/xdr.h  605
-rw-r--r--  sys/contrib/opensolaris/uts/common/rpc/xdr_array.c  123
-rw-r--r--  sys/contrib/opensolaris/uts/common/rpc/xdr_mem.c  209
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/asm_linkage.h  304
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/atomic.h  431
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/avl.h  298
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/avl_impl.h  164
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/bitmap.h  194
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/byteorder.h  137
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/callb.h  214
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/ccompile.h  127
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/compress.h  46
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/cred.h  154
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/debug.h  129
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/dkio.h  477
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/dklabel.h  268
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/dnlc.h  232
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/errorq.h  83
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/feature_tests.h  397
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h  75
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/fm/protocol.h  301
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/fm/util.h  103
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/fs/zfs.h  434
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/gfs.h  139
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/isa_defs.h  479
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/list.h  63
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/list_impl.h  53
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/note.h  56
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/nvpair.h  260
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/nvpair_impl.h  73
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/processor.h  146
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/procset.h  160
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/sdt.h  176
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/synch.h  161
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/sysevent.h  227
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/sysmacros.h  287
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/vfs.h  569
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/vmem.h  142
-rw-r--r--  sys/contrib/opensolaris/uts/common/sys/zmod.h  68
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/adler32.c  149
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/crc32.c  428
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/crc32.h  443
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/deflate.c  1742
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/deflate.h  331
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/inffast.c  320
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/inffast.h  13
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/inffixed.h  96
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/inflate.c  1395
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/inflate.h  117
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/inftrees.c  331
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/inftrees.h  57
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/trees.c  1219
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/zconf.h  117
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/zlib.h  1359
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/zmod.c  109
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/zmod_subr.c  84
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/zutil.c  324
-rw-r--r--  sys/contrib/opensolaris/uts/common/zmod/zutil.h  274
191 files changed, 88133 insertions, 0 deletions
diff --git a/sys/contrib/opensolaris/common/acl/acl_common.c b/sys/contrib/opensolaris/common/acl/acl_common.c
new file mode 100644
index 000000000000..2f32e7a8a78a
--- /dev/null
+++ b/sys/contrib/opensolaris/common/acl/acl_common.c
@@ -0,0 +1,217 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/acl.h>
+#include <sys/stat.h>
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#include <sys/debug.h>
+#else
+#include <errno.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <assert.h>
+#define ASSERT assert
+#endif
+
+
+ace_t trivial_acl[] = {
+ {-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE},
+ {-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
+ ACE_WRITE_NAMED_ATTRS, ACE_OWNER, ACE_ACCESS_ALLOWED_ACE_TYPE},
+ {-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, ACE_ACCESS_DENIED_ACE_TYPE},
+ {-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, ACE_ACCESS_ALLOWED_ACE_TYPE},
+ {-1, ACE_WRITE_ACL|ACE_WRITE_OWNER| ACE_WRITE_ATTRIBUTES|
+ ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE, ACE_ACCESS_DENIED_ACE_TYPE},
+ {-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE, ACE_EVERYONE, ACE_ACCESS_ALLOWED_ACE_TYPE}
+};
+
+
+void
+adjust_ace_pair(ace_t *pair, mode_t mode)
+{
+ if (mode & S_IROTH)
+ pair[1].a_access_mask |= ACE_READ_DATA;
+ else
+ pair[0].a_access_mask |= ACE_READ_DATA;
+ if (mode & S_IWOTH)
+ pair[1].a_access_mask |=
+ ACE_WRITE_DATA|ACE_APPEND_DATA;
+ else
+ pair[0].a_access_mask |=
+ ACE_WRITE_DATA|ACE_APPEND_DATA;
+ if (mode & S_IXOTH)
+ pair[1].a_access_mask |= ACE_EXECUTE;
+ else
+ pair[0].a_access_mask |= ACE_EXECUTE;
+}
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t acl is trivial
+ *
+ * Trivialness implies that the ACL is composed of only
+ * owner@, group@ and everyone@ entries. The ACL can't
+ * have read_acl denied, and write_owner/write_acl/write_attributes
+ * can only be in an owner@ entry.
+ */
+int
+ace_trivial(ace_t *acep, int aclcnt)
+{
+ int i;
+ int owner_seen = 0;
+ int group_seen = 0;
+ int everyone_seen = 0;
+
+ for (i = 0; i != aclcnt; i++) {
+ switch (acep[i].a_flags & 0xf040) {
+ case ACE_OWNER:
+ if (group_seen || everyone_seen)
+ return (1);
+ owner_seen++;
+ break;
+ case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+ if (everyone_seen || owner_seen == 0)
+ return (1);
+ group_seen++;
+ break;
+
+ case ACE_EVERYONE:
+ if (owner_seen == 0 || group_seen == 0)
+ return (1);
+ everyone_seen++;
+ break;
+ default:
+ return (1);
+
+ }
+
+ if (acep[i].a_flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+ ACE_INHERIT_ONLY_ACE))
+ return (1);
+
+ /*
+ * Special check for some special bits
+ *
+ * Don't allow anybody to deny reading basic
+ * attributes or a file's ACL.
+ */
+ if ((acep[i].a_access_mask &
+ (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ (acep[i].a_type == ACE_ACCESS_DENIED_ACE_TYPE))
+ return (1);
+
+ /*
+ * Only owner@ may be allowed
+ * write_acl/write_owner/write_attributes.
+ */
+ if (acep[i].a_type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ (!(acep[i].a_flags & ACE_OWNER) && (acep[i].a_access_mask &
+ (ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES))))
+ return (1);
+ }
+
+ if ((owner_seen == 0) || (group_seen == 0) || (everyone_seen == 0))
+ return (1);
+
+ return (0);
+}
+
+
+/*
+ * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified.
+ * v = Ptr to array/vector of objs
+ * n = # objs in the array
+ * s = size of each obj (must be a multiple of the word size)
+ * f = ptr to function to compare two objs,
+ *     which returns -1 (less than), 0 (equal) or 1 (greater than)
+ */
+void
+ksort(caddr_t v, int n, int s, int (*f)())
+{
+ int g, i, j, ii;
+ unsigned int *p1, *p2;
+ unsigned int tmp;
+
+ /* No work to do */
+ if (v == NULL || n <= 1)
+ return;
+
+ /* Sanity check on arguments */
+ ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0);
+ ASSERT(s > 0);
+ for (g = n / 2; g > 0; g /= 2) {
+ for (i = g; i < n; i++) {
+ for (j = i - g; j >= 0 &&
+ (*f)(v + j * s, v + (j + g) * s) == 1;
+ j -= g) {
+ p1 = (void *)(v + j * s);
+ p2 = (void *)(v + (j + g) * s);
+ for (ii = 0; ii < s / 4; ii++) {
+ tmp = *p1;
+ *p1++ = *p2;
+ *p2++ = tmp;
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Compare two acls, all fields. Returns:
+ * -1 (less than)
+ * 0 (equal)
+ * +1 (greater than)
+ */
+int
+cmp2acls(void *a, void *b)
+{
+ aclent_t *x = (aclent_t *)a;
+ aclent_t *y = (aclent_t *)b;
+
+ /* Compare types */
+ if (x->a_type < y->a_type)
+ return (-1);
+ if (x->a_type > y->a_type)
+ return (1);
+ /* Equal types; compare id's */
+ if (x->a_id < y->a_id)
+ return (-1);
+ if (x->a_id > y->a_id)
+ return (1);
+ /* Equal ids; compare perms */
+ if (x->a_perm < y->a_perm)
+ return (-1);
+ if (x->a_perm > y->a_perm)
+ return (1);
+ /* Totally equal */
+ return (0);
+}
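The trivial_acl template and adjust_ace_pair() above are meant to be used together: a caller copies the six-entry template and then folds the owner, group and other bits of a POSIX mode into the three deny/allow pairs, after which ace_trivial() should accept the result. A minimal sketch of that pattern (not part of the committed diff; build_trivial_acl() is a hypothetical name, and the kernel environment of acl_common.c is assumed for bcopy() and ASSERT()):

/*
 * Usage sketch only.  Build a six-entry trivial ACE list from a POSIX
 * mode: entries 0-1 are the owner@ deny/allow pair, 2-3 group@, and
 * 4-5 everyone@.  Each class' rwx bits are shifted into the "other"
 * position because adjust_ace_pair() tests S_IROTH/S_IWOTH/S_IXOTH.
 */
static void
build_trivial_acl(ace_t acep[6], mode_t mode)
{
	bcopy(trivial_acl, acep, sizeof (trivial_acl));
	adjust_ace_pair(&acep[0], (mode & 0700) >> 6);	/* owner@ pair */
	adjust_ace_pair(&acep[2], (mode & 0070) >> 3);	/* group@ pair */
	adjust_ace_pair(&acep[4], mode & 0007);		/* everyone@ pair */
	ASSERT(ace_trivial(acep, 6) == 0);
}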
diff --git a/sys/contrib/opensolaris/common/acl/acl_common.h b/sys/contrib/opensolaris/common/acl/acl_common.h
new file mode 100644
index 000000000000..2227ad77ea93
--- /dev/null
+++ b/sys/contrib/opensolaris/common/acl/acl_common.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ACL_ACL_UTILS_H
+#define _ACL_ACL_UTILS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+
+#include <sys/types.h>
+#include <sys/acl.h>
+#include <sys/stat.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern ace_t trivial_acl[6];
+
+extern int acltrivial(const char *);
+extern void adjust_ace_pair(ace_t *pair, mode_t mode);
+extern int ace_trivial(ace_t *acep, int aclcnt);
+void ksort(caddr_t v, int n, int s, int (*f)());
+int cmp2acls(void *a, void *b);
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ACL_ACL_UTILS_H */
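Besides the ACE helpers, this header exports ksort() and cmp2acls() for putting aclent_t-style ACLs into canonical order (by type, then id, then perm). A minimal calling sketch (not part of the committed diff; sort_acl_entries() is a hypothetical name):

/*
 * Usage sketch only.  Sort an aclent_t array into canonical order with
 * the helpers declared above.  ksort() requires the element size to be
 * a multiple of the word size, which holds for aclent_t.
 */
#include "acl_common.h"

static void
sort_acl_entries(aclent_t *aclp, int nentries)
{
	ksort((caddr_t)aclp, nentries, sizeof (aclent_t), cmp2acls);
}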
diff --git a/sys/contrib/opensolaris/common/avl/avl.c b/sys/contrib/opensolaris/common/avl/avl.c
new file mode 100644
index 000000000000..bbdbe08c4029
--- /dev/null
+++ b/sys/contrib/opensolaris/common/avl/avl.c
@@ -0,0 +1,968 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+
+/*
+ * AVL - generic AVL tree implementation for kernel use
+ *
+ * A complete description of AVL trees can be found in many CS textbooks.
+ *
+ * Here is a very brief overview. An AVL tree is a binary search tree that is
+ * almost perfectly balanced. By "almost" perfectly balanced, we mean that at
+ * any given node, the left and right subtrees are allowed to differ in height
+ * by at most 1 level.
+ *
+ * This relaxation from a perfectly balanced binary tree allows doing
+ * insertion and deletion relatively efficiently. Searching the tree is
+ * still a fast operation, roughly O(log(N)).
+ *
+ * The key to insertion and deletion is a set of tree manipulations called
+ * rotations, which bring unbalanced subtrees back into the semi-balanced state.
+ *
+ * This implementation of AVL trees has the following peculiarities:
+ *
+ * - The AVL specific data structures are physically embedded as fields
+ * in the "using" data structures. To maintain generality the code
+ * must constantly translate between "avl_node_t *" and containing
+ * data structure "void *"s by adding/subtracting the avl_offset.
+ *
+ * - Since the AVL data is always embedded in other structures, there is
+ * no locking or memory allocation in the AVL routines. This must be
+ * provided for by the enclosing data structure's semantics. Typically,
+ * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of
+ * exclusive write lock. Other operations require a read lock.
+ *
+ * - The implementation uses iteration instead of explicit recursion,
+ * since it is intended to run on limited size kernel stacks. Since
+ * there is no recursion stack present to move "up" in the tree,
+ * there is an explicit "parent" link in the avl_node_t.
+ *
+ * - The left/right children pointers of a node are in an array.
+ * In the code, variables (instead of constants) are used to represent
+ * left and right indices. The implementation is written as if it only
+ * dealt with left handed manipulations. By changing the value assigned
+ * to "left", the code also works for right handed trees. The
+ * following variables/terms are frequently used:
+ *
+ * int left; // 0 when dealing with left children,
+ * // 1 for dealing with right children
+ *
+ * int left_heavy; // -1 when left subtree is taller at some node,
+ * // +1 when right subtree is taller
+ *
+ * int right; // will be the opposite of left (0 or 1)
+ * int right_heavy;// will be the opposite of left_heavy (-1 or 1)
+ *
+ * int direction; // 0 for "<" (ie. left child); 1 for ">" (right)
+ *
+ * Though it is a little more confusing to read the code, the approach
+ * allows using half as much code (and hence cache footprint) for tree
+ * manipulations and eliminates many conditional branches.
+ *
+ * - The avl_index_t is an opaque "cookie" used to find nodes at or
+ * adjacent to where a new value would be inserted in the tree. The value
+ * is a modified "avl_node_t *". The bottom bit (normally 0 for a
+ * pointer) is set to indicate that the new node has a value greater
+ * than the value of the indicated "avl_node_t *".
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/debug.h>
+#include <sys/avl.h>
+
+/*
+ * Small arrays to translate between balance (or diff) values and child indices.
+ *
+ * Code that deals with binary tree data structures will randomly use
+ * left and right children when examining a tree. C "if()" statements
+ * which evaluate randomly suffer from very poor hardware branch prediction.
+ * In this code we avoid some of the branch mispredictions by using the
+ * following translation arrays. They replace random branches with an
+ * additional memory reference. Since the translation arrays are both very
+ * small the data should remain efficiently in cache.
+ */
+static const int avl_child2balance[2] = {-1, 1};
+static const int avl_balance2child[] = {0, 0, 1};
+
+
+/*
+ * Walk from one node to the previous valued node (ie. an infix walk
+ * towards the left). At any given node we do one of 2 things:
+ *
+ * - If there is a left child, go to it, then to its rightmost descendant.
+ *
+ * - otherwise we return thru parent nodes until we've come from a right child.
+ *
+ * Return Value:
+ * NULL - if at the end of the nodes
+ * otherwise next node
+ */
+void *
+avl_walk(avl_tree_t *tree, void *oldnode, int left)
+{
+ size_t off = tree->avl_offset;
+ avl_node_t *node = AVL_DATA2NODE(oldnode, off);
+ int right = 1 - left;
+ int was_child;
+
+
+ /*
+ * nowhere to walk to if tree is empty
+ */
+ if (node == NULL)
+ return (NULL);
+
+ /*
+ * Visit the previous valued node. There are two possibilities:
+ *
+ * If this node has a left child, go down one left, then all
+ * the way right.
+ */
+ if (node->avl_child[left] != NULL) {
+ for (node = node->avl_child[left];
+ node->avl_child[right] != NULL;
+ node = node->avl_child[right])
+ ;
+ /*
+ * Otherwise, return thru left children as far as we can.
+ */
+ } else {
+ for (;;) {
+ was_child = AVL_XCHILD(node);
+ node = AVL_XPARENT(node);
+ if (node == NULL)
+ return (NULL);
+ if (was_child == right)
+ break;
+ }
+ }
+
+ return (AVL_NODE2DATA(node, off));
+}
+
+/*
+ * Return the lowest valued node in a tree or NULL.
+ * (leftmost child from root of tree)
+ */
+void *
+avl_first(avl_tree_t *tree)
+{
+ avl_node_t *node;
+ avl_node_t *prev = NULL;
+ size_t off = tree->avl_offset;
+
+ for (node = tree->avl_root; node != NULL; node = node->avl_child[0])
+ prev = node;
+
+ if (prev != NULL)
+ return (AVL_NODE2DATA(prev, off));
+ return (NULL);
+}
+
+/*
+ * Return the highest valued node in a tree or NULL.
+ * (rightmost child from root of tree)
+ */
+void *
+avl_last(avl_tree_t *tree)
+{
+ avl_node_t *node;
+ avl_node_t *prev = NULL;
+ size_t off = tree->avl_offset;
+
+ for (node = tree->avl_root; node != NULL; node = node->avl_child[1])
+ prev = node;
+
+ if (prev != NULL)
+ return (AVL_NODE2DATA(prev, off));
+ return (NULL);
+}
+
+/*
+ * Access the node immediately before or after an insertion point.
+ *
+ * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child
+ *
+ * Return value:
+ * NULL: no node in the given direction
+ * "void *" of the found tree node
+ */
+void *
+avl_nearest(avl_tree_t *tree, avl_index_t where, int direction)
+{
+ int child = AVL_INDEX2CHILD(where);
+ avl_node_t *node = AVL_INDEX2NODE(where);
+ void *data;
+ size_t off = tree->avl_offset;
+
+ if (node == NULL) {
+ ASSERT(tree->avl_root == NULL);
+ return (NULL);
+ }
+ data = AVL_NODE2DATA(node, off);
+ if (child != direction)
+ return (data);
+
+ return (avl_walk(tree, data, direction));
+}
+
+
+/*
+ * Search for the node which contains "value". The algorithm is a
+ * simple binary tree search.
+ *
+ * return value:
+ * NULL: the value is not in the AVL tree
+ * *where (if not NULL) is set to indicate the insertion point
+ * "void *" of the found tree node
+ */
+void *
+avl_find(avl_tree_t *tree, void *value, avl_index_t *where)
+{
+ avl_node_t *node;
+ avl_node_t *prev = NULL;
+ int child = 0;
+ int diff;
+ size_t off = tree->avl_offset;
+
+ for (node = tree->avl_root; node != NULL;
+ node = node->avl_child[child]) {
+
+ prev = node;
+
+ diff = tree->avl_compar(value, AVL_NODE2DATA(node, off));
+ ASSERT(-1 <= diff && diff <= 1);
+ if (diff == 0) {
+#ifdef DEBUG
+ if (where != NULL)
+ *where = 0;
+#endif
+ return (AVL_NODE2DATA(node, off));
+ }
+ child = avl_balance2child[1 + diff];
+
+ }
+
+ if (where != NULL)
+ *where = AVL_MKINDEX(prev, child);
+
+ return (NULL);
+}
+
+
+/*
+ * Perform a rotation to restore balance at the subtree given by depth.
+ *
+ * This routine is used by both insertion and deletion. The return value
+ * indicates:
+ * 0 : subtree did not change height
+ * !0 : subtree was reduced in height
+ *
+ * The code is written as if handling left rotations, right rotations are
+ * symmetric and handled by swapping values of variables right/left[_heavy]
+ *
+ * On input balance is the "new" balance at "node". This value is either
+ * -2 or +2.
+ */
+static int
+avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance)
+{
+ int left = !(balance < 0); /* when balance = -2, left will be 0 */
+ int right = 1 - left;
+ int left_heavy = balance >> 1;
+ int right_heavy = -left_heavy;
+ avl_node_t *parent = AVL_XPARENT(node);
+ avl_node_t *child = node->avl_child[left];
+ avl_node_t *cright;
+ avl_node_t *gchild;
+ avl_node_t *gright;
+ avl_node_t *gleft;
+ int which_child = AVL_XCHILD(node);
+ int child_bal = AVL_XBALANCE(child);
+
+ /* BEGIN CSTYLED */
+ /*
+ * case 1 : node is overly left heavy, the left child is balanced or
+ * also left heavy. This requires the following rotation.
+ *
+ * (node bal:-2)
+ * / \
+ * / \
+ * (child bal:0 or -1)
+ * / \
+ * / \
+ * cright
+ *
+ * becomes:
+ *
+ * (child bal:1 or 0)
+ * / \
+ * / \
+ * (node bal:-1 or 0)
+ * / \
+ * / \
+ * cright
+ *
+ * we detect this situation by noting that child's balance is not
+ * right_heavy.
+ */
+ /* END CSTYLED */
+ if (child_bal != right_heavy) {
+
+ /*
+ * compute new balance of nodes
+ *
+ * If child used to be left heavy (now balanced) we reduced
+ * the height of this sub-tree -- used in "return...;" below
+ */
+ child_bal += right_heavy; /* adjust towards right */
+
+ /*
+ * move "cright" to be node's left child
+ */
+ cright = child->avl_child[right];
+ node->avl_child[left] = cright;
+ if (cright != NULL) {
+ AVL_SETPARENT(cright, node);
+ AVL_SETCHILD(cright, left);
+ }
+
+ /*
+ * move node to be child's right child
+ */
+ child->avl_child[right] = node;
+ AVL_SETBALANCE(node, -child_bal);
+ AVL_SETCHILD(node, right);
+ AVL_SETPARENT(node, child);
+
+ /*
+ * update the pointer into this subtree
+ */
+ AVL_SETBALANCE(child, child_bal);
+ AVL_SETCHILD(child, which_child);
+ AVL_SETPARENT(child, parent);
+ if (parent != NULL)
+ parent->avl_child[which_child] = child;
+ else
+ tree->avl_root = child;
+
+ return (child_bal == 0);
+ }
+
+ /* BEGIN CSTYLED */
+ /*
+ * case 2 : When node is left heavy, but child is right heavy we use
+ * a different rotation.
+ *
+ * (node b:-2)
+ * / \
+ * / \
+ * / \
+ * (child b:+1)
+ * / \
+ * / \
+ * (gchild b: != 0)
+ * / \
+ * / \
+ * gleft gright
+ *
+ * becomes:
+ *
+ * (gchild b:0)
+ * / \
+ * / \
+ * / \
+ * (child b:?) (node b:?)
+ * / \ / \
+ * / \ / \
+ * gleft gright
+ *
+ * computing the new balances is more complicated. As an example:
+ * if gchild was right_heavy, then child is now left heavy
+ * else it is balanced
+ */
+ /* END CSTYLED */
+ gchild = child->avl_child[right];
+ gleft = gchild->avl_child[left];
+ gright = gchild->avl_child[right];
+
+ /*
+ * move gright to left child of node and
+ *
+ * move gleft to right child of node
+ */
+ node->avl_child[left] = gright;
+ if (gright != NULL) {
+ AVL_SETPARENT(gright, node);
+ AVL_SETCHILD(gright, left);
+ }
+
+ child->avl_child[right] = gleft;
+ if (gleft != NULL) {
+ AVL_SETPARENT(gleft, child);
+ AVL_SETCHILD(gleft, right);
+ }
+
+ /*
+ * move child to left child of gchild and
+ *
+ * move node to right child of gchild and
+ *
+ * fixup parent of all this to point to gchild
+ */
+ balance = AVL_XBALANCE(gchild);
+ gchild->avl_child[left] = child;
+ AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0));
+ AVL_SETPARENT(child, gchild);
+ AVL_SETCHILD(child, left);
+
+ gchild->avl_child[right] = node;
+ AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0));
+ AVL_SETPARENT(node, gchild);
+ AVL_SETCHILD(node, right);
+
+ AVL_SETBALANCE(gchild, 0);
+ AVL_SETPARENT(gchild, parent);
+ AVL_SETCHILD(gchild, which_child);
+ if (parent != NULL)
+ parent->avl_child[which_child] = gchild;
+ else
+ tree->avl_root = gchild;
+
+ return (1); /* the new tree is always shorter */
+}
+
+
+/*
+ * Insert a new node into an AVL tree at the specified (from avl_find()) place.
+ *
+ * Newly inserted nodes are always leaf nodes in the tree, since avl_find()
+ * searches out to the leaf positions. The avl_index_t indicates the node
+ * which will be the parent of the new node.
+ *
+ * After the node is inserted, a single rotation further up the tree may
+ * be necessary to maintain an acceptable AVL balance.
+ */
+void
+avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where)
+{
+ avl_node_t *node;
+ avl_node_t *parent = AVL_INDEX2NODE(where);
+ int old_balance;
+ int new_balance;
+ int which_child = AVL_INDEX2CHILD(where);
+ size_t off = tree->avl_offset;
+
+ ASSERT(tree);
+#ifdef _LP64
+ ASSERT(((uintptr_t)new_data & 0x7) == 0);
+#endif
+
+ node = AVL_DATA2NODE(new_data, off);
+
+ /*
+ * First, add the node to the tree at the indicated position.
+ */
+ ++tree->avl_numnodes;
+
+ node->avl_child[0] = NULL;
+ node->avl_child[1] = NULL;
+
+ AVL_SETCHILD(node, which_child);
+ AVL_SETBALANCE(node, 0);
+ AVL_SETPARENT(node, parent);
+ if (parent != NULL) {
+ ASSERT(parent->avl_child[which_child] == NULL);
+ parent->avl_child[which_child] = node;
+ } else {
+ ASSERT(tree->avl_root == NULL);
+ tree->avl_root = node;
+ }
+ /*
+ * Now, back up the tree modifying the balance of all nodes above the
+ * insertion point. If we get to a highly unbalanced ancestor, we
+ * need to do a rotation. If we back out of the tree we are done.
+ * If we brought any subtree into perfect balance (0), we are also done.
+ */
+ for (;;) {
+ node = parent;
+ if (node == NULL)
+ return;
+
+ /*
+ * Compute the new balance
+ */
+ old_balance = AVL_XBALANCE(node);
+ new_balance = old_balance + avl_child2balance[which_child];
+
+ /*
+ * If we introduced equal balance, then we are done immediately
+ */
+ if (new_balance == 0) {
+ AVL_SETBALANCE(node, 0);
+ return;
+ }
+
+ /*
+ * If both old and new are not zero we went
+ * from -1 to -2 balance, do a rotation.
+ */
+ if (old_balance != 0)
+ break;
+
+ AVL_SETBALANCE(node, new_balance);
+ parent = AVL_XPARENT(node);
+ which_child = AVL_XCHILD(node);
+ }
+
+ /*
+ * perform a rotation to fix the tree and return
+ */
+ (void) avl_rotation(tree, node, new_balance);
+}
+
+/*
+ * Insert "new_data" in "tree" in the given "direction" either after or
+ * before (AVL_AFTER, AVL_BEFORE) the data "here".
+ *
+ * Insertions can only be done at empty leaf points in the tree, therefore
+ * if the given child of the node is already present we move to either
+ * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since
+ * every other node in the tree is a leaf, this always works.
+ *
+ * To help developers using this interface, we assert that the new node
+ * is correctly ordered at every step of the way in DEBUG kernels.
+ */
+void
+avl_insert_here(
+ avl_tree_t *tree,
+ void *new_data,
+ void *here,
+ int direction)
+{
+ avl_node_t *node;
+ int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */
+#ifdef DEBUG
+ int diff;
+#endif
+
+ ASSERT(tree != NULL);
+ ASSERT(new_data != NULL);
+ ASSERT(here != NULL);
+ ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER);
+
+ /*
+ * If corresponding child of node is not NULL, go to the neighboring
+ * node and reverse the insertion direction.
+ */
+ node = AVL_DATA2NODE(here, tree->avl_offset);
+
+#ifdef DEBUG
+ diff = tree->avl_compar(new_data, here);
+ ASSERT(-1 <= diff && diff <= 1);
+ ASSERT(diff != 0);
+ ASSERT(diff > 0 ? child == 1 : child == 0);
+#endif
+
+ if (node->avl_child[child] != NULL) {
+ node = node->avl_child[child];
+ child = 1 - child;
+ while (node->avl_child[child] != NULL) {
+#ifdef DEBUG
+ diff = tree->avl_compar(new_data,
+ AVL_NODE2DATA(node, tree->avl_offset));
+ ASSERT(-1 <= diff && diff <= 1);
+ ASSERT(diff != 0);
+ ASSERT(diff > 0 ? child == 1 : child == 0);
+#endif
+ node = node->avl_child[child];
+ }
+#ifdef DEBUG
+ diff = tree->avl_compar(new_data,
+ AVL_NODE2DATA(node, tree->avl_offset));
+ ASSERT(-1 <= diff && diff <= 1);
+ ASSERT(diff != 0);
+ ASSERT(diff > 0 ? child == 1 : child == 0);
+#endif
+ }
+ ASSERT(node->avl_child[child] == NULL);
+
+ avl_insert(tree, new_data, AVL_MKINDEX(node, child));
+}
+
+/*
+ * Add a new node to an AVL tree.
+ */
+void
+avl_add(avl_tree_t *tree, void *new_node)
+{
+ avl_index_t where;
+
+ /*
+ * This is unfortunate. We want to call panic() here, even for
+ * non-DEBUG kernels. In userland, however, we can't depend on anything
+ * in libc or else the rtld build process gets confused. So, all we can
+ * do in userland is resort to a normal ASSERT().
+ */
+ if (avl_find(tree, new_node, &where) != NULL)
+#ifdef _KERNEL
+ panic("avl_find() succeeded inside avl_add()");
+#else
+ ASSERT(0);
+#endif
+ avl_insert(tree, new_node, where);
+}
+
+/*
+ * Delete a node from the AVL tree. Deletion is similar to insertion, but
+ * with 2 complications.
+ *
+ * First, we may be deleting an interior node. Consider the following subtree:
+ *
+ * d c c
+ * / \ / \ / \
+ * b e b e b e
+ * / \ / \ /
+ * a c a a
+ *
+ * When we are deleting node (d), we find and bring up an adjacent valued leaf
+ * node, say (c), to take the interior node's place. In the code this is
+ * handled by temporarily swapping (d) and (c) in the tree and then using
+ * common code to delete (d) from the leaf position.
+ *
+ * Secondly, an interior deletion from a deep tree may require more than one
+ * rotation to fix the balance. This is handled by moving up the tree through
+ * parents and applying rotations as needed. The return value from
+ * avl_rotation() is used to detect when a subtree did not change overall
+ * height due to a rotation.
+ */
+void
+avl_remove(avl_tree_t *tree, void *data)
+{
+ avl_node_t *delete;
+ avl_node_t *parent;
+ avl_node_t *node;
+ avl_node_t tmp;
+ int old_balance;
+ int new_balance;
+ int left;
+ int right;
+ int which_child;
+ size_t off = tree->avl_offset;
+
+ ASSERT(tree);
+
+ delete = AVL_DATA2NODE(data, off);
+
+ /*
+ * Deletion is easiest with a node that has at most 1 child.
+ * We swap a node with 2 children with a sequentially valued
+ * neighbor node. That node will have at most 1 child. Note this
+ * has no effect on the ordering of the remaining nodes.
+ *
+ * As an optimization, we choose the greater neighbor if the tree
+ * is right heavy, otherwise the left neighbor. This reduces the
+ * number of rotations needed.
+ */
+ if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) {
+
+ /*
+ * choose node to swap from whichever side is taller
+ */
+ old_balance = AVL_XBALANCE(delete);
+ left = avl_balance2child[old_balance + 1];
+ right = 1 - left;
+
+ /*
+ * get to the previous value'd node
+ * (down 1 left, as far as possible right)
+ */
+ for (node = delete->avl_child[left];
+ node->avl_child[right] != NULL;
+ node = node->avl_child[right])
+ ;
+
+ /*
+ * create a temp placeholder for 'node'
+ * move 'node' to delete's spot in the tree
+ */
+ tmp = *node;
+
+ *node = *delete;
+ if (node->avl_child[left] == node)
+ node->avl_child[left] = &tmp;
+
+ parent = AVL_XPARENT(node);
+ if (parent != NULL)
+ parent->avl_child[AVL_XCHILD(node)] = node;
+ else
+ tree->avl_root = node;
+ AVL_SETPARENT(node->avl_child[left], node);
+ AVL_SETPARENT(node->avl_child[right], node);
+
+ /*
+ * Put tmp where node used to be (just temporary).
+ * It always has a parent and at most 1 child.
+ */
+ delete = &tmp;
+ parent = AVL_XPARENT(delete);
+ parent->avl_child[AVL_XCHILD(delete)] = delete;
+ which_child = (delete->avl_child[1] != 0);
+ if (delete->avl_child[which_child] != NULL)
+ AVL_SETPARENT(delete->avl_child[which_child], delete);
+ }
+
+
+ /*
+ * Here we know "delete" is at least partially a leaf node. It can
+ * be easily removed from the tree.
+ */
+ ASSERT(tree->avl_numnodes > 0);
+ --tree->avl_numnodes;
+ parent = AVL_XPARENT(delete);
+ which_child = AVL_XCHILD(delete);
+ if (delete->avl_child[0] != NULL)
+ node = delete->avl_child[0];
+ else
+ node = delete->avl_child[1];
+
+ /*
+ * Connect parent directly to node (leaving out delete).
+ */
+ if (node != NULL) {
+ AVL_SETPARENT(node, parent);
+ AVL_SETCHILD(node, which_child);
+ }
+ if (parent == NULL) {
+ tree->avl_root = node;
+ return;
+ }
+ parent->avl_child[which_child] = node;
+
+
+ /*
+ * Since the subtree is now shorter, begin adjusting parent balances
+ * and performing any needed rotations.
+ */
+ do {
+
+ /*
+ * Move up the tree and adjust the balance
+ *
+ * Capture the parent and which_child values for the next
+ * iteration before any rotations occur.
+ */
+ node = parent;
+ old_balance = AVL_XBALANCE(node);
+ new_balance = old_balance - avl_child2balance[which_child];
+ parent = AVL_XPARENT(node);
+ which_child = AVL_XCHILD(node);
+
+ /*
+ * If a node was in perfect balance but isn't anymore then
+ * we can stop, since the height didn't change above this point
+ * due to a deletion.
+ */
+ if (old_balance == 0) {
+ AVL_SETBALANCE(node, new_balance);
+ break;
+ }
+
+ /*
+ * If the new balance is zero, we don't need to rotate
+ * else
+ * need a rotation to fix the balance.
+ * If the rotation doesn't change the height
+ * of the sub-tree we have finished adjusting.
+ */
+ if (new_balance == 0)
+ AVL_SETBALANCE(node, new_balance);
+ else if (!avl_rotation(tree, node, new_balance))
+ break;
+ } while (parent != NULL);
+}
+
+/*
+ * initialize a new AVL tree
+ */
+void
+avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *),
+ size_t size, size_t offset)
+{
+ ASSERT(tree);
+ ASSERT(compar);
+ ASSERT(size > 0);
+ ASSERT(size >= offset + sizeof (avl_node_t));
+#ifdef _LP64
+ ASSERT((offset & 0x7) == 0);
+#endif
+
+ tree->avl_compar = compar;
+ tree->avl_root = NULL;
+ tree->avl_numnodes = 0;
+ tree->avl_size = size;
+ tree->avl_offset = offset;
+}
+
+/*
+ * Delete a tree.
+ */
+/* ARGSUSED */
+void
+avl_destroy(avl_tree_t *tree)
+{
+ ASSERT(tree);
+ ASSERT(tree->avl_numnodes == 0);
+ ASSERT(tree->avl_root == NULL);
+}
+
+
+/*
+ * Return the number of nodes in an AVL tree.
+ */
+ulong_t
+avl_numnodes(avl_tree_t *tree)
+{
+ ASSERT(tree);
+ return (tree->avl_numnodes);
+}
+
+
+#define CHILDBIT (1L)
+
+/*
+ * Post-order tree walk used to visit all tree nodes and destroy the tree
+ * in post order. This is used for destroying a tree w/o paying any cost
+ * for rebalancing it.
+ *
+ * example:
+ *
+ * void *cookie = NULL;
+ * my_data_t *node;
+ *
+ * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
+ * free(node);
+ * avl_destroy(tree);
+ *
+ * The cookie is really an avl_node_t to the current node's parent and
+ * an indication of which child you looked at last.
+ *
+ * On input, a cookie value of CHILDBIT indicates the tree is done.
+ */
+void *
+avl_destroy_nodes(avl_tree_t *tree, void **cookie)
+{
+ avl_node_t *node;
+ avl_node_t *parent;
+ int child;
+ void *first;
+ size_t off = tree->avl_offset;
+
+ /*
+ * Initial calls go to the first node or its right descendant.
+ */
+ if (*cookie == NULL) {
+ first = avl_first(tree);
+
+ /*
+ * deal with an empty tree
+ */
+ if (first == NULL) {
+ *cookie = (void *)CHILDBIT;
+ return (NULL);
+ }
+
+ node = AVL_DATA2NODE(first, off);
+ parent = AVL_XPARENT(node);
+ goto check_right_side;
+ }
+
+ /*
+ * If there is no parent to return to we are done.
+ */
+ parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT);
+ if (parent == NULL) {
+ if (tree->avl_root != NULL) {
+ ASSERT(tree->avl_numnodes == 1);
+ tree->avl_root = NULL;
+ tree->avl_numnodes = 0;
+ }
+ return (NULL);
+ }
+
+ /*
+ * Remove the child pointer we just visited from the parent and tree.
+ */
+ child = (uintptr_t)(*cookie) & CHILDBIT;
+ parent->avl_child[child] = NULL;
+ ASSERT(tree->avl_numnodes > 1);
+ --tree->avl_numnodes;
+
+ /*
+ * If we just did a right child or there isn't one, go up to parent.
+ */
+ if (child == 1 || parent->avl_child[1] == NULL) {
+ node = parent;
+ parent = AVL_XPARENT(parent);
+ goto done;
+ }
+
+ /*
+ * Do parent's right child, then leftmost descendant.
+ */
+ node = parent->avl_child[1];
+ while (node->avl_child[0] != NULL) {
+ parent = node;
+ node = node->avl_child[0];
+ }
+
+ /*
+ * If here, we moved to a left child. It may have one
+ * child on the right (when balance == +1).
+ */
+check_right_side:
+ if (node->avl_child[1] != NULL) {
+ ASSERT(AVL_XBALANCE(node) == 1);
+ parent = node;
+ node = node->avl_child[1];
+ ASSERT(node->avl_child[0] == NULL &&
+ node->avl_child[1] == NULL);
+ } else {
+ ASSERT(AVL_XBALANCE(node) <= 0);
+ }
+
+done:
+ if (parent == NULL) {
+ *cookie = (void *)CHILDBIT;
+ ASSERT(node == tree->avl_root);
+ } else {
+ *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node));
+ }
+
+ return (AVL_NODE2DATA(node, off));
+}
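The embedding and comparator contract described in the overview comment is easiest to see from a small consumer: the client structure embeds an avl_node_t, avl_create() is told the structure size and the offset of that field, and the comparator must return exactly -1, 0 or +1 (avl_find() asserts this). A minimal sketch (not part of the committed diff; my_node_t, my_compare, my_tree_init and my_insert_unique are hypothetical names, and offsetof() comes from <sys/sysmacros.h> in the kernel or <stddef.h> in userland):

/*
 * Usage sketch only.  A consumer embeds the AVL linkage in its own
 * structure and drives the tree through avl_create()/avl_find()/
 * avl_insert().
 */
#include <sys/avl.h>

typedef struct my_node {
	uint64_t	mn_key;		/* the value being ordered */
	avl_node_t	mn_avl;		/* embedded AVL linkage */
} my_node_t;

static int
my_compare(const void *a, const void *b)
{
	const my_node_t *x = a;
	const my_node_t *y = b;

	if (x->mn_key < y->mn_key)
		return (-1);
	return (x->mn_key > y->mn_key ? 1 : 0);	/* exactly -1, 0 or +1 */
}

static void
my_tree_init(avl_tree_t *tree)
{
	avl_create(tree, my_compare, sizeof (my_node_t),
	    offsetof(my_node_t, mn_avl));
}

static void
my_insert_unique(avl_tree_t *tree, my_node_t *node)
{
	avl_index_t where;

	/* avl_find() reports the insertion point when the key is absent. */
	if (avl_find(tree, node, &where) == NULL)
		avl_insert(tree, node, where);
}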
diff --git a/sys/contrib/opensolaris/common/nvpair/nvpair.c b/sys/contrib/opensolaris/common/nvpair/nvpair.c
new file mode 100644
index 000000000000..a1e61904e68b
--- /dev/null
+++ b/sys/contrib/opensolaris/common/nvpair/nvpair.c
@@ -0,0 +1,2953 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/debug.h>
+#include <sys/nvpair.h>
+#include <sys/nvpair_impl.h>
+#include <rpc/types.h>
+#include <rpc/xdr.h>
+
+#if defined(_KERNEL) && !defined(_BOOT)
+#include <sys/varargs.h>
+#else
+#include <stdarg.h>
+#include <strings.h>
+#endif
+
+#ifndef offsetof
+#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
+#endif
+
+
+/*
+ * nvpair.c - Provides kernel & userland interfaces for manipulating
+ * name-value pairs.
+ *
+ * Overview Diagram
+ *
+ * +--------------+
+ * | nvlist_t |
+ * |--------------|
+ * | nvl_version |
+ * | nvl_nvflag |
+ * | nvl_priv -+-+
+ * | nvl_flag | |
+ * | nvl_pad | |
+ * +--------------+ |
+ * V
+ * +--------------+ last i_nvp in list
+ * | nvpriv_t | +--------------------->
+ * |--------------| |
+ * +--+- nvp_list | | +------------+
+ * | | nvp_last -+--+ + nv_alloc_t |
+ * | | nvp_curr | |------------|
+ * | | nvp_nva -+----> | nva_ops |
+ * | | nvp_stat | | nva_arg |
+ * | +--------------+ +------------+
+ * |
+ * +-------+
+ * V
+ * +---------------------+ +-------------------+
+ * | i_nvp_t | +-->| i_nvp_t | +-->
+ * |---------------------| | |-------------------| |
+ * | nvi_next -+--+ | nvi_next -+--+
+ * | nvi_prev (NULL) | <----+ nvi_prev |
+ * | . . . . . . . . . . | | . . . . . . . . . |
+ * | nvp (nvpair_t) | | nvp (nvpair_t) |
+ * | - nvp_size | | - nvp_size |
+ * | - nvp_name_sz | | - nvp_name_sz |
+ * | - nvp_value_elem | | - nvp_value_elem |
+ * | - nvp_type | | - nvp_type |
+ * | - data ... | | - data ... |
+ * +---------------------+ +-------------------+
+ *
+ *
+ *
+ * +---------------------+ +---------------------+
+ * | i_nvp_t | +--> +-->| i_nvp_t (last) |
+ * |---------------------| | | |---------------------|
+ * | nvi_next -+--+ ... --+ | nvi_next (NULL) |
+ * <-+- nvi_prev |<-- ... <----+ nvi_prev |
+ * | . . . . . . . . . | | . . . . . . . . . |
+ * | nvp (nvpair_t) | | nvp (nvpair_t) |
+ * | - nvp_size | | - nvp_size |
+ * | - nvp_name_sz | | - nvp_name_sz |
+ * | - nvp_value_elem | | - nvp_value_elem |
+ * | - DATA_TYPE_NVLIST | | - nvp_type |
+ * | - data (embedded) | | - data ... |
+ * | nvlist name | +---------------------+
+ * | +--------------+ |
+ * | | nvlist_t | |
+ * | |--------------| |
+ * | | nvl_version | |
+ * | | nvl_nvflag | |
+ * | | nvl_priv --+---+---->
+ * | | nvl_flag | |
+ * | | nvl_pad | |
+ * | +--------------+ |
+ * +---------------------+
+ *
+ *
+ * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will
+ * allow value to be aligned on 8 byte boundary
+ *
+ * name_len is the length of the name string including the null terminator
+ * so it must be >= 1
+ */
+#define NVP_SIZE_CALC(name_len, data_len) \
+ (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len))
+
+static int i_get_value_size(data_type_t type, const void *data, uint_t nelem);
+static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
+ uint_t nelem, const void *data);
+
+#define NV_STAT_EMBEDDED 0x1
+#define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp))
+#define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp))
+
+#define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz))
+#define NVPAIR2I_NVP(nvp) \
+ ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp)))
+
+
+int
+nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...)
+{
+ va_list valist;
+ int err = 0;
+
+ nva->nva_ops = nvo;
+ nva->nva_arg = NULL;
+
+ va_start(valist, nvo);
+ if (nva->nva_ops->nv_ao_init != NULL)
+ err = nva->nva_ops->nv_ao_init(nva, valist);
+ va_end(valist);
+
+ return (err);
+}
+
+void
+nv_alloc_reset(nv_alloc_t *nva)
+{
+ if (nva->nva_ops->nv_ao_reset != NULL)
+ nva->nva_ops->nv_ao_reset(nva);
+}
+
+void
+nv_alloc_fini(nv_alloc_t *nva)
+{
+ if (nva->nva_ops->nv_ao_fini != NULL)
+ nva->nva_ops->nv_ao_fini(nva);
+}
+
+nv_alloc_t *
+nvlist_lookup_nv_alloc(nvlist_t *nvl)
+{
+ nvpriv_t *priv;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (NULL);
+
+ return (priv->nvp_nva);
+}
+
+static void *
+nv_mem_zalloc(nvpriv_t *nvp, size_t size)
+{
+ nv_alloc_t *nva = nvp->nvp_nva;
+ void *buf;
+
+ if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL)
+ bzero(buf, size);
+
+ return (buf);
+}
+
+static void
+nv_mem_free(nvpriv_t *nvp, void *buf, size_t size)
+{
+ nv_alloc_t *nva = nvp->nvp_nva;
+
+ nva->nva_ops->nv_ao_free(nva, buf, size);
+}
+
+static void
+nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat)
+{
+ bzero(priv, sizeof (nvpriv_t));
+
+ priv->nvp_nva = nva;
+ priv->nvp_stat = stat;
+}
+
+static nvpriv_t *
+nv_priv_alloc(nv_alloc_t *nva)
+{
+ nvpriv_t *priv;
+
+ /*
+ * nv_mem_alloc() cannot be called here because it needs the priv
+ * argument.
+ */
+ if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL)
+ return (NULL);
+
+ nv_priv_init(priv, nva, 0);
+
+ return (priv);
+}
+
+/*
+ * Embedded lists need their own nvpriv_t's. We create a new
+ * nvpriv_t using the parameters and allocator from the parent
+ * list's nvpriv_t.
+ */
+static nvpriv_t *
+nv_priv_alloc_embedded(nvpriv_t *priv)
+{
+ nvpriv_t *emb_priv;
+
+ if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL)
+ return (NULL);
+
+ nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED);
+
+ return (emb_priv);
+}
+
+static void
+nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
+{
+ nvl->nvl_version = NV_VERSION;
+ nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE);
+ nvl->nvl_priv = (uint64_t)(uintptr_t)priv;
+ nvl->nvl_flag = 0;
+ nvl->nvl_pad = 0;
+}
+
+/*
+ * nvlist_alloc - Allocate nvlist.
+ */
+/*ARGSUSED1*/
+int
+nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag)
+{
+#if defined(_KERNEL) && !defined(_BOOT)
+ return (nvlist_xalloc(nvlp, nvflag,
+ (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
+#else
+ return (nvlist_xalloc(nvlp, nvflag, nv_alloc_nosleep));
+#endif
+}
+
+int
+nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva)
+{
+ nvpriv_t *priv;
+
+ if (nvlp == NULL || nva == NULL)
+ return (EINVAL);
+
+ if ((priv = nv_priv_alloc(nva)) == NULL)
+ return (ENOMEM);
+
+ if ((*nvlp = nv_mem_zalloc(priv,
+ NV_ALIGN(sizeof (nvlist_t)))) == NULL) {
+ nv_mem_free(priv, priv, sizeof (nvpriv_t));
+ return (ENOMEM);
+ }
+
+ nvlist_init(*nvlp, nvflag, priv);
+
+ return (0);
+}
+
+/*
+ * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair.
+ */
+static nvpair_t *
+nvp_buf_alloc(nvlist_t *nvl, size_t len)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *buf;
+ nvpair_t *nvp;
+ size_t nvsize;
+
+ /*
+ * Allocate the buffer
+ */
+ nvsize = len + offsetof(i_nvp_t, nvi_nvp);
+
+ if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL)
+ return (NULL);
+
+ nvp = &buf->nvi_nvp;
+ nvp->nvp_size = len;
+
+ return (nvp);
+}
+
+/*
+ * nvp_buf_free - deallocate an i_nvp_t.
+ */
+static void
+nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp);
+
+ nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize);
+}
+
+/*
+ * nvp_buf_link - link a new nv pair into the nvlist.
+ */
+static void
+nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr = NVPAIR2I_NVP(nvp);
+
+ /* Put element at end of nvlist */
+ if (priv->nvp_list == NULL) {
+ priv->nvp_list = priv->nvp_last = curr;
+ } else {
+ curr->nvi_prev = priv->nvp_last;
+ priv->nvp_last->nvi_next = curr;
+ priv->nvp_last = curr;
+ }
+}
+
+/*
+ * nvp_buf_unlink - unlink a removed nvpair from the nvlist.
+ */
+static void
+nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr = NVPAIR2I_NVP(nvp);
+
+ /*
+ * protect nvlist_next_nvpair() against walking on freed memory.
+ */
+ if (priv->nvp_curr == curr)
+ priv->nvp_curr = curr->nvi_next;
+
+ if (curr == priv->nvp_list)
+ priv->nvp_list = curr->nvi_next;
+ else
+ curr->nvi_prev->nvi_next = curr->nvi_next;
+
+ if (curr == priv->nvp_last)
+ priv->nvp_last = curr->nvi_prev;
+ else
+ curr->nvi_next->nvi_prev = curr->nvi_prev;
+}
+
+/*
+ * take an nvpair type and number of elements and make sure they are valid
+ */
+static int
+i_validate_type_nelem(data_type_t type, uint_t nelem)
+{
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ if (nelem != 0)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_STRING:
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_NVLIST:
+ if (nelem != 1)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_NVLIST_ARRAY:
+ /* we allow arrays with 0 elements */
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * Verify nvp_name_sz and check the name string length.
+ */
+static int
+i_validate_nvpair_name(nvpair_t *nvp)
+{
+ if ((nvp->nvp_name_sz <= 0) ||
+ (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0)))
+ return (EFAULT);
+
+ /* verify the name string, make sure it's terminated */
+ if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0')
+ return (EFAULT);
+
+ return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 0 : EFAULT);
+}
+
+static int
+i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data)
+{
+ switch (type) {
+ case DATA_TYPE_BOOLEAN_VALUE:
+ if (*(boolean_t *)data != B_TRUE &&
+ *(boolean_t *)data != B_FALSE)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY: {
+ int i;
+
+ for (i = 0; i < nelem; i++)
+ if (((boolean_t *)data)[i] != B_TRUE &&
+ ((boolean_t *)data)[i] != B_FALSE)
+ return (EINVAL);
+ break;
+ }
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * This function takes a pointer to what should be an nvpair and its size,
+ * then verifies that all the nvpair fields make sense and can be
+ * trusted. This function is used when decoding packed nvpairs.
+ */
+static int
+i_validate_nvpair(nvpair_t *nvp)
+{
+ data_type_t type = NVP_TYPE(nvp);
+ int size1, size2;
+
+ /* verify nvp_name_sz, check the name string length */
+ if (i_validate_nvpair_name(nvp) != 0)
+ return (EFAULT);
+
+ if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0)
+ return (EFAULT);
+
+ /*
+ * verify nvp_type, nvp_value_elem, and also possibly
+ * verify string values and get the value size.
+ */
+ size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp));
+ size1 = nvp->nvp_size - NVP_VALOFF(nvp);
+ if (size2 < 0 || size1 != NV_ALIGN(size2))
+ return (EFAULT);
+
+ return (0);
+}
+
+static int
+nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL)
+ return (EINVAL);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ nvpair_t *nvp = &curr->nvi_nvp;
+ int err;
+
+ if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp),
+ NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0)
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * Frees all memory allocated for an nvpair (like embedded lists) with
+ * the exception of the nvpair buffer itself.
+ */
+static void
+nvpair_free(nvpair_t *nvp)
+{
+ switch (NVP_TYPE(nvp)) {
+ case DATA_TYPE_NVLIST:
+ nvlist_free(EMBEDDED_NVL(nvp));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+ int i;
+
+ for (i = 0; i < NVP_NELEM(nvp); i++)
+ if (nvlp[i] != NULL)
+ nvlist_free(nvlp[i]);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+/*
+ * nvlist_free - free an unpacked nvlist
+ */
+void
+nvlist_free(nvlist_t *nvl)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return;
+
+ /*
+ * Unpacked nvlists are linked through i_nvp_t
+ */
+ curr = priv->nvp_list;
+ while (curr != NULL) {
+ nvpair_t *nvp = &curr->nvi_nvp;
+ curr = curr->nvi_next;
+
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ }
+
+ if (!(priv->nvp_stat & NV_STAT_EMBEDDED))
+ nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t)));
+ else
+ nvl->nvl_priv = 0;
+
+ nv_mem_free(priv, priv, sizeof (nvpriv_t));
+}
+
+static int
+nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr;
+
+ if (nvp == NULL)
+ return (0);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
+ if (&curr->nvi_nvp == nvp)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Make a copy of an nvlist
+ */
+/*ARGSUSED1*/
+int
+nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag)
+{
+#if defined(_KERNEL) && !defined(_BOOT)
+ return (nvlist_xdup(nvl, nvlp,
+ (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
+#else
+ return (nvlist_xdup(nvl, nvlp, nv_alloc_nosleep));
+#endif
+}
+
+int
+nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva)
+{
+ int err;
+ nvlist_t *ret;
+
+ if (nvl == NULL || nvlp == NULL)
+ return (EINVAL);
+
+ if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0)
+ return (err);
+
+ if ((err = nvlist_copy_pairs(nvl, ret)) != 0)
+ nvlist_free(ret);
+ else
+ *nvlp = ret;
+
+ return (err);
+}
+
+/*
+ * Remove all nvpairs with a matching name
+ */
+int
+nvlist_remove_all(nvlist_t *nvl, const char *name)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+ int error = ENOENT;
+
+ if (nvl == NULL || name == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (EINVAL);
+
+ curr = priv->nvp_list;
+ while (curr != NULL) {
+ nvpair_t *nvp = &curr->nvi_nvp;
+
+ curr = curr->nvi_next;
+ if (strcmp(name, NVP_NAME(nvp)) != 0)
+ continue;
+
+ nvp_buf_unlink(nvl, nvp);
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+
+ error = 0;
+ }
+
+ return (error);
+}
+
+/*
+ * Remove the first nvpair with a matching name and type
+ */
+int
+nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL || name == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (EINVAL);
+
+ curr = priv->nvp_list;
+ while (curr != NULL) {
+ nvpair_t *nvp = &curr->nvi_nvp;
+
+ if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) {
+ nvp_buf_unlink(nvl, nvp);
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+
+ return (0);
+ }
+ curr = curr->nvi_next;
+ }
+
+ return (ENOENT);
+}
+
+/*
+ * This function calculates the size of an nvpair value.
+ *
+ * The data argument controls the behavior for the data types
+ * DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY:
+ * if data == NULL, the size of the string(s) is excluded.
+ */
+static int
+i_get_value_size(data_type_t type, const void *data, uint_t nelem)
+{
+ uint64_t value_sz;
+
+ if (i_validate_type_nelem(type, nelem) != 0)
+ return (-1);
+
+ /* Calculate required size for holding value */
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ value_sz = 0;
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ value_sz = sizeof (boolean_t);
+ break;
+ case DATA_TYPE_BYTE:
+ value_sz = sizeof (uchar_t);
+ break;
+ case DATA_TYPE_INT8:
+ value_sz = sizeof (int8_t);
+ break;
+ case DATA_TYPE_UINT8:
+ value_sz = sizeof (uint8_t);
+ break;
+ case DATA_TYPE_INT16:
+ value_sz = sizeof (int16_t);
+ break;
+ case DATA_TYPE_UINT16:
+ value_sz = sizeof (uint16_t);
+ break;
+ case DATA_TYPE_INT32:
+ value_sz = sizeof (int32_t);
+ break;
+ case DATA_TYPE_UINT32:
+ value_sz = sizeof (uint32_t);
+ break;
+ case DATA_TYPE_INT64:
+ value_sz = sizeof (int64_t);
+ break;
+ case DATA_TYPE_UINT64:
+ value_sz = sizeof (uint64_t);
+ break;
+ case DATA_TYPE_STRING:
+ if (data == NULL)
+ value_sz = 0;
+ else
+ value_sz = strlen(data) + 1;
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (boolean_t);
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uchar_t);
+ break;
+ case DATA_TYPE_INT8_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int8_t);
+ break;
+ case DATA_TYPE_UINT8_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint8_t);
+ break;
+ case DATA_TYPE_INT16_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int16_t);
+ break;
+ case DATA_TYPE_UINT16_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint16_t);
+ break;
+ case DATA_TYPE_INT32_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int32_t);
+ break;
+ case DATA_TYPE_UINT32_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint32_t);
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int64_t);
+ break;
+ case DATA_TYPE_UINT64_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint64_t);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint64_t);
+
+ if (data != NULL) {
+ char *const *strs = data;
+ uint_t i;
+
+ /* no alignment requirement for strings */
+ for (i = 0; i < nelem; i++) {
+ if (strs[i] == NULL)
+ return (-1);
+ value_sz += strlen(strs[i]) + 1;
+ }
+ }
+ break;
+ case DATA_TYPE_HRTIME:
+ value_sz = sizeof (hrtime_t);
+ break;
+ case DATA_TYPE_NVLIST:
+ value_sz = NV_ALIGN(sizeof (nvlist_t));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint64_t) +
+ (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t));
+ break;
+ default:
+ return (-1);
+ }
+
+ return (value_sz > INT32_MAX ? -1 : (int)value_sz);
+}
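+
+/*
+ * Worked example (illustrative): for DATA_TYPE_UINT64 the value size is
+ * sizeof (uint64_t) == 8.  For a DATA_TYPE_STRING value "pool" the size
+ * is strlen("pool") + 1 == 5 when data != NULL; with data == NULL the
+ * string size is excluded and 0 is returned.
+ */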
+
+static int
+nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl)
+{
+ nvpriv_t *priv;
+ int err;
+
+ if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t)
+ nvl->nvl_priv)) == NULL)
+ return (ENOMEM);
+
+ nvlist_init(emb_nvl, onvl->nvl_nvflag, priv);
+
+ if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) {
+ nvlist_free(emb_nvl);
+ emb_nvl->nvl_priv = 0;
+ }
+
+ return (err);
+}
+
+/*
+ * nvlist_add_common - Add new <name,value> pair to nvlist
+ */
+static int
+nvlist_add_common(nvlist_t *nvl, const char *name,
+ data_type_t type, uint_t nelem, const void *data)
+{
+ nvpair_t *nvp;
+ uint_t i;
+
+ int nvp_sz, name_sz, value_sz;
+ int err = 0;
+
+ if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ if (nelem != 0 && data == NULL)
+ return (EINVAL);
+
+ /*
+ * Verify type and nelem and get the value size.
+ * For data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
+ * the size of the string(s) is included.
+ */
+ if ((value_sz = i_get_value_size(type, data, nelem)) < 0)
+ return (EINVAL);
+
+ if (i_validate_nvpair_value(type, nelem, data) != 0)
+ return (EINVAL);
+
+ /*
+ * If we're adding an nvlist or nvlist array, ensure that we are not
+ * adding the input nvlist to itself, which would cause recursion,
+ * and ensure that no NULL nvlist pointers are present.
+ */
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ if (data == nvl || data == NULL)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **onvlp = (nvlist_t **)data;
+ for (i = 0; i < nelem; i++) {
+ if (onvlp[i] == nvl || onvlp[i] == NULL)
+ return (EINVAL);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ /* calculate sizes of the nvpair elements and the nvpair itself */
+ name_sz = strlen(name) + 1;
+
+ nvp_sz = NVP_SIZE_CALC(name_sz, value_sz);
+
+ if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL)
+ return (ENOMEM);
+
+ ASSERT(nvp->nvp_size == nvp_sz);
+ nvp->nvp_name_sz = name_sz;
+ nvp->nvp_value_elem = nelem;
+ nvp->nvp_type = type;
+ bcopy(name, NVP_NAME(nvp), name_sz);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ break;
+ case DATA_TYPE_STRING_ARRAY: {
+ char *const *strs = data;
+ char *buf = NVP_VALUE(nvp);
+ char **cstrs = (void *)buf;
+
+ /* skip pre-allocated space for pointer array */
+ buf += nelem * sizeof (uint64_t);
+ for (i = 0; i < nelem; i++) {
+ int slen = strlen(strs[i]) + 1;
+ bcopy(strs[i], buf, slen);
+ cstrs[i] = buf;
+ buf += slen;
+ }
+ break;
+ }
+ case DATA_TYPE_NVLIST: {
+ nvlist_t *nnvl = EMBEDDED_NVL(nvp);
+ nvlist_t *onvl = (nvlist_t *)data;
+
+ if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) {
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+ break;
+ }
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **onvlp = (nvlist_t **)data;
+ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+ nvlist_t *embedded = (nvlist_t *)
+ ((uintptr_t)nvlp + nelem * sizeof (uint64_t));
+
+ for (i = 0; i < nelem; i++) {
+ if ((err = nvlist_copy_embedded(nvl,
+ onvlp[i], embedded)) != 0) {
+ /*
+ * Free any successfully created lists
+ */
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+
+ nvlp[i] = embedded++;
+ }
+ break;
+ }
+ default:
+ bcopy(data, NVP_VALUE(nvp), value_sz);
+ }
+
+ /* if unique name, remove before add */
+ if (nvl->nvl_nvflag & NV_UNIQUE_NAME)
+ (void) nvlist_remove_all(nvl, name);
+ else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE)
+ (void) nvlist_remove(nvl, name, type);
+
+ nvp_buf_link(nvl, nvp);
+
+ return (0);
+}
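+
+/*
+ * Example usage (illustrative sketch; the names and values are arbitrary):
+ *
+ *     (void) nvlist_add_string(nvl, "name", "tank");
+ *     (void) nvlist_add_uint64(nvl, "guid", 42);
+ *
+ * With NV_UNIQUE_NAME set on nvl, adding "name" a second time first
+ * removes the existing pair, so the new value replaces the old one.
+ */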
+
+int
+nvlist_add_boolean(nvlist_t *nvl, const char *name)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL));
+}
+
+int
+nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val));
+}
+
+int
+nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val));
+}
+
+int
+nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val));
+}
+
+int
+nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val));
+}
+
+int
+nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val));
+}
+
+int
+nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val));
+}
+
+int
+nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val));
+}
+
+int
+nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val));
+}
+
+int
+nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val));
+}
+
+int
+nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val));
+}
+
+int
+nvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val));
+}
+
+int
+nvlist_add_boolean_array(nvlist_t *nvl, const char *name,
+ boolean_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a));
+}
+
+int
+nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
+}
+
+int
+nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
+}
+
+int
+nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
+}
+
+int
+nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
+}
+
+int
+nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
+}
+
+int
+nvlist_add_string_array(nvlist_t *nvl, const char *name,
+ char *const *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
+}
+
+int
+nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val));
+}
+
+int
+nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val));
+}
+
+int
+nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
+}
+
+/* reading name-value pairs */
+nvpair_t *
+nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (NULL);
+
+ curr = NVPAIR2I_NVP(nvp);
+
+ /*
+ * Ensure that nvp is a valid pointer.
+ */
+ if (nvp == NULL)
+ curr = priv->nvp_list;
+ else if (priv->nvp_curr == curr)
+ curr = curr->nvi_next;
+ else if (nvlist_contains_nvp(nvl, nvp) == 0)
+ curr = NULL;
+
+ priv->nvp_curr = curr;
+
+ return (curr != NULL ? &curr->nvi_nvp : NULL);
+}
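+
+/*
+ * Typical iteration idiom (illustrative sketch):
+ *
+ *     nvpair_t *nvp;
+ *
+ *     for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL;
+ *         nvp = nvlist_next_nvpair(nvl, nvp))
+ *             ... inspect nvpair_name(nvp) and nvpair_type(nvp) ...
+ */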
+
+char *
+nvpair_name(nvpair_t *nvp)
+{
+ return (NVP_NAME(nvp));
+}
+
+data_type_t
+nvpair_type(nvpair_t *nvp)
+{
+ return (NVP_TYPE(nvp));
+}
+
+static int
+nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data)
+{
+ if (nvp == NULL || nvpair_type(nvp) != type)
+ return (EINVAL);
+
+ /*
+ * For non-array types we copy the data; for array, string, and
+ * nvlist types we set a pointer to the value.
+ */
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ if (nelem != NULL)
+ *nelem = 0;
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+ if (data == NULL)
+ return (EINVAL);
+ bcopy(NVP_VALUE(nvp), data,
+ (size_t)i_get_value_size(type, NULL, 1));
+ if (nelem != NULL)
+ *nelem = 1;
+ break;
+
+ case DATA_TYPE_NVLIST:
+ case DATA_TYPE_STRING:
+ if (data == NULL)
+ return (EINVAL);
+ *(void **)data = (void *)NVP_VALUE(nvp);
+ if (nelem != NULL)
+ *nelem = 1;
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_NVLIST_ARRAY:
+ if (nelem == NULL || data == NULL)
+ return (EINVAL);
+ if ((*nelem = NVP_NELEM(nvp)) != 0)
+ *(void **)data = (void *)NVP_VALUE(nvp);
+ else
+ *(void **)data = NULL;
+ break;
+
+ default:
+ return (ENOTSUP);
+ }
+
+ return (0);
+}
+
+static int
+nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type,
+ uint_t *nelem, void *data)
+{
+ nvpriv_t *priv;
+ nvpair_t *nvp;
+ i_nvp_t *curr;
+
+ if (name == NULL || nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (EINVAL);
+
+ if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE)))
+ return (ENOTSUP);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ nvp = &curr->nvi_nvp;
+
+ if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type)
+ return (nvpair_value_common(nvp, type, nelem, data));
+ }
+
+ return (ENOENT);
+}
+
+int
+nvlist_lookup_boolean(nvlist_t *nvl, const char *name)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL));
+}
+
+int
+nvlist_lookup_boolean_value(nvlist_t *nvl, const char *name, boolean_t *val)
+{
+ return (nvlist_lookup_common(nvl, name,
+ DATA_TYPE_BOOLEAN_VALUE, NULL, val));
+}
+
+int
+nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val));
+}
+
+int
+nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val));
+}
+
+int
+nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val));
+}
+
+int
+nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val));
+}
+
+int
+nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val));
+}
+
+int
+nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val));
+}
+
+int
+nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val));
+}
+
+int
+nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val));
+}
+
+int
+nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val));
+}
+
+int
+nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val));
+}
+
+int
+nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val));
+}
+
+int
+nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name,
+ boolean_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name,
+ DATA_TYPE_BOOLEAN_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_byte_array(nvlist_t *nvl, const char *name,
+ uchar_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name,
+ uint8_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int16_array(nvlist_t *nvl, const char *name,
+ int16_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name,
+ uint16_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int32_array(nvlist_t *nvl, const char *name,
+ int32_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name,
+ uint32_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int64_array(nvlist_t *nvl, const char *name,
+ int64_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name,
+ uint64_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_string_array(nvlist_t *nvl, const char *name,
+ char ***a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name,
+ nvlist_t ***a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val));
+}
+
+int
+nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
+{
+ va_list ap;
+ char *name;
+ int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0);
+ int ret = 0;
+
+ va_start(ap, flag);
+ while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
+ data_type_t type;
+ void *val;
+ uint_t *nelem;
+
+ switch (type = va_arg(ap, data_type_t)) {
+ case DATA_TYPE_BOOLEAN:
+ ret = nvlist_lookup_common(nvl, name, type, NULL, NULL);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_STRING:
+ case DATA_TYPE_NVLIST:
+ val = va_arg(ap, void *);
+ ret = nvlist_lookup_common(nvl, name, type, NULL, val);
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_NVLIST_ARRAY:
+ val = va_arg(ap, void *);
+ nelem = va_arg(ap, uint_t *);
+ ret = nvlist_lookup_common(nvl, name, type, nelem, val);
+ break;
+
+ default:
+ ret = EINVAL;
+ }
+
+ if (ret == ENOENT && noentok)
+ ret = 0;
+ }
+ va_end(ap);
+
+ return (ret);
+}
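+
+/*
+ * Example usage (illustrative sketch; the pair names are arbitrary):
+ *
+ *     uint64_t guid;
+ *     char *name;
+ *
+ *     if (nvlist_lookup_pairs(nvl, NV_FLAG_NOENTOK,
+ *         "guid", DATA_TYPE_UINT64, &guid,
+ *         "name", DATA_TYPE_STRING, &name,
+ *         NULL) == 0)
+ *             ...
+ *
+ * With NV_FLAG_NOENTOK a missing name is not treated as an error.
+ */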
+
+int
+nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val));
+}
+
+int
+nvpair_value_byte(nvpair_t *nvp, uchar_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val));
+}
+
+int
+nvpair_value_int8(nvpair_t *nvp, int8_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val));
+}
+
+int
+nvpair_value_uint8(nvpair_t *nvp, uint8_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val));
+}
+
+int
+nvpair_value_int16(nvpair_t *nvp, int16_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val));
+}
+
+int
+nvpair_value_uint16(nvpair_t *nvp, uint16_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val));
+}
+
+int
+nvpair_value_int32(nvpair_t *nvp, int32_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val));
+}
+
+int
+nvpair_value_uint32(nvpair_t *nvp, uint32_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val));
+}
+
+int
+nvpair_value_int64(nvpair_t *nvp, int64_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val));
+}
+
+int
+nvpair_value_uint64(nvpair_t *nvp, uint64_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val));
+}
+
+int
+nvpair_value_string(nvpair_t *nvp, char **val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val));
+}
+
+int
+nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val));
+}
+
+int
+nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val));
+}
+
+/*
+ * Add specified pair to the list.
+ */
+int
+nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ if (nvl == NULL || nvp == NULL)
+ return (EINVAL);
+
+ return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp),
+ NVP_NELEM(nvp), NVP_VALUE(nvp)));
+}
+
+/*
+ * Merge the supplied nvlists and put the result in dst.
+ * The merged list will contain all names specified in both lists;
+ * the values are taken from nvl in the case of duplicates.
+ * Return 0 on success.
+ */
+/*ARGSUSED*/
+int
+nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag)
+{
+ if (nvl == NULL || dst == NULL)
+ return (EINVAL);
+
+ if (dst != nvl)
+ return (nvlist_copy_pairs(nvl, dst));
+
+ return (0);
+}
+
+/*
+ * Encoding related routines
+ */
+#define NVS_OP_ENCODE 0
+#define NVS_OP_DECODE 1
+#define NVS_OP_GETSIZE 2
+
+typedef struct nvs_ops nvs_ops_t;
+
+typedef struct {
+ int nvs_op;
+ const nvs_ops_t *nvs_ops;
+ void *nvs_private;
+ nvpriv_t *nvs_priv;
+} nvstream_t;
+
+/*
+ * nvs operations are:
+ * - nvs_nvlist
+ * encoding / decoding of an nvlist header (nvlist_t)
+ * calculates the size used for header and end detection
+ *
+ * - nvs_nvpair
+ * responsible for the first part of encoding / decoding of an nvpair
+ * calculates the decoded size of an nvpair
+ *
+ * - nvs_nvp_op
+ * second part of encoding / decoding of an nvpair
+ *
+ * - nvs_nvp_size
+ * calculates the encoding size of an nvpair
+ *
+ * - nvs_nvl_fini
+ * encodes the end detection mark (zeros).
+ */
+struct nvs_ops {
+ int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *);
+ int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *);
+ int (*nvs_nvp_op)(nvstream_t *, nvpair_t *);
+ int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *);
+ int (*nvs_nvl_fini)(nvstream_t *);
+};
+
+typedef struct {
+ char nvh_encoding; /* nvs encoding method */
+ char nvh_endian; /* nvs endian */
+ char nvh_reserved1; /* reserved for future use */
+ char nvh_reserved2; /* reserved for future use */
+} nvs_header_t;
+
+static int
+nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr;
+
+ /*
+ * Walk the nvpairs in the list and encode each one
+ */
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
+ if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0)
+ return (EFAULT);
+
+ return (nvs->nvs_ops->nvs_nvl_fini(nvs));
+}
+
+static int
+nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl)
+{
+ nvpair_t *nvp;
+ size_t nvsize;
+ int err;
+
+ /*
+ * Get decoded size of next pair in stream, alloc
+ * memory for nvpair_t, then decode the nvpair
+ */
+ while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) {
+ if (nvsize == 0) /* end of list */
+ break;
+
+ /* make sure len makes sense */
+ if (nvsize < NVP_SIZE_CALC(1, 0))
+ return (EFAULT);
+
+ if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL)
+ return (ENOMEM);
+
+ if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) {
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+
+ if (i_validate_nvpair(nvp) != 0) {
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (EFAULT);
+ }
+
+ nvp_buf_link(nvl, nvp);
+ }
+ return (err);
+}
+
+static int
+nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr;
+ uint64_t nvsize = *buflen;
+ size_t size;
+
+ /*
+ * Get encoded size of nvpairs in nvlist
+ */
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0)
+ return (EINVAL);
+
+ if ((nvsize += size) > INT32_MAX)
+ return (EINVAL);
+ }
+
+ *buflen = nvsize;
+ return (0);
+}
+
+static int
+nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
+{
+ int err;
+
+ if (nvl->nvl_priv == 0)
+ return (EFAULT);
+
+ /*
+ * Perform the operation, starting with header, then each nvpair
+ */
+ if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0)
+ return (err);
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ err = nvs_encode_pairs(nvs, nvl);
+ break;
+
+ case NVS_OP_DECODE:
+ err = nvs_decode_pairs(nvs, nvl);
+ break;
+
+ case NVS_OP_GETSIZE:
+ err = nvs_getsize_pairs(nvs, nvl, buflen);
+ break;
+
+ default:
+ err = EINVAL;
+ }
+
+ return (err);
+}
+
+static int
+nvs_embedded(nvstream_t *nvs, nvlist_t *embedded)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ return (nvs_operation(nvs, embedded, NULL));
+
+ case NVS_OP_DECODE: {
+ nvpriv_t *priv;
+ int err;
+
+ if (embedded->nvl_version != NV_VERSION)
+ return (ENOTSUP);
+
+ if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL)
+ return (ENOMEM);
+
+ nvlist_init(embedded, embedded->nvl_nvflag, priv);
+
+ if ((err = nvs_operation(nvs, embedded, NULL)) != 0)
+ nvlist_free(embedded);
+ return (err);
+ }
+ default:
+ break;
+ }
+
+ return (EINVAL);
+}
+
+static int
+nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ size_t nelem = NVP_NELEM(nvp);
+ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+ int i;
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ for (i = 0; i < nelem; i++)
+ if (nvs_embedded(nvs, nvlp[i]) != 0)
+ return (EFAULT);
+ break;
+
+ case NVS_OP_DECODE: {
+ size_t len = nelem * sizeof (uint64_t);
+ nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len);
+
+ bzero(nvlp, len); /* don't trust packed data */
+ for (i = 0; i < nelem; i++) {
+ if (nvs_embedded(nvs, embedded) != 0) {
+ nvpair_free(nvp);
+ return (EFAULT);
+ }
+
+ nvlp[i] = embedded++;
+ }
+ break;
+ }
+ case NVS_OP_GETSIZE: {
+ uint64_t nvsize = 0;
+
+ for (i = 0; i < nelem; i++) {
+ size_t nvp_sz = 0;
+
+ if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0)
+ return (EINVAL);
+
+ if ((nvsize += nvp_sz) > INT32_MAX)
+ return (EINVAL);
+ }
+
+ *size = nvsize;
+ break;
+ }
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *);
+static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *);
+
+/*
+ * Common routine for nvlist operations:
+ * encode, decode, getsize (encoded size).
+ */
+static int
+nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding,
+ int nvs_op)
+{
+ int err = 0;
+ nvstream_t nvs;
+ int nvl_endian;
+#ifdef _LITTLE_ENDIAN
+ int host_endian = 1;
+#else
+ int host_endian = 0;
+#endif /* _LITTLE_ENDIAN */
+ nvs_header_t *nvh = (void *)buf;
+
+ if (buflen == NULL || nvl == NULL ||
+ (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (EINVAL);
+
+ nvs.nvs_op = nvs_op;
+
+ /*
+ * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and
+ * a buffer are provided. The first 4 bytes in the buffer are
+ * used for the encoding method and host endianness.
+ */
+ switch (nvs_op) {
+ case NVS_OP_ENCODE:
+ if (buf == NULL || *buflen < sizeof (nvs_header_t))
+ return (EINVAL);
+
+ nvh->nvh_encoding = encoding;
+ nvh->nvh_endian = nvl_endian = host_endian;
+ nvh->nvh_reserved1 = 0;
+ nvh->nvh_reserved2 = 0;
+ break;
+
+ case NVS_OP_DECODE:
+ if (buf == NULL || *buflen < sizeof (nvs_header_t))
+ return (EINVAL);
+
+ /* get method of encoding from first byte */
+ encoding = nvh->nvh_encoding;
+ nvl_endian = nvh->nvh_endian;
+ break;
+
+ case NVS_OP_GETSIZE:
+ nvl_endian = host_endian;
+
+ /*
+ * add the size for encoding
+ */
+ *buflen = sizeof (nvs_header_t);
+ break;
+
+ default:
+ return (ENOTSUP);
+ }
+
+ /*
+ * Create an nvstream with proper encoding method
+ */
+ switch (encoding) {
+ case NV_ENCODE_NATIVE:
+ /*
+ * check endianness, in case we are unpacking
+ * from a file
+ */
+ if (nvl_endian != host_endian)
+ return (ENOTSUP);
+ err = nvs_native(&nvs, nvl, buf, buflen);
+ break;
+ case NV_ENCODE_XDR:
+ err = nvs_xdr(&nvs, nvl, buf, buflen);
+ break;
+ default:
+ err = ENOTSUP;
+ break;
+ }
+
+ return (err);
+}
+
+int
+nvlist_size(nvlist_t *nvl, size_t *size, int encoding)
+{
+ return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE));
+}
+
+/*
+ * Pack nvlist into contiguous memory
+ */
+/*ARGSUSED1*/
+int
+nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
+ int kmflag)
+{
+#if defined(_KERNEL) && !defined(_BOOT)
+ return (nvlist_xpack(nvl, bufp, buflen, encoding,
+ (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
+#else
+ return (nvlist_xpack(nvl, bufp, buflen, encoding, nv_alloc_nosleep));
+#endif
+}
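+
+/*
+ * Pack/unpack round trip (illustrative sketch): with *bufp == NULL,
+ * nvlist_pack() allocates the buffer and returns its size in *buflen.
+ *
+ *     char *buf = NULL;
+ *     size_t buflen = 0;
+ *     nvlist_t *copy;
+ *
+ *     if (nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_NATIVE, KM_SLEEP) == 0)
+ *             (void) nvlist_unpack(buf, buflen, &copy, KM_SLEEP);
+ */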
+
+int
+nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
+ nv_alloc_t *nva)
+{
+ nvpriv_t nvpriv;
+ size_t alloc_size;
+ char *buf;
+ int err;
+
+ if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL)
+ return (EINVAL);
+
+ if (*bufp != NULL)
+ return (nvlist_common(nvl, *bufp, buflen, encoding,
+ NVS_OP_ENCODE));
+
+ /*
+ * Here is a difficult situation:
+ * 1. The nvlist has fixed allocator properties.
+ * All other nvlist routines (like nvlist_add_*, ...) use
+ * these properties.
+ * 2. When using nvlist_pack() the user can specify their own
+ *    allocator properties (e.g. by using KM_NOSLEEP).
+ *
+ * We use the user-specified properties (2). A cleaner solution
+ * would be to remove the kmflag from nvlist_pack(), but we will
+ * not change the interface.
+ */
+ nv_priv_init(&nvpriv, nva, 0);
+
+ if ((err = nvlist_size(nvl, &alloc_size, encoding)) != 0)
+ return (err);
+
+ if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL)
+ return (ENOMEM);
+
+ if ((err = nvlist_common(nvl, buf, &alloc_size, encoding,
+ NVS_OP_ENCODE)) != 0) {
+ nv_mem_free(&nvpriv, buf, alloc_size);
+ } else {
+ *buflen = alloc_size;
+ *bufp = buf;
+ }
+
+ return (err);
+}
+
+/*
+ * Unpack buf into an nvlist_t
+ */
+/*ARGSUSED1*/
+int
+nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag)
+{
+#if defined(_KERNEL) && !defined(_BOOT)
+ return (nvlist_xunpack(buf, buflen, nvlp,
+ (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
+#else
+ return (nvlist_xunpack(buf, buflen, nvlp, nv_alloc_nosleep));
+#endif
+}
+
+int
+nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva)
+{
+ nvlist_t *nvl;
+ int err;
+
+ if (nvlp == NULL)
+ return (EINVAL);
+
+ if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0)
+ return (err);
+
+ if ((err = nvlist_common(nvl, buf, &buflen, 0, NVS_OP_DECODE)) != 0)
+ nvlist_free(nvl);
+ else
+ *nvlp = nvl;
+
+ return (err);
+}
+
+/*
+ * Native encoding functions
+ */
+typedef struct {
+ /*
+ * This structure is used when decoding a packed nvpair in
+ * the native format. n_base points to a buffer containing the
+ * packed nvpair. n_end is a pointer to the end of the buffer.
+ * (n_end actually points to the first byte past the end of the
+ * buffer.) n_curr is a pointer that lies between n_base and n_end.
+ * It points to the current data that we are decoding.
+ * The amount of data left in the buffer is equal to n_end - n_curr.
+ * n_flag is used to recognize a packed embedded list.
+ */
+ caddr_t n_base;
+ caddr_t n_end;
+ caddr_t n_curr;
+ uint_t n_flag;
+} nvs_native_t;
+
+static int
+nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf,
+ size_t buflen)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ nvs->nvs_private = native;
+ native->n_curr = native->n_base = buf;
+ native->n_end = buf + buflen;
+ native->n_flag = 0;
+ return (0);
+
+ case NVS_OP_GETSIZE:
+ nvs->nvs_private = native;
+ native->n_curr = native->n_base = native->n_end = NULL;
+ native->n_flag = 0;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+/*ARGSUSED*/
+static void
+nvs_native_destroy(nvstream_t *nvs)
+{
+}
+
+static int
+native_cp(nvstream_t *nvs, void *buf, size_t size)
+{
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+
+ if (native->n_curr + size > native->n_end)
+ return (EFAULT);
+
+ /*
+ * The bcopy() below eliminates the alignment requirement
+ * on the buffer (stream) and is preferred over direct access.
+ */
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ bcopy(buf, native->n_curr, size);
+ break;
+ case NVS_OP_DECODE:
+ bcopy(native->n_curr, buf, size);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ native->n_curr += size;
+ return (0);
+}
+
+/*
+ * operate on nvlist_t header
+ */
+static int
+nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
+{
+ nvs_native_t *native = nvs->nvs_private;
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ if (native->n_flag)
+ return (0); /* packed embedded list */
+
+ native->n_flag = 1;
+
+ /* copy version and nvflag of the nvlist_t */
+ if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 ||
+ native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0)
+ return (EFAULT);
+
+ return (0);
+
+ case NVS_OP_GETSIZE:
+ /*
+ * If calculating the size of a packed embedded list:
+ *  4 bytes for the end of the embedded list.
+ * Otherwise:
+ *  2 * sizeof (int32_t) for nvl_version and nvl_nvflag,
+ *  plus 4 bytes for the end of the entire list.
+ */
+ if (native->n_flag) {
+ *size += 4;
+ } else {
+ native->n_flag = 1;
+ *size += 2 * sizeof (int32_t) + 4;
+ }
+
+ return (0);
+
+ default:
+ return (EINVAL);
+ }
+}
+
+static int
+nvs_native_nvl_fini(nvstream_t *nvs)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ /*
+ * Add 4 zero bytes at end of nvlist. They are used
+ * for end detection by the decode routine.
+ */
+ if (native->n_curr + sizeof (int) > native->n_end)
+ return (EFAULT);
+
+ bzero(native->n_curr, sizeof (int));
+ native->n_curr += sizeof (int);
+ }
+
+ return (0);
+}
+
+static int
+nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ nvlist_t *packed = (void *)
+ (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
+ /*
+ * Null out the pointer that is meaningless in the packed
+ * structure. The address may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero(&packed->nvl_priv, sizeof (packed->nvl_priv));
+ }
+
+ return (nvs_embedded(nvs, EMBEDDED_NVL(nvp)));
+}
+
+static int
+nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp);
+ size_t len = NVP_NELEM(nvp) * sizeof (uint64_t);
+ nvlist_t *packed = (nvlist_t *)((uintptr_t)value + len);
+ int i;
+ /*
+ * Null out pointers that are meaningless in the packed
+ * structure. The addresses may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero(value, len);
+
+ for (i = 0; i < NVP_NELEM(nvp); i++, packed++)
+ /*
+ * Null out the pointer that is meaningless in the
+ * packed structure. The address may not be aligned,
+ * so we have to use bzero.
+ */
+ bzero(&packed->nvl_priv, sizeof (packed->nvl_priv));
+ }
+
+ return (nvs_embedded_nvl_array(nvs, nvp, NULL));
+}
+
+static void
+nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE: {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ uint64_t *strp = (void *)
+ (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
+ /*
+ * Null out pointers that are meaningless in the packed
+ * structure. The addresses may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t));
+ break;
+ }
+ case NVS_OP_DECODE: {
+ char **strp = (void *)NVP_VALUE(nvp);
+ char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t));
+ int i;
+
+ for (i = 0; i < NVP_NELEM(nvp); i++) {
+ strp[i] = buf;
+ buf += strlen(buf) + 1;
+ }
+ break;
+ }
+ }
+}
+
+static int
+nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
+{
+ data_type_t type;
+ int value_sz;
+ int ret = 0;
+
+ /*
+ * We do the initial bcopy of the data before we look at
+ * the nvpair type, because when we're decoding, we won't
+ * have the correct values for the pair until we do the bcopy.
+ */
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ if (native_cp(nvs, nvp, nvp->nvp_size) != 0)
+ return (EFAULT);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /* verify nvp_name_sz, check the name string length */
+ if (i_validate_nvpair_name(nvp) != 0)
+ return (EFAULT);
+
+ type = NVP_TYPE(nvp);
+
+ /*
+ * Verify type and nelem and get the value size.
+ * For data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
+ * the size of the string(s) is excluded.
+ */
+ if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0)
+ return (EFAULT);
+
+ if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size)
+ return (EFAULT);
+
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ ret = nvpair_native_embedded(nvs, nvp);
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ ret = nvpair_native_embedded_array(nvs, nvp);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ nvpair_native_string_array(nvs, nvp);
+ break;
+ default:
+ break;
+ }
+
+ return (ret);
+}
+
+static int
+nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ uint64_t nvp_sz = nvp->nvp_size;
+
+ switch (NVP_TYPE(nvp)) {
+ case DATA_TYPE_NVLIST: {
+ size_t nvsize = 0;
+
+ if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+ case DATA_TYPE_NVLIST_ARRAY: {
+ size_t nvsize;
+
+ if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+ default:
+ break;
+ }
+
+ if (nvp_sz > INT32_MAX)
+ return (EINVAL);
+
+ *size = nvp_sz;
+
+ return (0);
+}
+
+static int
+nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ return (nvs_native_nvp_op(nvs, nvp));
+
+ case NVS_OP_DECODE: {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ int32_t decode_len;
+
+ /* try to read the size value from the stream */
+ if (native->n_curr + sizeof (int32_t) > native->n_end)
+ return (EFAULT);
+ bcopy(native->n_curr, &decode_len, sizeof (int32_t));
+
+ /* sanity check the size value */
+ if (decode_len < 0 ||
+ decode_len > native->n_end - native->n_curr)
+ return (EFAULT);
+
+ *size = decode_len;
+
+ /*
+ * If at the end of the stream then move the cursor
+ * forward, otherwise nvs_native_nvp_op() will read
+ * the entire nvpair at the same cursor position.
+ */
+ if (*size == 0)
+ native->n_curr += sizeof (int32_t);
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static const nvs_ops_t nvs_native_ops = {
+ nvs_native_nvlist,
+ nvs_native_nvpair,
+ nvs_native_nvp_op,
+ nvs_native_nvp_size,
+ nvs_native_nvl_fini
+};
+
+static int
+nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
+{
+ nvs_native_t native;
+ int err;
+
+ nvs->nvs_ops = &nvs_native_ops;
+
+ if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t),
+ *buflen - sizeof (nvs_header_t))) != 0)
+ return (err);
+
+ err = nvs_operation(nvs, nvl, buflen);
+
+ nvs_native_destroy(nvs);
+
+ return (err);
+}
+
+/*
+ * XDR encoding functions
+ *
+ * An xdr packed nvlist is encoded as:
+ *
+ * - encoding method and host endianness (4 bytes)
+ * - nvl_version (4 bytes)
+ * - nvl_nvflag (4 bytes)
+ *
+ * - encoded nvpairs, the format of one xdr encoded nvpair is:
+ * - encoded size of the nvpair (4 bytes)
+ * - decoded size of the nvpair (4 bytes)
+ * - name string (4 + NV_ALIGN4(string length) bytes;
+ * a string is coded as its size (4 bytes) followed by its data)
+ * - data type (4 bytes)
+ * - number of elements in the nvpair (4 bytes)
+ * - data
+ *
+ * - 2 zeros marking the end of the entire list (8 bytes)
+ */
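+/*
+ * Worked example (illustrative): a DATA_TYPE_UINT32 pair named "guid"
+ * xdr-encodes as 4 (encoded size) + 4 (decoded size) + 4 + NV_ALIGN4(4)
+ * (name string) + 4 (data type) + 4 (nelem) + 4 (value) = 28 bytes.
+ */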
+static int
+nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen)
+{
+ /* xdr data must be 4 byte aligned */
+ if ((ulong_t)buf % 4 != 0)
+ return (EFAULT);
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE);
+ nvs->nvs_private = xdr;
+ return (0);
+ case NVS_OP_DECODE:
+ xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE);
+ nvs->nvs_private = xdr;
+ return (0);
+ case NVS_OP_GETSIZE:
+ nvs->nvs_private = NULL;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+static void
+nvs_xdr_destroy(nvstream_t *nvs)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ xdr_destroy((XDR *)nvs->nvs_private);
+ break;
+ default:
+ break;
+ }
+}
+
+static int
+nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE: {
+ XDR *xdr = nvs->nvs_private;
+
+ if (!xdr_int(xdr, &nvl->nvl_version) ||
+ !xdr_u_int(xdr, &nvl->nvl_nvflag))
+ return (EFAULT);
+ break;
+ }
+ case NVS_OP_GETSIZE: {
+ /*
+ * 2 * 4 for nvl_version + nvl_nvflag
+ * and 8 for end of the entire list
+ */
+ *size += 2 * 4 + 8;
+ break;
+ }
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+nvs_xdr_nvl_fini(nvstream_t *nvs)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ XDR *xdr = nvs->nvs_private;
+ int zero = 0;
+
+ if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero))
+ return (EFAULT);
+ }
+
+ return (0);
+}
+
+/*
+ * The format of an xdr-encoded nvpair is:
+ * encode_size, decode_size, name string, data type, nelem, data
+ */
+static int
+nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
+{
+ data_type_t type;
+ char *buf;
+ char *buf_end = (char *)nvp + nvp->nvp_size;
+ int value_sz;
+ uint_t nelem, buflen;
+ bool_t ret = FALSE;
+ XDR *xdr = nvs->nvs_private;
+
+ ASSERT(xdr != NULL && nvp != NULL);
+
+ /* name string */
+ if ((buf = NVP_NAME(nvp)) >= buf_end)
+ return (EFAULT);
+ buflen = buf_end - buf;
+
+ if (!xdr_string(xdr, &buf, buflen - 1))
+ return (EFAULT);
+ nvp->nvp_name_sz = strlen(buf) + 1;
+
+ /* type and nelem */
+ if (!xdr_int(xdr, (int *)&nvp->nvp_type) ||
+ !xdr_int(xdr, &nvp->nvp_value_elem))
+ return (EFAULT);
+
+ type = NVP_TYPE(nvp);
+ nelem = nvp->nvp_value_elem;
+
+ /*
+ * Verify type and nelem and get the value size.
+ * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
+ * is the size of the string(s) excluded.
+ */
+ if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0)
+ return (EFAULT);
+
+ /* if there is no data to extract then return */
+ if (nelem == 0)
+ return (0);
+
+ /* value */
+ if ((buf = NVP_VALUE(nvp)) >= buf_end)
+ return (EFAULT);
+ buflen = buf_end - buf;
+
+ if (buflen < value_sz)
+ return (EFAULT);
+
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ if (nvs_embedded(nvs, (void *)buf) == 0)
+ return (0);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY:
+ if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0)
+ return (0);
+ break;
+
+ case DATA_TYPE_BOOLEAN:
+ ret = TRUE;
+ break;
+
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ ret = xdr_char(xdr, buf);
+ break;
+
+ case DATA_TYPE_INT16:
+ ret = xdr_short(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT16:
+ ret = xdr_u_short(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_INT32:
+ ret = xdr_int(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT32:
+ ret = xdr_u_int(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_INT64:
+ ret = xdr_longlong_t(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT64:
+ ret = xdr_u_longlong_t(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_HRTIME:
+ /*
+ * NOTE: must expose the definition of hrtime_t here
+ */
+ ret = xdr_longlong_t(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_STRING:
+ ret = xdr_string(xdr, &buf, buflen - 1);
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ ret = xdr_opaque(xdr, buf, nelem);
+ break;
+
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t),
+ (xdrproc_t)xdr_char);
+ break;
+
+ case DATA_TYPE_INT16_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t),
+ sizeof (int16_t), (xdrproc_t)xdr_short);
+ break;
+
+ case DATA_TYPE_UINT16_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t),
+ sizeof (uint16_t), (xdrproc_t)xdr_u_short);
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t),
+ sizeof (int32_t), (xdrproc_t)xdr_int);
+ break;
+
+ case DATA_TYPE_UINT32_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t),
+ sizeof (uint32_t), (xdrproc_t)xdr_u_int);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t),
+ sizeof (int64_t), (xdrproc_t)xdr_longlong_t);
+ break;
+
+ case DATA_TYPE_UINT64_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t),
+ sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY: {
+ size_t len = nelem * sizeof (uint64_t);
+ char **strp = (void *)buf;
+ int i;
+
+ if (nvs->nvs_op == NVS_OP_DECODE)
+ bzero(buf, len); /* don't trust packed data */
+
+ for (i = 0; i < nelem; i++) {
+ if (buflen <= len)
+ return (EFAULT);
+
+ buf += len;
+ buflen -= len;
+
+ if (xdr_string(xdr, &buf, buflen - 1) != TRUE)
+ return (EFAULT);
+
+ if (nvs->nvs_op == NVS_OP_DECODE)
+ strp[i] = buf;
+ len = strlen(buf) + 1;
+ }
+ ret = TRUE;
+ break;
+ }
+ default:
+ break;
+ }
+
+ return (ret == TRUE ? 0 : EFAULT);
+}
+
+static int
+nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ data_type_t type = NVP_TYPE(nvp);
+ /*
+ * encode_size + decode_size + name string size + data type + nelem
+ * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp)))
+ */
+ uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4;
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ nvp_sz += 4; /* 4 is the minimum xdr unit */
+ break;
+
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+ nvp_sz += 8;
+ break;
+
+ case DATA_TYPE_STRING:
+ nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp)));
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ nvp_sz += NV_ALIGN4(NVP_NELEM(nvp));
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY: {
+ int i;
+ char **strs = (void *)NVP_VALUE(nvp);
+
+ for (i = 0; i < NVP_NELEM(nvp); i++)
+ nvp_sz += 4 + NV_ALIGN4(strlen(strs[i]));
+
+ break;
+ }
+
+ case DATA_TYPE_NVLIST:
+ case DATA_TYPE_NVLIST_ARRAY: {
+ size_t nvsize = 0;
+ int old_nvs_op = nvs->nvs_op;
+ int err;
+
+ nvs->nvs_op = NVS_OP_GETSIZE;
+ if (type == DATA_TYPE_NVLIST)
+ err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize);
+ else
+ err = nvs_embedded_nvl_array(nvs, nvp, &nvsize);
+ nvs->nvs_op = old_nvs_op;
+
+ if (err != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+
+ if (nvp_sz > INT32_MAX)
+ return (EINVAL);
+
+ *size = nvp_sz;
+
+ return (0);
+}
+
+
+/*
+ * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates
+ * the largest nvpair that could be encoded in the buffer.
+ *
+ * See comments above nvs_xdr_nvp_op() for the format of xdr encoding.
+ * The size of an xdr-packed nvpair without any data is 5 words.
+ *
+ * Using the size of the data directly as an estimate would be ok
+ * in all cases except one. If the data type is of DATA_TYPE_STRING_ARRAY
+ * then the actual nvpair has space for an array of pointers to index
+ * the strings. These pointers are not encoded into the packed xdr buffer.
+ *
+ * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are
+ * of length 0, then each string is encoded in xdr format as a single word.
+ * Therefore when expanded to an nvpair there will be 2.25 words used for
+ * each string. (An int64_t allocated for pointer usage, and a single char
+ * for the null termination.)
+ *
+ * This is the calculation performed by the NVS_XDR_MAX_LEN macro.
+ */
+#define NVS_XDR_HDR_LEN ((size_t)(5 * 4))
+#define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ? \
+ 0 : ((size_t)(y) - NVS_XDR_HDR_LEN))
+#define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \
+ (NVS_XDR_DATA_LEN(x) * 2) + \
+ NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4)))
+
+static int
+nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ XDR *xdr = nvs->nvs_private;
+ int32_t encode_len, decode_len;
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE: {
+ size_t nvsize;
+
+ if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0)
+ return (EFAULT);
+
+ decode_len = nvp->nvp_size;
+ encode_len = nvsize;
+ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
+ return (EFAULT);
+
+ return (nvs_xdr_nvp_op(nvs, nvp));
+ }
+ case NVS_OP_DECODE: {
+ struct xdr_bytesrec bytesrec;
+
+ /* get the encode and decode size */
+ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
+ return (EFAULT);
+ *size = decode_len;
+
+ /* are we at the end of the stream? */
+ if (*size == 0)
+ return (0);
+
+ /* sanity check the size parameter */
+ if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec))
+ return (EFAULT);
+
+ if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail))
+ return (EFAULT);
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static const struct nvs_ops nvs_xdr_ops = {
+ nvs_xdr_nvlist,
+ nvs_xdr_nvpair,
+ nvs_xdr_nvp_op,
+ nvs_xdr_nvp_size,
+ nvs_xdr_nvl_fini
+};
+
+static int
+nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
+{
+ XDR xdr;
+ int err;
+
+ nvs->nvs_ops = &nvs_xdr_ops;
+
+ if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t),
+ *buflen - sizeof (nvs_header_t))) != 0)
+ return (err);
+
+ err = nvs_operation(nvs, nvl, buflen);
+
+ nvs_xdr_destroy(nvs);
+
+ return (err);
+}
diff --git a/sys/contrib/opensolaris/common/nvpair/nvpair_alloc_fixed.c b/sys/contrib/opensolaris/common/nvpair/nvpair_alloc_fixed.c
new file mode 100644
index 000000000000..620171e4ca4e
--- /dev/null
+++ b/sys/contrib/opensolaris/common/nvpair/nvpair_alloc_fixed.c
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/nvpair.h>
+#include <sys/sysmacros.h>
+#if defined(_KERNEL) && !defined(_BOOT)
+#include <sys/varargs.h>
+#else
+#include <stdarg.h>
+#include <strings.h>
+#endif
+
+/*
+ * This allocator is very simple.
+ * - it uses a pre-allocated buffer for memory allocations.
+ * - it does _not_ free memory in the pre-allocated buffer.
+ *
+ * The reason for the selected implementation is simplicity.
+ * This allocator is designed for use in interrupt context, when
+ * the caller may not wait for free memory.
+ */
+
+/* pre-allocated buffer for memory allocations */
+typedef struct nvbuf {
+ uintptr_t nvb_buf; /* address of pre-allocated buffer */
+ uintptr_t nvb_lim; /* limit address in the buffer */
+ uintptr_t nvb_cur; /* current address in the buffer */
+} nvbuf_t;
+
+/*
+ * Initialize the pre-allocated buffer allocator. The caller needs to supply
+ *
+ * buf address of pre-allocated buffer
+ * bufsz size of pre-allocated buffer
+ *
+ * nv_fixed_init() calculates the remaining members of nvbuf_t.
+ */
+static int
+nv_fixed_init(nv_alloc_t *nva, va_list valist)
+{
+ uintptr_t base = va_arg(valist, uintptr_t);
+ uintptr_t lim = base + va_arg(valist, size_t);
+ nvbuf_t *nvb = (nvbuf_t *)P2ROUNDUP(base, sizeof (uintptr_t));
+
+ if (base == 0 || (uintptr_t)&nvb[1] > lim)
+ return (EINVAL);
+
+ nvb->nvb_buf = (uintptr_t)&nvb[0];
+ nvb->nvb_cur = (uintptr_t)&nvb[1];
+ nvb->nvb_lim = lim;
+ nva->nva_arg = nvb;
+
+ return (0);
+}
+
+static void *
+nv_fixed_alloc(nv_alloc_t *nva, size_t size)
+{
+ nvbuf_t *nvb = nva->nva_arg;
+ uintptr_t new = nvb->nvb_cur;
+
+ if (size == 0 || new + size > nvb->nvb_lim)
+ return (NULL);
+
+ nvb->nvb_cur = P2ROUNDUP(new + size, sizeof (uintptr_t));
+
+ return ((void *)new);
+}
+
+/*ARGSUSED*/
+static void
+nv_fixed_free(nv_alloc_t *nva, void *buf, size_t size)
+{
+ /* don't free memory in the pre-allocated buffer */
+}
+
+static void
+nv_fixed_reset(nv_alloc_t *nva)
+{
+ nvbuf_t *nvb = nva->nva_arg;
+
+ nvb->nvb_cur = (uintptr_t)&nvb[1];
+}
+
+const nv_alloc_ops_t nv_fixed_ops_def = {
+ nv_fixed_init, /* nv_ao_init() */
+ NULL, /* nv_ao_fini() */
+ nv_fixed_alloc, /* nv_ao_alloc() */
+ nv_fixed_free, /* nv_ao_free() */
+ nv_fixed_reset /* nv_ao_reset() */
+};
+
+const nv_alloc_ops_t *nv_fixed_ops = &nv_fixed_ops_def;
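+
+/*
+ * Usage sketch (illustrative only; it assumes the nv_alloc_init(),
+ * nvlist_xalloc() and nv_alloc_reset() interfaces declared in
+ * <sys/nvpair.h>).  A caller that must build nvlists without sleeping
+ * can carve them out of a static buffer:
+ *
+ *	static char nvbuf[1024];
+ *	nv_alloc_t nva;
+ *	nvlist_t *nvl;
+ *
+ *	if (nv_alloc_init(&nva, nv_fixed_ops, nvbuf, sizeof (nvbuf)) == 0 &&
+ *	    nvlist_xalloc(&nvl, NV_UNIQUE_NAME, &nva) == 0)
+ *		(void) nvlist_add_uint64(nvl, "count", 1);
+ *
+ * Individual frees are no-ops (see nv_fixed_free() above); the only way to
+ * reclaim space is to call nv_alloc_reset(&nva), which rewinds the whole
+ * buffer at once.
+ */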
diff --git a/sys/contrib/opensolaris/common/zfs/zfs_namecheck.c b/sys/contrib/opensolaris/common/zfs/zfs_namecheck.c
new file mode 100644
index 000000000000..2004d860d329
--- /dev/null
+++ b/sys/contrib/opensolaris/common/zfs/zfs_namecheck.c
@@ -0,0 +1,287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Common name validation routines for ZFS. These routines are shared by the
+ * userland code as well as the ioctl() layer to ensure that we don't
+ * inadvertently expose a hole through direct ioctl()s that never gets tested.
+ * In userland, however, we want significantly more information about _why_ the
+ * name is invalid. In the kernel, we only care whether it's valid or not.
+ * Each routine therefore takes a 'namecheck_err_t' which describes exactly why
+ * the name failed to validate.
+ *
+ * Each function returns 0 on success, -1 on error.
+ */
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include <sys/param.h>
+#include "zfs_namecheck.h"
+
+static int
+valid_char(char c)
+{
+ return ((c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= '0' && c <= '9') ||
+ c == '-' || c == '_' || c == '.' || c == ':');
+}
+
+/*
+ * Snapshot names must be made up of alphanumeric characters plus the following
+ * characters:
+ *
+ * [-_.:]
+ */
+int
+snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ const char *loc;
+
+ if (strlen(path) >= MAXNAMELEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ if (path[0] == '\0') {
+ if (why)
+ *why = NAME_ERR_EMPTY_COMPONENT;
+ return (-1);
+ }
+
+ for (loc = path; *loc; loc++) {
+ if (!valid_char(*loc)) {
+ if (why) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = *loc;
+ }
+ return (-1);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Dataset names must be of the following form:
+ *
+ * [component][/]*[component][@component]
+ *
+ * Where each component is made up of alphanumeric characters plus the following
+ * characters:
+ *
+ * [-_.:]
+ */
+int
+dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ const char *loc, *end;
+ int found_snapshot;
+
+ /*
+ * Make sure the name is not too long.
+ *
+ * ZFS_MAXNAMELEN is the maximum dataset length used in the userland
+ * which is the same as MAXNAMELEN used in the kernel.
+ * If ZFS_MAXNAMELEN value is changed, make sure to cleanup all
+ * places using MAXNAMELEN.
+ */
+ if (strlen(path) >= MAXNAMELEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ /* Explicitly check for a leading slash. */
+ if (path[0] == '/') {
+ if (why)
+ *why = NAME_ERR_LEADING_SLASH;
+ return (-1);
+ }
+
+ if (path[0] == '\0') {
+ if (why)
+ *why = NAME_ERR_EMPTY_COMPONENT;
+ return (-1);
+ }
+
+ loc = path;
+ found_snapshot = 0;
+ for (;;) {
+ /* Find the end of this component */
+ end = loc;
+ while (*end != '/' && *end != '@' && *end != '\0')
+ end++;
+
+ if (*end == '\0' && end[-1] == '/') {
+ /* trailing slashes are not allowed */
+ if (why)
+ *why = NAME_ERR_TRAILING_SLASH;
+ return (-1);
+ }
+
+ /* Zero-length components are not allowed */
+ if (loc == end) {
+ if (why) {
+ /*
+ * Make sure this is really a zero-length
+ * component and not a '@@'.
+ */
+ if (*end == '@' && found_snapshot) {
+ *why = NAME_ERR_MULTIPLE_AT;
+ } else {
+ *why = NAME_ERR_EMPTY_COMPONENT;
+ }
+ }
+
+ return (-1);
+ }
+
+ /* Validate the contents of this component */
+ while (loc != end) {
+ if (!valid_char(*loc)) {
+ if (why) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = *loc;
+ }
+ return (-1);
+ }
+ loc++;
+ }
+
+ /* If we've reached the end of the string, we're OK */
+ if (*end == '\0')
+ return (0);
+
+ if (*end == '@') {
+ /*
+ * If we've found an @ symbol, indicate that we're in
+ * the snapshot component, and report a second '@'
+ * character as an error.
+ */
+ if (found_snapshot) {
+ if (why)
+ *why = NAME_ERR_MULTIPLE_AT;
+ return (-1);
+ }
+
+ found_snapshot = 1;
+ }
+
+ /*
+ * If there is a '/' in a snapshot name
+ * then report an error
+ */
+ if (*end == '/' && found_snapshot) {
+ if (why)
+ *why = NAME_ERR_TRAILING_SLASH;
+ return (-1);
+ }
+
+ /* Update to the next component */
+ loc = end + 1;
+ }
+}
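+
+/*
+ * Example (sketch) of how a caller can report why a name was rejected:
+ *
+ *	namecheck_err_t why;
+ *	char what;
+ *
+ *	if (dataset_namecheck("tank/home@today", &why, &what) != 0 &&
+ *	    why == NAME_ERR_INVALCHAR)
+ *		(void) printf("invalid character '%c' in name\n", what);
+ *
+ * "tank/home@today" itself is valid; "tank//home" fails with
+ * NAME_ERR_EMPTY_COMPONENT and "tank/home@a@b" with NAME_ERR_MULTIPLE_AT.
+ */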
+
+/*
+ * For pool names, we have the same set of valid characters as described in
+ * dataset names, with the additional restriction that the pool name must begin
+ * with a letter. The pool names 'raidz' and 'mirror' are also reserved names
+ * that cannot be used.
+ */
+int
+pool_namecheck(const char *pool, namecheck_err_t *why, char *what)
+{
+ const char *c;
+
+ /*
+ * Make sure the name is not too long.
+ *
+ * ZPOOL_MAXNAMELEN is the maximum pool length used in the userland
+ * which is the same as MAXNAMELEN used in the kernel.
+ * If ZPOOL_MAXNAMELEN value is changed, make sure to cleanup all
+ * places using MAXNAMELEN.
+ */
+ if (strlen(pool) >= MAXNAMELEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ c = pool;
+ while (*c != '\0') {
+ if (!valid_char(*c)) {
+ if (why) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = *c;
+ }
+ return (-1);
+ }
+ c++;
+ }
+
+ if (!(*pool >= 'a' && *pool <= 'z') &&
+ !(*pool >= 'A' && *pool <= 'Z')) {
+ if (why)
+ *why = NAME_ERR_NOLETTER;
+ return (-1);
+ }
+
+ if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) {
+ if (why)
+ *why = NAME_ERR_RESERVED;
+ return (-1);
+ }
+
+ if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) {
+ if (why)
+ *why = NAME_ERR_DISKLIKE;
+ return (-1);
+ }
+
+ return (0);
+}
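+
+/*
+ * For example, "tank" is a valid pool name, while "2tank" fails with
+ * NAME_ERR_NOLETTER, "mirror" with NAME_ERR_RESERVED, and "c0t0d0" with
+ * NAME_ERR_DISKLIKE (it looks like a disk device name).
+ */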
+
+/*
+ * Check whether the dataset name is private, i.e. reserved for internal
+ * usage.  '$' is reserved for internal dataset names, e.g. "$MOS".
+ *
+ * Return 1 if the given name is used internally.
+ * Return 0 if it is not.
+ */
+int
+dataset_name_hidden(const char *name)
+{
+ if (strchr(name, '$') != NULL)
+ return (1);
+
+ return (0);
+}
diff --git a/sys/contrib/opensolaris/common/zfs/zfs_namecheck.h b/sys/contrib/opensolaris/common/zfs/zfs_namecheck.h
new file mode 100644
index 000000000000..7e0cda974cc6
--- /dev/null
+++ b/sys/contrib/opensolaris/common/zfs/zfs_namecheck.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_NAMECHECK_H
+#define _ZFS_NAMECHECK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ NAME_ERR_LEADING_SLASH, /* name begins with leading slash */
+ NAME_ERR_EMPTY_COMPONENT, /* name contains an empty component */
+ NAME_ERR_TRAILING_SLASH, /* name ends with a slash */
+ NAME_ERR_INVALCHAR, /* invalid character found */
+ NAME_ERR_MULTIPLE_AT, /* multiple '@' characters found */
+ NAME_ERR_NOLETTER, /* pool doesn't begin with a letter */
+ NAME_ERR_RESERVED, /* entire name is reserved */
+ NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */
+ NAME_ERR_TOOLONG, /* name is too long */
+} namecheck_err_t;
+
+int pool_namecheck(const char *, namecheck_err_t *, char *);
+int dataset_namecheck(const char *, namecheck_err_t *, char *);
+int dataset_name_hidden(const char *);
+int snapshot_namecheck(const char *, namecheck_err_t *, char *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_NAMECHECK_H */
diff --git a/sys/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/contrib/opensolaris/common/zfs/zfs_prop.c
new file mode 100644
index 000000000000..71256192c092
--- /dev/null
+++ b/sys/contrib/opensolaris/common/zfs/zfs_prop.c
@@ -0,0 +1,657 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Master property table.
+ *
+ * This table keeps track of all the properties supported by ZFS, and their
+ * various attributes. Not all of these are needed by the kernel, and several
+ * are only used by a single libzfs client. But having them here centralizes
+ * all property information in one location.
+ *
+ * name The human-readable string representing this property
+ * proptype Basic type (string, boolean, number)
+ * default Default value for the property. Sadly, C only allows
+ * you to initialize the first member of a union, so we
+ * have two default members for each property.
+ * attr Attributes (readonly, inheritable) for the property
+ * types Valid dataset types to which this applies
+ * values String describing acceptable values for the property
+ * colname The column header for 'zfs list'
+ * colfmt The column formatting for 'zfs list'
+ *
+ * This table must match the order of property types in libzfs.h.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+
+#include "zfs_prop.h"
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+typedef enum {
+ prop_default,
+ prop_readonly,
+ prop_inherit
+} prop_attr_t;
+
+typedef struct {
+ const char *pd_name;
+ zfs_proptype_t pd_proptype;
+ uint64_t pd_numdefault;
+ const char *pd_strdefault;
+ prop_attr_t pd_attr;
+ int pd_types;
+ const char *pd_values;
+ const char *pd_colname;
+ boolean_t pd_rightalign;
+ boolean_t pd_visible;
+} prop_desc_t;
+
+static prop_desc_t zfs_prop_table[] = {
+ { "type", prop_type_string, 0, NULL, prop_readonly,
+ ZFS_TYPE_ANY, "filesystem | volume | snapshot", "TYPE", B_TRUE,
+ B_TRUE },
+ { "creation", prop_type_number, 0, NULL, prop_readonly,
+ ZFS_TYPE_ANY, "<date>", "CREATION", B_FALSE, B_TRUE },
+ { "used", prop_type_number, 0, NULL, prop_readonly,
+ ZFS_TYPE_ANY, "<size>", "USED", B_TRUE, B_TRUE },
+ { "available", prop_type_number, 0, NULL, prop_readonly,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL", B_TRUE,
+ B_TRUE },
+ { "referenced", prop_type_number, 0, NULL, prop_readonly,
+ ZFS_TYPE_ANY,
+ "<size>", "REFER", B_TRUE, B_TRUE },
+ { "compressratio", prop_type_number, 0, NULL, prop_readonly,
+ ZFS_TYPE_ANY, "<1.00x or higher if compressed>", "RATIO", B_TRUE,
+ B_TRUE },
+ { "mounted", prop_type_boolean, 0, NULL, prop_readonly,
+ ZFS_TYPE_FILESYSTEM, "yes | no | -", "MOUNTED", B_TRUE, B_TRUE },
+ { "origin", prop_type_string, 0, NULL, prop_readonly,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN",
+ B_FALSE, B_TRUE },
+ { "quota", prop_type_number, 0, NULL, prop_default,
+ ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA", B_TRUE, B_TRUE },
+ { "reservation", prop_type_number, 0, NULL, prop_default,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<size> | none", "RESERV", B_TRUE, B_TRUE },
+ { "volsize", prop_type_number, 0, NULL, prop_default,
+ ZFS_TYPE_VOLUME, "<size>", "VOLSIZE", B_TRUE, B_TRUE },
+ { "volblocksize", prop_type_number, 8192, NULL, prop_readonly,
+ ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK", B_TRUE,
+ B_TRUE },
+ { "recordsize", prop_type_number, SPA_MAXBLOCKSIZE, NULL,
+ prop_inherit,
+ ZFS_TYPE_FILESYSTEM,
+ "512 to 128k, power of 2", "RECSIZE", B_TRUE, B_TRUE },
+ { "mountpoint", prop_type_string, 0, "/", prop_inherit,
+ ZFS_TYPE_FILESYSTEM,
+ "<path> | legacy | none", "MOUNTPOINT", B_FALSE, B_TRUE },
+ { "sharenfs", prop_type_string, 0, "off", prop_inherit,
+ ZFS_TYPE_FILESYSTEM,
+ "on | off | exports(5) options", "SHARENFS", B_FALSE, B_TRUE },
+ { "checksum", prop_type_index, ZIO_CHECKSUM_DEFAULT, "on",
+ prop_inherit, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", B_TRUE,
+ B_TRUE },
+ { "compression", prop_type_index, ZIO_COMPRESS_DEFAULT, "off",
+ prop_inherit, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", B_TRUE, B_TRUE },
+ { "atime", prop_type_boolean, 1, NULL, prop_inherit,
+ ZFS_TYPE_FILESYSTEM,
+ "on | off", "ATIME", B_TRUE, B_TRUE },
+ { "devices", prop_type_boolean, 1, NULL, prop_inherit,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "on | off", "DEVICES", B_TRUE, B_TRUE },
+ { "exec", prop_type_boolean, 1, NULL, prop_inherit,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "on | off", "EXEC", B_TRUE, B_TRUE },
+ { "setuid", prop_type_boolean, 1, NULL, prop_inherit,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
+ B_TRUE, B_TRUE },
+ { "readonly", prop_type_boolean, 0, NULL, prop_inherit,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off", "RDONLY", B_TRUE, B_TRUE },
+ { "jailed", prop_type_boolean, 0, NULL, prop_inherit,
+ ZFS_TYPE_FILESYSTEM,
+ "on | off", "JAILED", B_TRUE, B_TRUE },
+ { "snapdir", prop_type_index, ZFS_SNAPDIR_HIDDEN, "hidden",
+ prop_inherit,
+ ZFS_TYPE_FILESYSTEM,
+ "hidden | visible", "SNAPDIR", B_TRUE, B_TRUE },
+ { "aclmode", prop_type_index, ZFS_ACL_GROUPMASK, "groupmask",
+ prop_inherit, ZFS_TYPE_FILESYSTEM,
+ "discard | groupmask | passthrough", "ACLMODE", B_TRUE, B_TRUE },
+ { "aclinherit", prop_type_index, ZFS_ACL_SECURE, "secure",
+ prop_inherit, ZFS_TYPE_FILESYSTEM,
+ "discard | noallow | secure | passthrough", "ACLINHERIT", B_TRUE,
+ B_TRUE },
+ { "createtxg", prop_type_number, 0, NULL, prop_readonly,
+ ZFS_TYPE_ANY, NULL, NULL, B_FALSE, B_FALSE },
+ { "name", prop_type_string, 0, NULL, prop_readonly,
+ ZFS_TYPE_ANY, NULL, "NAME", B_FALSE, B_FALSE },
+ { "canmount", prop_type_boolean, 1, NULL, prop_default,
+ ZFS_TYPE_FILESYSTEM,
+ "on | off", "CANMOUNT", B_TRUE, B_TRUE },
+ { "shareiscsi", prop_type_string, 0, "off", prop_inherit,
+ ZFS_TYPE_ANY,
+ "on | off | type=<type>", "SHAREISCSI", B_FALSE, B_TRUE },
+ { "iscsioptions", prop_type_string, 0, NULL, prop_inherit,
+ ZFS_TYPE_VOLUME, NULL, "ISCSIOPTIONS", B_FALSE, B_FALSE },
+ { "xattr", prop_type_boolean, 1, NULL, prop_inherit,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "on | off", "XATTR", B_TRUE, B_TRUE },
+ { "numclones", prop_type_number, 0, NULL, prop_readonly,
+ ZFS_TYPE_SNAPSHOT, NULL, NULL, B_FALSE, B_FALSE },
+ { "copies", prop_type_index, 1, "1", prop_inherit,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "1 | 2 | 3", "COPIES", B_TRUE, B_TRUE },
+ { "bootfs", prop_type_string, 0, NULL, prop_default,
+ ZFS_TYPE_POOL, "<filesystem>", "BOOTFS", B_FALSE, B_TRUE },
+};
+
+#define ZFS_PROP_COUNT ((sizeof (zfs_prop_table))/(sizeof (prop_desc_t)))
+
+/*
+ * Returns TRUE if the property applies to the given dataset types.
+ */
+int
+zfs_prop_valid_for_type(zfs_prop_t prop, int types)
+{
+ return ((zfs_prop_table[prop].pd_types & types) != 0);
+}
+
+/*
+ * Determine if the specified property is visible or not.
+ */
+boolean_t
+zfs_prop_is_visible(zfs_prop_t prop)
+{
+ if (prop < 0)
+ return (B_FALSE);
+
+ return (zfs_prop_table[prop].pd_visible);
+}
+
+/*
+ * Iterate over all properties, calling back into the specified function
+ * for each property. We will continue to iterate until we either
+ * reach the end or the callback function returns something other than
+ * ZFS_PROP_CONT.
+ */
+zfs_prop_t
+zfs_prop_iter_common(zfs_prop_f func, void *cb, zfs_type_t type,
+ boolean_t show_all)
+{
+ int i;
+
+ for (i = 0; i < ZFS_PROP_COUNT; i++) {
+ if (zfs_prop_valid_for_type(i, type) &&
+ (zfs_prop_is_visible(i) || show_all)) {
+ if (func(i, cb) != ZFS_PROP_CONT)
+ return (i);
+ }
+ }
+ return (ZFS_PROP_CONT);
+}
+
+zfs_prop_t
+zfs_prop_iter(zfs_prop_f func, void *cb, boolean_t show_all)
+{
+ return (zfs_prop_iter_common(func, cb, ZFS_TYPE_ANY, show_all));
+}
+
+zpool_prop_t
+zpool_prop_iter(zpool_prop_f func, void *cb, boolean_t show_all)
+{
+ return (zfs_prop_iter_common(func, cb, ZFS_TYPE_POOL, show_all));
+}
+
+zfs_proptype_t
+zfs_prop_get_type(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_proptype);
+}
+
+static boolean_t
+propname_match(const char *p, zfs_prop_t prop, size_t len)
+{
+ const char *propname = zfs_prop_table[prop].pd_name;
+#ifndef _KERNEL
+ const char *colname = zfs_prop_table[prop].pd_colname;
+ int c;
+#endif
+
+#ifndef _KERNEL
+ if (colname == NULL)
+ return (B_FALSE);
+#endif
+
+ if (len == strlen(propname) &&
+ strncmp(p, propname, len) == 0)
+ return (B_TRUE);
+
+#ifndef _KERNEL
+ if (len != strlen(colname))
+ return (B_FALSE);
+
+ for (c = 0; c < len; c++)
+ if (p[c] != tolower(colname[c]))
+ break;
+
+ return (colname[c] == '\0');
+#else
+ return (B_FALSE);
+#endif
+}
+
+zfs_prop_t
+zfs_name_to_prop_cb(zfs_prop_t prop, void *cb_data)
+{
+ const char *propname = cb_data;
+
+ if (propname_match(propname, prop, strlen(propname)))
+ return (prop);
+
+ return (ZFS_PROP_CONT);
+}
+
+/*
+ * Given a property name and its type, returns the corresponding property ID.
+ */
+zfs_prop_t
+zfs_name_to_prop_common(const char *propname, zfs_type_t type)
+{
+ zfs_prop_t prop;
+
+ prop = zfs_prop_iter_common(zfs_name_to_prop_cb, (void *)propname,
+ type, B_TRUE);
+ return (prop == ZFS_PROP_CONT ? ZFS_PROP_INVAL : prop);
+}
+
+/*
+ * Given a zfs dataset property name, returns the corresponding property ID.
+ */
+zfs_prop_t
+zfs_name_to_prop(const char *propname)
+{
+ return (zfs_name_to_prop_common(propname, ZFS_TYPE_ANY));
+}
+
+/*
+ * Given a pool property name, returns the corresponding property ID.
+ */
+zpool_prop_t
+zpool_name_to_prop(const char *propname)
+{
+ return (zfs_name_to_prop_common(propname, ZFS_TYPE_POOL));
+}
+
+/*
+ * For user property names, we allow all lowercase alphanumeric characters, plus
+ * a few useful punctuation characters.
+ */
+static int
+valid_char(char c)
+{
+ return ((c >= 'a' && c <= 'z') ||
+ (c >= '0' && c <= '9') ||
+ c == '-' || c == '_' || c == '.' || c == ':');
+}
+
+/*
+ * Returns true if this is a valid user-defined property (one with a ':').
+ */
+boolean_t
+zfs_prop_user(const char *name)
+{
+ int i;
+ char c;
+ boolean_t foundsep = B_FALSE;
+
+ for (i = 0; i < strlen(name); i++) {
+ c = name[i];
+ if (!valid_char(c))
+ return (B_FALSE);
+ if (c == ':')
+ foundsep = B_TRUE;
+ }
+
+ if (!foundsep)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
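+
+/*
+ * For example, "com.example:backup" is a valid user property name, while
+ * "compression" (no ':') and "com.Example:backup" (upper-case letter)
+ * are not.
+ */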
+
+/*
+ * Return the default value for the given property.
+ */
+const char *
+zfs_prop_default_string(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_strdefault);
+}
+
+uint64_t
+zfs_prop_default_numeric(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_numdefault);
+}
+
+/*
+ * Returns TRUE if the property is readonly.
+ */
+int
+zfs_prop_readonly(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_attr == prop_readonly);
+}
+
+/*
+ * Given a dataset property ID, returns the corresponding name.
+ * The zfs dataset property ID is assumed to be valid.
+ */
+const char *
+zfs_prop_to_name(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_name);
+}
+
+/*
+ * Given a pool property ID, returns the corresponding name.
+ * The pool property ID is assumed to be valid.
+ */
+const char *
+zpool_prop_to_name(zpool_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_name);
+}
+
+/*
+ * Returns TRUE if the property is inheritable.
+ */
+int
+zfs_prop_inheritable(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_attr == prop_inherit);
+}
+
+typedef struct zfs_index {
+ const char *name;
+ uint64_t index;
+} zfs_index_t;
+
+static zfs_index_t checksum_table[] = {
+ { "on", ZIO_CHECKSUM_ON },
+ { "off", ZIO_CHECKSUM_OFF },
+ { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
+ { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
+ { "sha256", ZIO_CHECKSUM_SHA256 },
+ { NULL }
+};
+
+static zfs_index_t compress_table[] = {
+ { "on", ZIO_COMPRESS_ON },
+ { "off", ZIO_COMPRESS_OFF },
+ { "lzjb", ZIO_COMPRESS_LZJB },
+ { "gzip", ZIO_COMPRESS_GZIP_6 }, /* the default gzip level */
+ { "gzip-1", ZIO_COMPRESS_GZIP_1 },
+ { "gzip-2", ZIO_COMPRESS_GZIP_2 },
+ { "gzip-3", ZIO_COMPRESS_GZIP_3 },
+ { "gzip-4", ZIO_COMPRESS_GZIP_4 },
+ { "gzip-5", ZIO_COMPRESS_GZIP_5 },
+ { "gzip-6", ZIO_COMPRESS_GZIP_6 },
+ { "gzip-7", ZIO_COMPRESS_GZIP_7 },
+ { "gzip-8", ZIO_COMPRESS_GZIP_8 },
+ { "gzip-9", ZIO_COMPRESS_GZIP_9 },
+ { NULL }
+};
+
+static zfs_index_t snapdir_table[] = {
+ { "hidden", ZFS_SNAPDIR_HIDDEN },
+ { "visible", ZFS_SNAPDIR_VISIBLE },
+ { NULL }
+};
+
+static zfs_index_t acl_mode_table[] = {
+ { "discard", ZFS_ACL_DISCARD },
+ { "groupmask", ZFS_ACL_GROUPMASK },
+ { "passthrough", ZFS_ACL_PASSTHROUGH },
+ { NULL }
+};
+
+static zfs_index_t acl_inherit_table[] = {
+ { "discard", ZFS_ACL_DISCARD },
+ { "noallow", ZFS_ACL_NOALLOW },
+ { "secure", ZFS_ACL_SECURE },
+ { "passthrough", ZFS_ACL_PASSTHROUGH },
+ { NULL }
+};
+
+static zfs_index_t copies_table[] = {
+ { "1", 1 },
+ { "2", 2 },
+ { "3", 3 },
+ { NULL }
+};
+
+static zfs_index_t *
+zfs_prop_index_table(zfs_prop_t prop)
+{
+ switch (prop) {
+ case ZFS_PROP_CHECKSUM:
+ return (checksum_table);
+ case ZFS_PROP_COMPRESSION:
+ return (compress_table);
+ case ZFS_PROP_SNAPDIR:
+ return (snapdir_table);
+ case ZFS_PROP_ACLMODE:
+ return (acl_mode_table);
+ case ZFS_PROP_ACLINHERIT:
+ return (acl_inherit_table);
+ case ZFS_PROP_COPIES:
+ return (copies_table);
+ default:
+ return (NULL);
+ }
+}
+
+
+/*
+ * Tables of index types, plus functions to convert between the user view
+ * (strings) and internal representation (uint64_t).
+ */
+int
+zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
+{
+ zfs_index_t *table;
+ int i;
+
+ if ((table = zfs_prop_index_table(prop)) == NULL)
+ return (-1);
+
+ for (i = 0; table[i].name != NULL; i++) {
+ if (strcmp(string, table[i].name) == 0) {
+ *index = table[i].index;
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+int
+zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
+{
+ zfs_index_t *table;
+ int i;
+
+ if ((table = zfs_prop_index_table(prop)) == NULL)
+ return (-1);
+
+ for (i = 0; table[i].name != NULL; i++) {
+ if (table[i].index == index) {
+ *string = table[i].name;
+ return (0);
+ }
+ }
+
+ return (-1);
+}
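+
+/*
+ * For example, zfs_prop_string_to_index(ZFS_PROP_COMPRESSION, "gzip", &idx)
+ * stores ZIO_COMPRESS_GZIP_6 in idx, while zfs_prop_index_to_string() maps
+ * ZIO_COMPRESS_GZIP_6 back to "gzip" (the first matching entry in
+ * compress_table), not "gzip-6".
+ */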
+
+#ifndef _KERNEL
+
+/*
+ * Returns a string describing the set of acceptable values for the given
+ * zfs property, or NULL if it cannot be set.
+ */
+const char *
+zfs_prop_values(zfs_prop_t prop)
+{
+ if (zfs_prop_table[prop].pd_types == ZFS_TYPE_POOL)
+ return (NULL);
+
+ return (zfs_prop_table[prop].pd_values);
+}
+
+/*
+ * Returns a string describing the set of acceptable values for the given
+ * zpool property, or NULL if it cannot be set.
+ */
+const char *
+zpool_prop_values(zfs_prop_t prop)
+{
+ if (zfs_prop_table[prop].pd_types != ZFS_TYPE_POOL)
+ return (NULL);
+
+ return (zfs_prop_table[prop].pd_values);
+}
+
+/*
+ * Returns TRUE if this property is a string type. Note that index types
+ * (compression, checksum) are treated as strings in userland, even though they
+ * are stored numerically on disk.
+ */
+int
+zfs_prop_is_string(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_proptype == prop_type_string ||
+ zfs_prop_table[prop].pd_proptype == prop_type_index);
+}
+
+/*
+ * Returns the column header for the given property. Used only in
+ * 'zfs list -o', but centralized here with the other property information.
+ */
+const char *
+zfs_prop_column_name(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_colname);
+}
+
+/*
+ * Returns whether the given property should be displayed right-justified for
+ * 'zfs list'.
+ */
+boolean_t
+zfs_prop_align_right(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_rightalign);
+}
+
+/*
+ * Determines the minimum width for the column, and indicates whether it's fixed
+ * or not. Only string columns are non-fixed.
+ */
+size_t
+zfs_prop_width(zfs_prop_t prop, boolean_t *fixed)
+{
+ prop_desc_t *pd = &zfs_prop_table[prop];
+ zfs_index_t *idx;
+ size_t ret;
+ int i;
+
+ *fixed = B_TRUE;
+
+ /*
+ * Start with the width of the column name.
+ */
+ ret = strlen(pd->pd_colname);
+
+ /*
+ * For fixed-width values, make sure the width is large enough to hold
+ * any possible value.
+ */
+ switch (pd->pd_proptype) {
+ case prop_type_number:
+ /*
+ * The maximum length of a human-readable number is 5 characters
+ * ("20.4M", for example).
+ */
+ if (ret < 5)
+ ret = 5;
+ /*
+ * 'creation' is handled specially because it's a number
+ * internally, but displayed as a date string.
+ */
+ if (prop == ZFS_PROP_CREATION)
+ *fixed = B_FALSE;
+ break;
+ case prop_type_boolean:
+ /*
+ * The maximum length of a boolean value is 3 characters, for
+ * "off".
+ */
+ if (ret < 3)
+ ret = 3;
+ break;
+ case prop_type_index:
+ idx = zfs_prop_index_table(prop);
+ for (i = 0; idx[i].name != NULL; i++) {
+ if (strlen(idx[i].name) > ret)
+ ret = strlen(idx[i].name);
+ }
+ break;
+
+ case prop_type_string:
+ *fixed = B_FALSE;
+ break;
+ }
+
+ return (ret);
+}
+
+#endif
diff --git a/sys/contrib/opensolaris/common/zfs/zfs_prop.h b/sys/contrib/opensolaris/common/zfs/zfs_prop.h
new file mode 100644
index 000000000000..133e740ce6bc
--- /dev/null
+++ b/sys/contrib/opensolaris/common/zfs/zfs_prop.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_PROP_H
+#define _ZFS_PROP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * For index types (e.g. compression and checksum), we want the numeric value
+ * in the kernel, but the string value in userland.
+ */
+typedef enum {
+ prop_type_number, /* numeric value */
+ prop_type_string, /* string value */
+ prop_type_boolean, /* boolean value */
+ prop_type_index /* numeric value indexed by string */
+} zfs_proptype_t;
+
+zfs_proptype_t zfs_prop_get_type(zfs_prop_t);
+size_t zfs_prop_width(zfs_prop_t, boolean_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_PROP_H */
diff --git a/sys/contrib/opensolaris/uts/common/Makefile.files b/sys/contrib/opensolaris/uts/common/Makefile.files
new file mode 100644
index 000000000000..1800e792fa1e
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/Makefile.files
@@ -0,0 +1,101 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+# This Makefile defines all file modules for the directory uts/common
+# and its children. These are the source files which may be considered
+# common to all SunOS systems.
+
+ZFS_COMMON_OBJS += \
+ arc.o \
+ bplist.o \
+ dbuf.o \
+ dmu.o \
+ dmu_send.o \
+ dmu_object.o \
+ dmu_objset.o \
+ dmu_traverse.o \
+ dmu_tx.o \
+ dnode.o \
+ dnode_sync.o \
+ dsl_dir.o \
+ dsl_dataset.o \
+ dsl_pool.o \
+ dsl_synctask.o \
+ dmu_zfetch.o \
+ dsl_prop.o \
+ fletcher.o \
+ gzip.o \
+ lzjb.o \
+ metaslab.o \
+ refcount.o \
+ sha256.o \
+ spa.o \
+ spa_config.o \
+ spa_errlog.o \
+ spa_history.o \
+ spa_misc.o \
+ space_map.o \
+ txg.o \
+ uberblock.o \
+ unique.o \
+ vdev.o \
+ vdev_cache.o \
+ vdev_label.o \
+ vdev_mirror.o \
+ vdev_missing.o \
+ vdev_queue.o \
+ vdev_raidz.o \
+ vdev_root.o \
+ zap.o \
+ zap_leaf.o \
+ zap_micro.o \
+ zfs_byteswap.o \
+ zfs_fm.o \
+ zfs_znode.o \
+ zil.o \
+ zio.o \
+ zio_checksum.o \
+ zio_compress.o \
+ zio_inject.o
+
+ZFS_SHARED_OBJS += \
+ zfs_namecheck.o \
+ zfs_prop.o
+
+ZFS_OBJS += \
+ $(ZFS_COMMON_OBJS) \
+ $(ZFS_SHARED_OBJS) \
+ zfs_acl.o \
+ zfs_ctldir.o \
+ zfs_dir.o \
+ zfs_ioctl.o \
+ zfs_log.o \
+ zfs_replay.o \
+ zfs_rlock.o \
+ zfs_vfsops.o \
+ zfs_vnops.o \
+ zvol.o
diff --git a/sys/contrib/opensolaris/uts/common/arch/amd64/atomic.S b/sys/contrib/opensolaris/uts/common/arch/amd64/atomic.S
new file mode 100644
index 000000000000..1fa3c05b3317
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/arch/amd64/atomic.S
@@ -0,0 +1,564 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+ .ident "%Z%%M% %I% %E% SMI"
+
+ .file "%M%"
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(_KERNEL)
+ /*
+ * Legacy kernel interfaces; they will go away (eventually).
+ */
+ ANSI_PRAGMA_WEAK2(cas8,atomic_cas_8,function)
+ ANSI_PRAGMA_WEAK2(cas32,atomic_cas_32,function)
+ ANSI_PRAGMA_WEAK2(cas64,atomic_cas_64,function)
+ ANSI_PRAGMA_WEAK2(caslong,atomic_cas_ulong,function)
+ ANSI_PRAGMA_WEAK2(casptr,atomic_cas_ptr,function)
+ ANSI_PRAGMA_WEAK2(atomic_and_long,atomic_and_ulong,function)
+ ANSI_PRAGMA_WEAK2(atomic_or_long,atomic_or_ulong,function)
+#endif
+
+ ENTRY(atomic_inc_8)
+ ALTENTRY(atomic_inc_uchar)
+ lock
+ incb (%rdi)
+ ret
+ SET_SIZE(atomic_inc_uchar)
+ SET_SIZE(atomic_inc_8)
+
+ ENTRY(atomic_inc_16)
+ ALTENTRY(atomic_inc_ushort)
+ lock
+ incw (%rdi)
+ ret
+ SET_SIZE(atomic_inc_ushort)
+ SET_SIZE(atomic_inc_16)
+
+ ENTRY(atomic_inc_32)
+ ALTENTRY(atomic_inc_uint)
+ lock
+ incl (%rdi)
+ ret
+ SET_SIZE(atomic_inc_uint)
+ SET_SIZE(atomic_inc_32)
+
+ ENTRY(atomic_inc_64)
+ ALTENTRY(atomic_inc_ulong)
+ lock
+ incq (%rdi)
+ ret
+ SET_SIZE(atomic_inc_ulong)
+ SET_SIZE(atomic_inc_64)
+
+ ENTRY(atomic_inc_8_nv)
+ ALTENTRY(atomic_inc_uchar_nv)
+ movb (%rdi), %al
+1:
+ leaq 1(%rax), %rcx
+ lock
+ cmpxchgb %cl, (%rdi)
+ jne 1b
+ movzbl %cl, %eax
+ ret
+ SET_SIZE(atomic_inc_uchar_nv)
+ SET_SIZE(atomic_inc_8_nv)
+
+ ENTRY(atomic_inc_16_nv)
+ ALTENTRY(atomic_inc_ushort_nv)
+ movw (%rdi), %ax
+1:
+ leaq 1(%rax), %rcx
+ lock
+ cmpxchgw %cx, (%rdi)
+ jne 1b
+ movzwl %cx, %eax
+ ret
+ SET_SIZE(atomic_inc_ushort_nv)
+ SET_SIZE(atomic_inc_16_nv)
+
+ ENTRY(atomic_inc_32_nv)
+ ALTENTRY(atomic_inc_uint_nv)
+ movl (%rdi), %eax
+1:
+ leaq 1(%rax), %rcx
+ lock
+ cmpxchgl %ecx, (%rdi)
+ jne 1b
+ movl %ecx, %eax
+ ret
+ SET_SIZE(atomic_inc_uint_nv)
+ SET_SIZE(atomic_inc_32_nv)
+
+ ENTRY(atomic_inc_64_nv)
+ ALTENTRY(atomic_inc_ulong_nv)
+ movq (%rdi), %rax
+1:
+ leaq 1(%rax), %rcx
+ lock
+ cmpxchgq %rcx, (%rdi)
+ jne 1b
+ movq %rcx, %rax
+ ret
+ SET_SIZE(atomic_inc_ulong_nv)
+ SET_SIZE(atomic_inc_64_nv)
+
+ ENTRY(atomic_dec_8)
+ ALTENTRY(atomic_dec_uchar)
+ lock
+ decb (%rdi)
+ ret
+ SET_SIZE(atomic_dec_uchar)
+ SET_SIZE(atomic_dec_8)
+
+ ENTRY(atomic_dec_16)
+ ALTENTRY(atomic_dec_ushort)
+ lock
+ decw (%rdi)
+ ret
+ SET_SIZE(atomic_dec_ushort)
+ SET_SIZE(atomic_dec_16)
+
+ ENTRY(atomic_dec_32)
+ ALTENTRY(atomic_dec_uint)
+ lock
+ decl (%rdi)
+ ret
+ SET_SIZE(atomic_dec_uint)
+ SET_SIZE(atomic_dec_32)
+
+ ENTRY(atomic_dec_64)
+ ALTENTRY(atomic_dec_ulong)
+ lock
+ decq (%rdi)
+ ret
+ SET_SIZE(atomic_dec_ulong)
+ SET_SIZE(atomic_dec_64)
+
+ ENTRY(atomic_dec_8_nv)
+ ALTENTRY(atomic_dec_uchar_nv)
+ movb (%rdi), %al
+1:
+ leaq -1(%rax), %rcx
+ lock
+ cmpxchgb %cl, (%rdi)
+ jne 1b
+ movzbl %cl, %eax
+ ret
+ SET_SIZE(atomic_dec_uchar_nv)
+ SET_SIZE(atomic_dec_8_nv)
+
+ ENTRY(atomic_dec_16_nv)
+ ALTENTRY(atomic_dec_ushort_nv)
+ movw (%rdi), %ax
+1:
+ leaq -1(%rax), %rcx
+ lock
+ cmpxchgw %cx, (%rdi)
+ jne 1b
+ movzwl %cx, %eax
+ ret
+ SET_SIZE(atomic_dec_ushort_nv)
+ SET_SIZE(atomic_dec_16_nv)
+
+ ENTRY(atomic_dec_32_nv)
+ ALTENTRY(atomic_dec_uint_nv)
+ movl (%rdi), %eax
+1:
+ leaq -1(%rax), %rcx
+ lock
+ cmpxchgl %ecx, (%rdi)
+ jne 1b
+ movl %ecx, %eax
+ ret
+ SET_SIZE(atomic_dec_uint_nv)
+ SET_SIZE(atomic_dec_32_nv)
+
+ ENTRY(atomic_dec_64_nv)
+ ALTENTRY(atomic_dec_ulong_nv)
+ movq (%rdi), %rax
+1:
+ leaq -1(%rax), %rcx
+ lock
+ cmpxchgq %rcx, (%rdi)
+ jne 1b
+ movq %rcx, %rax
+ ret
+ SET_SIZE(atomic_dec_ulong_nv)
+ SET_SIZE(atomic_dec_64_nv)
+
+ ENTRY(atomic_or_8)
+ ALTENTRY(atomic_or_uchar)
+ lock
+ orb %sil, (%rdi)
+ ret
+ SET_SIZE(atomic_or_uchar)
+ SET_SIZE(atomic_or_8)
+
+ ENTRY(atomic_or_16)
+ ALTENTRY(atomic_or_ushort)
+ lock
+ orw %si, (%rdi)
+ ret
+ SET_SIZE(atomic_or_ushort)
+ SET_SIZE(atomic_or_16)
+
+ ENTRY(atomic_or_32)
+ ALTENTRY(atomic_or_uint)
+ lock
+ orl %esi, (%rdi)
+ ret
+ SET_SIZE(atomic_or_uint)
+ SET_SIZE(atomic_or_32)
+
+ ENTRY(atomic_or_64)
+ ALTENTRY(atomic_or_ulong)
+ lock
+ orq %rsi, (%rdi)
+ ret
+ SET_SIZE(atomic_or_ulong)
+ SET_SIZE(atomic_or_64)
+
+ ENTRY(atomic_and_8)
+ ALTENTRY(atomic_and_uchar)
+ lock
+ andb %sil, (%rdi)
+ ret
+ SET_SIZE(atomic_and_uchar)
+ SET_SIZE(atomic_and_8)
+
+ ENTRY(atomic_and_16)
+ ALTENTRY(atomic_and_ushort)
+ lock
+ andw %si, (%rdi)
+ ret
+ SET_SIZE(atomic_and_ushort)
+ SET_SIZE(atomic_and_16)
+
+ ENTRY(atomic_and_32)
+ ALTENTRY(atomic_and_uint)
+ lock
+ andl %esi, (%rdi)
+ ret
+ SET_SIZE(atomic_and_uint)
+ SET_SIZE(atomic_and_32)
+
+ ENTRY(atomic_and_64)
+ ALTENTRY(atomic_and_ulong)
+ lock
+ andq %rsi, (%rdi)
+ ret
+ SET_SIZE(atomic_and_ulong)
+ SET_SIZE(atomic_and_64)
+
+ ENTRY(atomic_add_8_nv)
+ ALTENTRY(atomic_add_char_nv)
+ movb (%rdi), %al
+1:
+ movb %sil, %cl
+ addb %al, %cl
+ lock
+ cmpxchgb %cl, (%rdi)
+ jne 1b
+ movzbl %cl, %eax
+ ret
+ SET_SIZE(atomic_add_char_nv)
+ SET_SIZE(atomic_add_8_nv)
+
+ ENTRY(atomic_add_16_nv)
+ ALTENTRY(atomic_add_short_nv)
+ movw (%rdi), %ax
+1:
+ movw %si, %cx
+ addw %ax, %cx
+ lock
+ cmpxchgw %cx, (%rdi)
+ jne 1b
+ movzwl %cx, %eax
+ ret
+ SET_SIZE(atomic_add_short_nv)
+ SET_SIZE(atomic_add_16_nv)
+
+ ENTRY(atomic_add_32_nv)
+ ALTENTRY(atomic_add_int_nv)
+ movl (%rdi), %eax
+1:
+ movl %esi, %ecx
+ addl %eax, %ecx
+ lock
+ cmpxchgl %ecx, (%rdi)
+ jne 1b
+ movl %ecx, %eax
+ ret
+ SET_SIZE(atomic_add_int_nv)
+ SET_SIZE(atomic_add_32_nv)
+
+ ENTRY(atomic_add_64_nv)
+ ALTENTRY(atomic_add_ptr_nv)
+ ALTENTRY(atomic_add_long_nv)
+ movq (%rdi), %rax
+1:
+ movq %rsi, %rcx
+ addq %rax, %rcx
+ lock
+ cmpxchgq %rcx, (%rdi)
+ jne 1b
+ movq %rcx, %rax
+ ret
+ SET_SIZE(atomic_add_long_nv)
+ SET_SIZE(atomic_add_ptr_nv)
+ SET_SIZE(atomic_add_64_nv)
+
+ ENTRY(atomic_and_8_nv)
+ ALTENTRY(atomic_and_uchar_nv)
+ movb (%rdi), %al
+1:
+ movb %sil, %cl
+ andb %al, %cl
+ lock
+ cmpxchgb %cl, (%rdi)
+ jne 1b
+ movzbl %cl, %eax
+ ret
+ SET_SIZE(atomic_and_uchar_nv)
+ SET_SIZE(atomic_and_8_nv)
+
+ ENTRY(atomic_and_16_nv)
+ ALTENTRY(atomic_and_ushort_nv)
+ movw (%rdi), %ax
+1:
+ movw %si, %cx
+ andw %ax, %cx
+ lock
+ cmpxchgw %cx, (%rdi)
+ jne 1b
+ movzwl %cx, %eax
+ ret
+ SET_SIZE(atomic_and_ushort_nv)
+ SET_SIZE(atomic_and_16_nv)
+
+ ENTRY(atomic_and_32_nv)
+ ALTENTRY(atomic_and_uint_nv)
+ movl (%rdi), %eax
+1:
+ movl %esi, %ecx
+ andl %eax, %ecx
+ lock
+ cmpxchgl %ecx, (%rdi)
+ jne 1b
+ movl %ecx, %eax
+ ret
+ SET_SIZE(atomic_and_uint_nv)
+ SET_SIZE(atomic_and_32_nv)
+
+ ENTRY(atomic_and_64_nv)
+ ALTENTRY(atomic_and_ulong_nv)
+ movq (%rdi), %rax
+1:
+ movq %rsi, %rcx
+ andq %rax, %rcx
+ lock
+ cmpxchgq %rcx, (%rdi)
+ jne 1b
+ movq %rcx, %rax
+ ret
+ SET_SIZE(atomic_and_ulong_nv)
+ SET_SIZE(atomic_and_64_nv)
+
+ ENTRY(atomic_or_8_nv)
+ ALTENTRY(atomic_or_uchar_nv)
+ movb (%rdi), %al
+1:
+ movb %sil, %cl
+ orb %al, %cl
+ lock
+ cmpxchgb %cl, (%rdi)
+ jne 1b
+ movzbl %cl, %eax
+ ret
+	SET_SIZE(atomic_or_uchar_nv)
+	SET_SIZE(atomic_or_8_nv)
+
+ ENTRY(atomic_or_16_nv)
+ ALTENTRY(atomic_or_ushort_nv)
+ movw (%rdi), %ax
+1:
+ movw %si, %cx
+ orw %ax, %cx
+ lock
+ cmpxchgw %cx, (%rdi)
+ jne 1b
+ movzwl %cx, %eax
+ ret
+ SET_SIZE(atomic_or_ushort_nv)
+ SET_SIZE(atomic_or_16_nv)
+
+ ENTRY(atomic_or_32_nv)
+ ALTENTRY(atomic_or_uint_nv)
+ movl (%rdi), %eax
+1:
+ movl %esi, %ecx
+ orl %eax, %ecx
+ lock
+ cmpxchgl %ecx, (%rdi)
+ jne 1b
+ movl %ecx, %eax
+ ret
+ SET_SIZE(atomic_or_uint_nv)
+ SET_SIZE(atomic_or_32_nv)
+
+ ENTRY(atomic_or_64_nv)
+ ALTENTRY(atomic_or_ulong_nv)
+ movq (%rdi), %rax
+1:
+ movq %rsi, %rcx
+ orq %rax, %rcx
+ lock
+ cmpxchgq %rcx, (%rdi)
+ jne 1b
+ movq %rcx, %rax
+ ret
+ SET_SIZE(atomic_or_ulong_nv)
+ SET_SIZE(atomic_or_64_nv)
+
+ ENTRY(atomic_cas_8)
+ ALTENTRY(atomic_cas_uchar)
+ movzbl %sil, %eax
+ lock
+ cmpxchgb %dl, (%rdi)
+ ret
+ SET_SIZE(atomic_cas_uchar)
+ SET_SIZE(atomic_cas_8)
+
+ ENTRY(atomic_cas_16)
+ ALTENTRY(atomic_cas_ushort)
+ movzwl %si, %eax
+ lock
+ cmpxchgw %dx, (%rdi)
+ ret
+ SET_SIZE(atomic_cas_ushort)
+ SET_SIZE(atomic_cas_16)
+
+ ENTRY(atomic_cas_32)
+ ALTENTRY(atomic_cas_uint)
+ movl %esi, %eax
+ lock
+ cmpxchgl %edx, (%rdi)
+ ret
+ SET_SIZE(atomic_cas_uint)
+ SET_SIZE(atomic_cas_32)
+
+ ENTRY(atomic_cas_64)
+ ALTENTRY(atomic_cas_ulong)
+ ALTENTRY(atomic_cas_ptr)
+ movq %rsi, %rax
+ lock
+ cmpxchgq %rdx, (%rdi)
+ ret
+ SET_SIZE(atomic_cas_ptr)
+ SET_SIZE(atomic_cas_ulong)
+ SET_SIZE(atomic_cas_64)
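+
+/*
+ * Rough C equivalent (sketch) of the compare-and-swap routines above:
+ *
+ *	uint64_t
+ *	atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t new)
+ *	{
+ *		uint64_t old = *target;
+ *		if (old == cmp)
+ *			*target = new;
+ *		return (old);
+ *	}
+ *
+ * except that the compare and the store happen as a single atomic operation
+ * (lock cmpxchgq).  The caller knows the swap took effect when the returned
+ * value equals cmp.
+ */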
+
+ ENTRY(atomic_swap_8)
+ ALTENTRY(atomic_swap_uchar)
+ movzbl %sil, %eax
+ lock
+ xchgb %al, (%rdi)
+ ret
+ SET_SIZE(atomic_swap_uchar)
+ SET_SIZE(atomic_swap_8)
+
+ ENTRY(atomic_swap_16)
+ ALTENTRY(atomic_swap_ushort)
+ movzwl %si, %eax
+ lock
+ xchgw %ax, (%rdi)
+ ret
+ SET_SIZE(atomic_swap_ushort)
+ SET_SIZE(atomic_swap_16)
+
+ ENTRY(atomic_swap_32)
+ ALTENTRY(atomic_swap_uint)
+ movl %esi, %eax
+ lock
+ xchgl %eax, (%rdi)
+ ret
+ SET_SIZE(atomic_swap_uint)
+ SET_SIZE(atomic_swap_32)
+
+ ENTRY(atomic_swap_64)
+ ALTENTRY(atomic_swap_ulong)
+ ALTENTRY(atomic_swap_ptr)
+ movq %rsi, %rax
+ lock
+ xchgq %rax, (%rdi)
+ ret
+ SET_SIZE(atomic_swap_ptr)
+ SET_SIZE(atomic_swap_ulong)
+ SET_SIZE(atomic_swap_64)
+
+ ENTRY(atomic_set_long_excl)
+ xorl %eax, %eax
+ lock
+ btsq %rsi, (%rdi)
+ jnc 1f
+ decl %eax
+1:
+ ret
+ SET_SIZE(atomic_set_long_excl)
+
+ ENTRY(atomic_clear_long_excl)
+ xorl %eax, %eax
+ lock
+ btrq %rsi, (%rdi)
+ jc 1f
+ decl %eax
+1:
+ ret
+ SET_SIZE(atomic_clear_long_excl)
+
+ ENTRY(membar_enter)
+ ALTENTRY(membar_exit)
+ mfence
+ ret
+ SET_SIZE(membar_exit)
+ SET_SIZE(membar_enter)
+
+ ENTRY(membar_producer)
+ sfence
+ ret
+ SET_SIZE(membar_producer)
+
+ ENTRY(membar_consumer)
+ lfence
+ ret
+ SET_SIZE(membar_consumer)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/opensolaris/uts/common/arch/i386/atomic.S b/sys/contrib/opensolaris/uts/common/arch/i386/atomic.S
new file mode 100644
index 000000000000..4a1d37521119
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/arch/i386/atomic.S
@@ -0,0 +1,659 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+ .ident "%Z%%M% %I% %E% SMI"
+
+ .file "%M%"
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(_KERNEL)
+ /*
+ * Legacy kernel interfaces; they will go away (eventually).
+ */
+ ANSI_PRAGMA_WEAK2(cas8,atomic_cas_8,function)
+ ANSI_PRAGMA_WEAK2(cas32,atomic_cas_32,function)
+ ANSI_PRAGMA_WEAK2(cas64,atomic_cas_64,function)
+ ANSI_PRAGMA_WEAK2(caslong,atomic_cas_ulong,function)
+ ANSI_PRAGMA_WEAK2(casptr,atomic_cas_ptr,function)
+ ANSI_PRAGMA_WEAK2(atomic_and_long,atomic_and_ulong,function)
+ ANSI_PRAGMA_WEAK2(atomic_or_long,atomic_or_ulong,function)
+#endif
+
+ ENTRY(atomic_inc_8)
+ ALTENTRY(atomic_inc_uchar)
+ movl 4(%esp), %eax
+ lock
+ incb (%eax)
+ ret
+ SET_SIZE(atomic_inc_uchar)
+ SET_SIZE(atomic_inc_8)
+
+ ENTRY(atomic_inc_16)
+ ALTENTRY(atomic_inc_ushort)
+ movl 4(%esp), %eax
+ lock
+ incw (%eax)
+ ret
+ SET_SIZE(atomic_inc_ushort)
+ SET_SIZE(atomic_inc_16)
+
+ ENTRY(atomic_inc_32)
+ ALTENTRY(atomic_inc_uint)
+ ALTENTRY(atomic_inc_ulong)
+ movl 4(%esp), %eax
+ lock
+ incl (%eax)
+ ret
+ SET_SIZE(atomic_inc_ulong)
+ SET_SIZE(atomic_inc_uint)
+ SET_SIZE(atomic_inc_32)
+
+ ENTRY(atomic_inc_8_nv)
+ ALTENTRY(atomic_inc_uchar_nv)
+ movl 4(%esp), %edx
+ movb (%edx), %al
+1:
+ leal 1(%eax), %ecx
+ lock
+ cmpxchgb %cl, (%edx)
+ jne 1b
+ movzbl %cl, %eax
+ ret
+ SET_SIZE(atomic_inc_uchar_nv)
+ SET_SIZE(atomic_inc_8_nv)
+
+ ENTRY(atomic_inc_16_nv)
+ ALTENTRY(atomic_inc_ushort_nv)
+ movl 4(%esp), %edx
+ movw (%edx), %ax
+1:
+ leal 1(%eax), %ecx
+ lock
+ cmpxchgw %cx, (%edx)
+ jne 1b
+ movzwl %cx, %eax
+ ret
+ SET_SIZE(atomic_inc_ushort_nv)
+ SET_SIZE(atomic_inc_16_nv)
+
+ ENTRY(atomic_inc_32_nv)
+ ALTENTRY(atomic_inc_uint_nv)
+ ALTENTRY(atomic_inc_ulong_nv)
+ movl 4(%esp), %edx
+ movl (%edx), %eax
+1:
+ leal 1(%eax), %ecx
+ lock
+ cmpxchgl %ecx, (%edx)
+ jne 1b
+ movl %ecx, %eax
+ ret
+ SET_SIZE(atomic_inc_ulong_nv)
+ SET_SIZE(atomic_inc_uint_nv)
+ SET_SIZE(atomic_inc_32_nv)
+
+ ENTRY(atomic_inc_64)
+ ALTENTRY(atomic_inc_64_nv)
+ pushl %edi
+ pushl %ebx
+ movl 12(%esp), %edi
+ movl (%edi), %eax
+ movl 4(%edi), %edx
+1:
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ incl %ebx
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ lock
+ cmpxchg8b (%edi)
+ jne 1b
+ movl %ebx, %eax
+ movl %ecx, %edx
+ popl %ebx
+ popl %edi
+ ret
+ SET_SIZE(atomic_inc_64_nv)
+ SET_SIZE(atomic_inc_64)
+
+ ENTRY(atomic_dec_8)
+ ALTENTRY(atomic_dec_uchar)
+ movl 4(%esp), %eax
+ lock
+ decb (%eax)
+ ret
+ SET_SIZE(atomic_dec_uchar)
+ SET_SIZE(atomic_dec_8)
+
+ ENTRY(atomic_dec_16)
+ ALTENTRY(atomic_dec_ushort)
+ movl 4(%esp), %eax
+ lock
+ decw (%eax)
+ ret
+ SET_SIZE(atomic_dec_ushort)
+ SET_SIZE(atomic_dec_16)
+
+ ENTRY(atomic_dec_32)
+ ALTENTRY(atomic_dec_uint)
+ ALTENTRY(atomic_dec_ulong)
+ movl 4(%esp), %eax
+ lock
+ decl (%eax)
+ ret
+ SET_SIZE(atomic_dec_ulong)
+ SET_SIZE(atomic_dec_uint)
+ SET_SIZE(atomic_dec_32)
+
+ ENTRY(atomic_dec_8_nv)
+ ALTENTRY(atomic_dec_uchar_nv)
+ movl 4(%esp), %edx
+ movb (%edx), %al
+1:
+ leal -1(%eax), %ecx
+ lock
+ cmpxchgb %cl, (%edx)
+ jne 1b
+ movzbl %cl, %eax
+ ret
+ SET_SIZE(atomic_dec_uchar_nv)
+ SET_SIZE(atomic_dec_8_nv)
+
+ ENTRY(atomic_dec_16_nv)
+ ALTENTRY(atomic_dec_ushort_nv)
+ movl 4(%esp), %edx
+ movw (%edx), %ax
+1:
+ leal -1(%eax), %ecx
+ lock
+ cmpxchgw %cx, (%edx)
+ jne 1b
+ movzwl %cx, %eax
+ ret
+ SET_SIZE(atomic_dec_ushort_nv)
+ SET_SIZE(atomic_dec_16_nv)
+
+ ENTRY(atomic_dec_32_nv)
+ ALTENTRY(atomic_dec_uint_nv)
+ ALTENTRY(atomic_dec_ulong_nv)
+ movl 4(%esp), %edx
+ movl (%edx), %eax
+1:
+ leal -1(%eax), %ecx
+ lock
+ cmpxchgl %ecx, (%edx)
+ jne 1b
+ movl %ecx, %eax
+ ret
+ SET_SIZE(atomic_dec_ulong_nv)
+ SET_SIZE(atomic_dec_uint_nv)
+ SET_SIZE(atomic_dec_32_nv)
+
+ ENTRY(atomic_dec_64)
+ ALTENTRY(atomic_dec_64_nv)
+ pushl %edi
+ pushl %ebx
+ movl 12(%esp), %edi
+ movl (%edi), %eax
+ movl 4(%edi), %edx
+1:
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ not %ecx
+ not %ebx
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ lock
+ cmpxchg8b (%edi)
+ jne 1b
+ movl %ebx, %eax
+ movl %ecx, %edx
+ popl %ebx
+ popl %edi
+ ret
+ SET_SIZE(atomic_dec_64_nv)
+ SET_SIZE(atomic_dec_64)
+
+ ENTRY(atomic_or_8)
+ ALTENTRY(atomic_or_uchar)
+ movl 4(%esp), %eax
+ movb 8(%esp), %cl
+ lock
+ orb %cl, (%eax)
+ ret
+ SET_SIZE(atomic_or_uchar)
+ SET_SIZE(atomic_or_8)
+
+ ENTRY(atomic_or_16)
+ ALTENTRY(atomic_or_ushort)
+ movl 4(%esp), %eax
+ movw 8(%esp), %cx
+ lock
+ orw %cx, (%eax)
+ ret
+ SET_SIZE(atomic_or_ushort)
+ SET_SIZE(atomic_or_16)
+
+ ENTRY(atomic_or_32)
+ ALTENTRY(atomic_or_uint)
+ ALTENTRY(atomic_or_ulong)
+ movl 4(%esp), %eax
+ movl 8(%esp), %ecx
+ lock
+ orl %ecx, (%eax)
+ ret
+ SET_SIZE(atomic_or_ulong)
+ SET_SIZE(atomic_or_uint)
+ SET_SIZE(atomic_or_32)
+
+ ENTRY(atomic_and_8)
+ ALTENTRY(atomic_and_uchar)
+ movl 4(%esp), %eax
+ movb 8(%esp), %cl
+ lock
+ andb %cl, (%eax)
+ ret
+ SET_SIZE(atomic_and_uchar)
+ SET_SIZE(atomic_and_8)
+
+ ENTRY(atomic_and_16)
+ ALTENTRY(atomic_and_ushort)
+ movl 4(%esp), %eax
+ movw 8(%esp), %cx
+ lock
+ andw %cx, (%eax)
+ ret
+ SET_SIZE(atomic_and_ushort)
+ SET_SIZE(atomic_and_16)
+
+ ENTRY(atomic_and_32)
+ ALTENTRY(atomic_and_uint)
+ ALTENTRY(atomic_and_ulong)
+ movl 4(%esp), %eax
+ movl 8(%esp), %ecx
+ lock
+ andl %ecx, (%eax)
+ ret
+ SET_SIZE(atomic_and_ulong)
+ SET_SIZE(atomic_and_uint)
+ SET_SIZE(atomic_and_32)
+
+ ENTRY(atomic_add_8_nv)
+ ALTENTRY(atomic_add_char_nv)
+ movl 4(%esp), %edx
+ movb (%edx), %al
+1:
+ movl 8(%esp), %ecx
+ addb %al, %cl
+ lock
+ cmpxchgb %cl, (%edx)
+ jne 1b
+ movzbl %cl, %eax
+ ret
+ SET_SIZE(atomic_add_char_nv)
+ SET_SIZE(atomic_add_8_nv)
+
+ ENTRY(atomic_add_16_nv)
+ ALTENTRY(atomic_add_short_nv)
+ movl 4(%esp), %edx
+ movw (%edx), %ax
+1:
+ movl 8(%esp), %ecx
+ addw %ax, %cx
+ lock
+ cmpxchgw %cx, (%edx)
+ jne 1b
+ movzwl %cx, %eax
+ ret
+ SET_SIZE(atomic_add_short_nv)
+ SET_SIZE(atomic_add_16_nv)
+
+ ENTRY(atomic_add_32_nv)
+ ALTENTRY(atomic_add_int_nv)
+ ALTENTRY(atomic_add_ptr_nv)
+ ALTENTRY(atomic_add_long_nv)
+ movl 4(%esp), %edx
+ movl (%edx), %eax
+1:
+ movl 8(%esp), %ecx
+ addl %eax, %ecx
+ lock
+ cmpxchgl %ecx, (%edx)
+ jne 1b
+ movl %ecx, %eax
+ ret
+ SET_SIZE(atomic_add_long_nv)
+ SET_SIZE(atomic_add_ptr_nv)
+ SET_SIZE(atomic_add_int_nv)
+ SET_SIZE(atomic_add_32_nv)
+
+ ENTRY(atomic_add_64)
+ ALTENTRY(atomic_add_64_nv)
+ pushl %edi
+ pushl %ebx
+ movl 12(%esp), %edi
+ movl (%edi), %eax
+ movl 4(%edi), %edx
+1:
+ movl 16(%esp), %ebx
+ movl 20(%esp), %ecx
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ lock
+ cmpxchg8b (%edi)
+ jne 1b
+ movl %ebx, %eax
+ movl %ecx, %edx
+ popl %ebx
+ popl %edi
+ ret
+ SET_SIZE(atomic_add_64_nv)
+ SET_SIZE(atomic_add_64)
+
+ ENTRY(atomic_or_8_nv)
+ ALTENTRY(atomic_or_uchar_nv)
+ movl 4(%esp), %edx
+ movb (%edx), %al
+1:
+ movl 8(%esp), %ecx
+ orb %al, %cl
+ lock
+ cmpxchgb %cl, (%edx)
+ jne 1b
+ movzbl %cl, %eax
+ ret
+ SET_SIZE(atomic_or_uchar_nv)
+ SET_SIZE(atomic_or_8_nv)
+
+ ENTRY(atomic_or_16_nv)
+ ALTENTRY(atomic_or_ushort_nv)
+ movl 4(%esp), %edx
+ movw (%edx), %ax
+1:
+ movl 8(%esp), %ecx
+ orw %ax, %cx
+ lock
+ cmpxchgw %cx, (%edx)
+ jne 1b
+ movzwl %cx, %eax
+ ret
+ SET_SIZE(atomic_or_ushort_nv)
+ SET_SIZE(atomic_or_16_nv)
+
+ ENTRY(atomic_or_32_nv)
+ ALTENTRY(atomic_or_uint_nv)
+ ALTENTRY(atomic_or_ulong_nv)
+ movl 4(%esp), %edx
+ movl (%edx), %eax
+1:
+ movl 8(%esp), %ecx
+ orl %eax, %ecx
+ lock
+ cmpxchgl %ecx, (%edx)
+ jne 1b
+ movl %ecx, %eax
+ ret
+ SET_SIZE(atomic_or_ulong_nv)
+ SET_SIZE(atomic_or_uint_nv)
+ SET_SIZE(atomic_or_32_nv)
+
+ ENTRY(atomic_or_64)
+ ALTENTRY(atomic_or_64_nv)
+ pushl %edi
+ pushl %ebx
+ movl 12(%esp), %edi
+ movl (%edi), %eax
+ movl 4(%edi), %edx
+1:
+ movl 16(%esp), %ebx
+ movl 20(%esp), %ecx
+ orl %eax, %ebx
+ orl %edx, %ecx
+ lock
+ cmpxchg8b (%edi)
+ jne 1b
+ movl %ebx, %eax
+ movl %ecx, %edx
+ popl %ebx
+ popl %edi
+ ret
+ SET_SIZE(atomic_or_64_nv)
+ SET_SIZE(atomic_or_64)
+
+ ENTRY(atomic_and_8_nv)
+ ALTENTRY(atomic_and_uchar_nv)
+ movl 4(%esp), %edx
+ movb (%edx), %al
+1:
+ movl 8(%esp), %ecx
+ andb %al, %cl
+ lock
+ cmpxchgb %cl, (%edx)
+ jne 1b
+ movzbl %cl, %eax
+ ret
+ SET_SIZE(atomic_and_uchar_nv)
+ SET_SIZE(atomic_and_8_nv)
+
+ ENTRY(atomic_and_16_nv)
+ ALTENTRY(atomic_and_ushort_nv)
+ movl 4(%esp), %edx
+ movw (%edx), %ax
+1:
+ movl 8(%esp), %ecx
+ andw %ax, %cx
+ lock
+ cmpxchgw %cx, (%edx)
+ jne 1b
+ movzwl %cx, %eax
+ ret
+ SET_SIZE(atomic_and_ushort_nv)
+ SET_SIZE(atomic_and_16_nv)
+
+ ENTRY(atomic_and_32_nv)
+ ALTENTRY(atomic_and_uint_nv)
+ ALTENTRY(atomic_and_ulong_nv)
+ movl 4(%esp), %edx
+ movl (%edx), %eax
+1:
+ movl 8(%esp), %ecx
+ andl %eax, %ecx
+ lock
+ cmpxchgl %ecx, (%edx)
+ jne 1b
+ movl %ecx, %eax
+ ret
+ SET_SIZE(atomic_and_ulong_nv)
+ SET_SIZE(atomic_and_uint_nv)
+ SET_SIZE(atomic_and_32_nv)
+
+ ENTRY(atomic_and_64)
+ ALTENTRY(atomic_and_64_nv)
+ pushl %edi
+ pushl %ebx
+ movl 12(%esp), %edi
+ movl (%edi), %eax
+ movl 4(%edi), %edx
+1:
+ movl 16(%esp), %ebx
+ movl 20(%esp), %ecx
+ andl %eax, %ebx
+ andl %edx, %ecx
+ lock
+ cmpxchg8b (%edi)
+ jne 1b
+ movl %ebx, %eax
+ movl %ecx, %edx
+ popl %ebx
+ popl %edi
+ ret
+ SET_SIZE(atomic_and_64_nv)
+ SET_SIZE(atomic_and_64)
+
+ ENTRY(atomic_cas_8)
+ ALTENTRY(atomic_cas_uchar)
+ movl 4(%esp), %edx
+ movzbl 8(%esp), %eax
+ movb 12(%esp), %cl
+ lock
+ cmpxchgb %cl, (%edx)
+ ret
+ SET_SIZE(atomic_cas_uchar)
+ SET_SIZE(atomic_cas_8)
+
+ ENTRY(atomic_cas_16)
+ ALTENTRY(atomic_cas_ushort)
+ movl 4(%esp), %edx
+ movzwl 8(%esp), %eax
+ movw 12(%esp), %cx
+ lock
+ cmpxchgw %cx, (%edx)
+ ret
+ SET_SIZE(atomic_cas_ushort)
+ SET_SIZE(atomic_cas_16)
+
+ ENTRY(atomic_cas_32)
+ ALTENTRY(atomic_cas_uint)
+ ALTENTRY(atomic_cas_ulong)
+ ALTENTRY(atomic_cas_ptr)
+ movl 4(%esp), %edx
+ movl 8(%esp), %eax
+ movl 12(%esp), %ecx
+ lock
+ cmpxchgl %ecx, (%edx)
+ ret
+ SET_SIZE(atomic_cas_ptr)
+ SET_SIZE(atomic_cas_ulong)
+ SET_SIZE(atomic_cas_uint)
+ SET_SIZE(atomic_cas_32)
+
+ ENTRY(atomic_cas_64)
+ pushl %ebx
+ pushl %esi
+ movl 12(%esp), %esi
+ movl 16(%esp), %eax
+ movl 20(%esp), %edx
+ movl 24(%esp), %ebx
+ movl 28(%esp), %ecx
+ lock
+ cmpxchg8b (%esi)
+ popl %esi
+ popl %ebx
+ ret
+ SET_SIZE(atomic_cas_64)
+
+ ENTRY(atomic_swap_8)
+ ALTENTRY(atomic_swap_uchar)
+ movl 4(%esp), %edx
+ movzbl 8(%esp), %eax
+ lock
+ xchgb %al, (%edx)
+ ret
+ SET_SIZE(atomic_swap_uchar)
+ SET_SIZE(atomic_swap_8)
+
+ ENTRY(atomic_swap_16)
+ ALTENTRY(atomic_swap_ushort)
+ movl 4(%esp), %edx
+ movzwl 8(%esp), %eax
+ lock
+ xchgw %ax, (%edx)
+ ret
+ SET_SIZE(atomic_swap_ushort)
+ SET_SIZE(atomic_swap_16)
+
+ ENTRY(atomic_swap_32)
+ ALTENTRY(atomic_swap_uint)
+ ALTENTRY(atomic_swap_ptr)
+ ALTENTRY(atomic_swap_ulong)
+ movl 4(%esp), %edx
+ movl 8(%esp), %eax
+ lock
+ xchgl %eax, (%edx)
+ ret
+ SET_SIZE(atomic_swap_ulong)
+ SET_SIZE(atomic_swap_ptr)
+ SET_SIZE(atomic_swap_uint)
+ SET_SIZE(atomic_swap_32)
+
+ ENTRY(atomic_swap_64)
+ pushl %esi
+ pushl %ebx
+ movl 12(%esp), %esi
+ movl 16(%esp), %ebx
+ movl 20(%esp), %ecx
+ movl (%esi), %eax
+ movl 4(%esi), %edx
+1:
+ lock
+ cmpxchg8b (%esi)
+ jne 1b
+ popl %ebx
+ popl %esi
+ ret
+ SET_SIZE(atomic_swap_64)
+
+ ENTRY(atomic_set_long_excl)
+ movl 4(%esp), %edx
+ movl 8(%esp), %ecx
+ xorl %eax, %eax
+ lock
+ btsl %ecx, (%edx)
+ jnc 1f
+ decl %eax
+1:
+ ret
+ SET_SIZE(atomic_set_long_excl)
+
+ ENTRY(atomic_clear_long_excl)
+ movl 4(%esp), %edx
+ movl 8(%esp), %ecx
+ xorl %eax, %eax
+ lock
+ btrl %ecx, (%edx)
+ jc 1f
+ decl %eax
+1:
+ ret
+ SET_SIZE(atomic_clear_long_excl)
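+
+/*
+ * Rough C sketch of the two bit-lock routines above (illustrative only):
+ * each atomically tests and sets (or clears) bit 'bit' of *target, and
+ * returns 0 if this call changed the bit, or -1 if the bit was already
+ * in the requested state.
+ *
+ *	int
+ *	atomic_set_long_excl(volatile ulong_t *target, uint_t bit)
+ *	{
+ *		ulong_t old, mask = 1UL << bit;
+ *		do {
+ *			old = *target;
+ *			if (old & mask)
+ *				return (-1);
+ *		} while (atomic_cas_ulong(target, old, old | mask) != old);
+ *		return (0);
+ *	}
+ */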
+
+ ENTRY(membar_enter)
+ ALTENTRY(membar_exit)
+ ALTENTRY(membar_producer)
+ ALTENTRY(membar_consumer)
+ lock
+ xorl $0, (%esp)
+ ret
+ SET_SIZE(membar_consumer)
+ SET_SIZE(membar_producer)
+ SET_SIZE(membar_exit)
+ SET_SIZE(membar_enter)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/opensolaris/uts/common/fs/dnlc.c b/sys/contrib/opensolaris/uts/common/fs/dnlc.c
new file mode 100644
index 000000000000..48daf369b1db
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/dnlc.c
@@ -0,0 +1,824 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/dnlc.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+#include <sys/kstat.h>
+#include <sys/atomic.h>
+#include <sys/taskq.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+
+#define TRACE_0(...) do { } while (0)
+#define TRACE_2(...) do { } while (0)
+#define TRACE_4(...) do { } while (0)
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dnlc, CTLFLAG_RW, 0, "ZFS namecache");
+
+/*
+ * Directory name lookup cache.
+ * Based on code originally done by Robert Elz at Melbourne.
+ *
+ * Names found by directory scans are retained in a cache
+ * for future reference.  Each hash chain is ordered by LRU.
+ * The cache is indexed by a hash value obtained from (vp, name),
+ * where vp refers to the directory containing the name.
+ */
+
+/*
+ * Tunable nc_hashavelen is the average length desired for this chain, from
+ * which the size of the nc_hash table is derived at create time.
+ */
+#define NC_HASHAVELEN_DEFAULT 4
+int nc_hashavelen = NC_HASHAVELEN_DEFAULT;
+
+/*
+ * NC_MOVETOFRONT is the move-to-front threshold: if the hash lookup
+ * depth exceeds this value, we move the looked-up entry to the front of
+ * its hash chain. The idea is to make sure that the most frequently
+ * accessed entries are found most quickly (by keeping them near the
+ * front of their hash chains).
+ */
+#define NC_MOVETOFRONT 2
+
+/*
+ *
+ * DNLC_MAX_RELE is used to size an array on the stack when releasing
+ * vnodes. This array is used rather than calling VN_RELE() inline because
+ * all dnlc locks must be dropped by that time in order to avoid a
+ * possible deadlock. This deadlock occurs when the dnlc holds the last
+ * reference to the vnode and so the VOP_INACTIVE vector is called which
+ * can in turn call back into the dnlc. A global array was used but had
+ * many problems:
+ * 1) It didn't actually have an upper bound on the array size, as
+ *    entries could be added after the purge had started.
+ * 2) The locking scheme caused a hang.
+ * 3) It caused serialisation on the global lock.
+ * 4) The array was often unnecessarily huge.
+ *
+ * Note the current value 8 allows up to 4 cache entries (to be purged
+ * from each hash chain), before having to cycle around and retry.
+ * This ought to be ample given that nc_hashavelen is typically very small.
+ */
+#define DNLC_MAX_RELE 8 /* must be even */
+
+/*
+ * Hash table of name cache entries for fast lookup, dynamically
+ * allocated at startup.
+ */
+nc_hash_t *nc_hash;
+
+/*
+ * Rotors. Used to select entries on a round-robin basis.
+ */
+static nc_hash_t *dnlc_purge_fs1_rotor;
+static nc_hash_t *dnlc_free_rotor;
+
+/*
+ * # of dnlc entries (uninitialized)
+ *
+ * The initial value of -1 serves as a sentinel meaning "not set by the
+ * administrator"; dnlc_init() then computes a suitable size automatically.
+ */
+int ncsize = -1;
+TUNABLE_INT("vfs.zfs.dnlc.ncsize", &ncsize);
+SYSCTL_INT(_vfs_zfs_dnlc, OID_AUTO, ncsize, CTLFLAG_RDTUN, &ncsize, 0,
+ "Number of DNLC entries");
+volatile uint32_t dnlc_nentries = 0; /* current num of name cache entries */
+SYSCTL_UINT(_vfs_zfs_dnlc, OID_AUTO, nentries, CTLFLAG_RD,
+ __DEVOLATILE(u_int *, &dnlc_nentries), 0, "Number of DNLC entries in use");
+static int nc_hashsz; /* size of hash table */
+static int nc_hashmask; /* size of hash table minus 1 */
+
+/*
+ * The dnlc_reduce_cache() taskq queue is activated when there are
+ * ncsize name cache entries and if no parameter is provided, it reduces
+ * the size down to dnlc_nentries_low_water, which is by default one
+ * hundredth less than ncsize (i.e. 99% of ncsize).
+ *
+ * If a parameter is provided to dnlc_reduce_cache(), then we reduce
+ * the size down based on ncsize_onepercent - where ncsize_onepercent
+ * is 1% of ncsize; however, we never let dnlc_reduce_cache() reduce
+ * the size below 3% of ncsize (ncsize_min_percent).
+ */
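+/*
+ * Worked example (hypothetical numbers, not defaults set by this file):
+ * with ncsize = 20000 and dnlc_low_water_divisor = 100,
+ * dnlc_nentries_low_water = 20000 - 20000/100 = 19800 (99% of ncsize),
+ * ncsize_onepercent = 200 and ncsize_min_percent = 600.  A caller passing
+ * a reduce_percent of 5 thus asks for about 200 * 5 = 1000 entries to be
+ * freed, but the cache is never reduced below 600 entries.
+ */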
+#define DNLC_LOW_WATER_DIVISOR_DEFAULT 100
+uint_t dnlc_low_water_divisor = DNLC_LOW_WATER_DIVISOR_DEFAULT;
+uint_t dnlc_nentries_low_water;
+int dnlc_reduce_idle = 1; /* no locking needed */
+uint_t ncsize_onepercent;
+uint_t ncsize_min_percent;
+
+/*
+ * If dnlc_nentries hits dnlc_max_nentries (twice ncsize)
+ * then this means the dnlc_reduce_cache() taskq is failing to
+ * keep up. In this case we refuse to add new entries to the dnlc
+ * until the taskq catches up.
+ */
+uint_t dnlc_max_nentries; /* twice ncsize */
+SYSCTL_UINT(_vfs_zfs_dnlc, OID_AUTO, max_nentries, CTLFLAG_RD,
+ &dnlc_max_nentries, 0, "Maximum number of DNLC entries");
+uint64_t dnlc_max_nentries_cnt = 0; /* statistic on times we failed */
+
+/*
+ * Tunable to define when we should just remove items from
+ * the end of the chain.
+ */
+#define DNLC_LONG_CHAIN 8
+uint_t dnlc_long_chain = DNLC_LONG_CHAIN;
+
+/*
+ * ncstats has been deprecated, due to the integer size of the counters
+ * which can easily overflow in the dnlc.
+ * It is maintained (at some expense) for compatibility.
+ * The preferred interface is the kstat accessible nc_stats below.
+ */
+struct ncstats ncstats;
+
+struct nc_stats ncs = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+ { "negative_cache_hits", KSTAT_DATA_UINT64 },
+ { "enters", KSTAT_DATA_UINT64 },
+ { "double_enters", KSTAT_DATA_UINT64 },
+ { "purge_total_entries", KSTAT_DATA_UINT64 },
+ { "purge_all", KSTAT_DATA_UINT64 },
+ { "purge_vp", KSTAT_DATA_UINT64 },
+ { "purge_vfs", KSTAT_DATA_UINT64 },
+ { "purge_fs1", KSTAT_DATA_UINT64 },
+ { "pick_free", KSTAT_DATA_UINT64 },
+ { "pick_heuristic", KSTAT_DATA_UINT64 },
+ { "pick_last", KSTAT_DATA_UINT64 },
+};
+
+static int doingcache = 1;
+TUNABLE_INT("vfs.zfs.dnlc.enable", &doingcache);
+SYSCTL_INT(_vfs_zfs_dnlc, OID_AUTO, enable, CTLFLAG_RDTUN, &doingcache, 0,
+ "Enable/disable name cache");
+
+vnode_t negative_cache_vnode;
+
+/*
+ * Insert entry at the front of the queue
+ */
+#define nc_inshash(ncp, hp) \
+{ \
+ (ncp)->hash_next = (hp)->hash_next; \
+ (ncp)->hash_prev = (ncache_t *)(hp); \
+ (hp)->hash_next->hash_prev = (ncp); \
+ (hp)->hash_next = (ncp); \
+}
+
+/*
+ * Remove entry from hash queue
+ */
+#define nc_rmhash(ncp) \
+{ \
+ (ncp)->hash_prev->hash_next = (ncp)->hash_next; \
+ (ncp)->hash_next->hash_prev = (ncp)->hash_prev; \
+ (ncp)->hash_prev = NULL; \
+ (ncp)->hash_next = NULL; \
+}
+
+/*
+ * Free an entry.
+ */
+#define dnlc_free(ncp) \
+{ \
+ kmem_free((ncp), sizeof (ncache_t) + (ncp)->namlen); \
+ atomic_add_32(&dnlc_nentries, -1); \
+}
+
+
+/*
+ * Cached directory info.
+ * ======================
+ */
+
+/*
+ * Cached directory free space hash function.
+ * Needs the free space handle and the dcp to get the hash table size
+ * Returns the hash index.
+ */
+#define DDFHASH(handle, dcp) ((handle >> 2) & (dcp)->dc_fhash_mask)
+
+/*
+ * Cached directory name entry hash function.
+ * Uses the name and returns in the input arguments the hash and the name
+ * length.
+ */
+#define DNLC_DIR_HASH(name, hash, namelen) \
+ { \
+ char Xc, *Xcp; \
+ hash = *name; \
+ for (Xcp = (name + 1); (Xc = *Xcp) != 0; Xcp++) \
+ hash = (hash << 4) + hash + Xc; \
+ ASSERT((Xcp - (name)) <= ((1 << NBBY) - 1)); \
+ namelen = Xcp - (name); \
+ }
+
+/* special dircache_t pointer to indicate error should be returned */
+/*
+ * The anchor directory cache pointer can contain 3 types of values,
+ * 1) NULL: No directory cache
+ * 2) DC_RET_LOW_MEM: There was a directory cache that was found to be
+ *    too big, or a memory shortage occurred. This value remains in the
+ *    pointer until a dnlc_dir_start(), which returns a DNOMEM error.
+ * This is kludgy but efficient and only visible in this source file.
+ * 3) A valid cache pointer.
+ */
+#define DC_RET_LOW_MEM (dircache_t *)1
+#define VALID_DIR_CACHE(dcp) ((dircache_t *)(dcp) > DC_RET_LOW_MEM)
+
+/* Prototypes */
+static ncache_t *dnlc_get(uchar_t namlen);
+static ncache_t *dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash);
+static void do_dnlc_reduce_cache(void *);
+
+static kstat_t *dnlc_ksp = NULL;
+
+/*
+ * Initialize the directory cache.
+ */
+static void
+dnlc_init(void *arg __unused)
+{
+ nc_hash_t *hp;
+ int i;
+
+ /*
+ * Set up the size of the dnlc (ncsize) and its low water mark.
+ */
+ if (ncsize == -1) {
+ /* calculate a reasonable size for the low water */
+#if 0
+ dnlc_nentries_low_water = 4 * (v.v_proc + maxusers) + 320;
+#else
+ dnlc_nentries_low_water = vm_kmem_size / 20480;
+#endif
+ ncsize = dnlc_nentries_low_water +
+ (dnlc_nentries_low_water / dnlc_low_water_divisor);
+ } else {
+ /* don't change the user specified ncsize */
+ dnlc_nentries_low_water =
+ ncsize - (ncsize / dnlc_low_water_divisor);
+ }
+ if (ncsize <= 0) {
+ doingcache = 0;
+ ncsize = 0;
+ cmn_err(CE_NOTE, "name cache (dnlc) disabled");
+ return;
+ }
+ dnlc_max_nentries = ncsize * 2;
+ ncsize_onepercent = ncsize / 100;
+ ncsize_min_percent = ncsize_onepercent * 3;
+
+ /*
+ * Initialise the hash table.
+ * Compute hash size rounding to the next power of two.
+ */
+ nc_hashsz = ncsize / nc_hashavelen;
+ nc_hashsz = 1 << highbit(nc_hashsz);
+ nc_hashmask = nc_hashsz - 1;
+ nc_hash = kmem_zalloc(nc_hashsz * sizeof (*nc_hash), KM_SLEEP);
+ for (i = 0; i < nc_hashsz; i++) {
+ hp = (nc_hash_t *)&nc_hash[i];
+ mutex_init(&hp->hash_lock, NULL, MUTEX_DEFAULT, NULL);
+ hp->hash_next = (ncache_t *)hp;
+ hp->hash_prev = (ncache_t *)hp;
+ }
+
+ /*
+ * Initialize rotors
+ */
+ dnlc_free_rotor = dnlc_purge_fs1_rotor = &nc_hash[0];
+
+ /*
+ * Initialise the reference count of the negative cache vnode to 1
+ * so that it never goes away (VOP_INACTIVE isn't called on it).
+ */
+ negative_cache_vnode.v_count = 1;
+ negative_cache_vnode.v_holdcnt = 1;
+ mtx_init(&negative_cache_vnode.v_interlock, "vnode interlock", NULL,
+ MTX_DEF);
+
+ /*
+	 * Initialise kstats - both the old compatibility raw kind and
+ * the more extensive named stats.
+ */
+ dnlc_ksp = kstat_create("zfs", 0, "dnlcstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (ncs) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (dnlc_ksp) {
+ dnlc_ksp->ks_data = (void *) &ncs;
+ kstat_install(dnlc_ksp);
+ }
+}
+
+static void
+dnlc_fini(void *arg __unused)
+{
+ nc_hash_t *hp;
+ int i;
+
+ if (dnlc_ksp != NULL) {
+ kstat_delete(dnlc_ksp);
+ dnlc_ksp = NULL;
+ }
+
+ mtx_destroy(&negative_cache_vnode.v_interlock);
+
+ for (i = 0; i < nc_hashsz; i++) {
+ hp = (nc_hash_t *)&nc_hash[i];
+ mutex_destroy(&hp->hash_lock);
+ }
+ kmem_free(nc_hash, nc_hashsz * sizeof (*nc_hash));
+}
+
+/*
+ * Add a name to the directory cache.
+ *
+ * This function is basically identical with
+ * dnlc_enter(). The difference is that when the
+ * desired dnlc entry is found, the vnode in the
+ * ncache is compared with the vnode passed in.
+ *
+ * If they are not equal then the ncache is
+ * updated with the passed in vnode. Otherwise
+ * it just frees up the newly allocated dnlc entry.
+ */
+void
+dnlc_update(vnode_t *dp, char *name, vnode_t *vp)
+{
+ ncache_t *ncp;
+ ncache_t *tcp;
+ vnode_t *tvp;
+ nc_hash_t *hp;
+ int hash;
+ uchar_t namlen;
+
+ TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_update_start:");
+
+ if (!doingcache) {
+ TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
+ "dnlc_update_end:(%S) %d", "not caching", 0);
+ return;
+ }
+
+ /*
+ * Get a new dnlc entry and initialize it now.
+ * If we fail to get a new entry, call dnlc_remove() to purge
+ * any existing dnlc entry including negative cache (DNLC_NO_VNODE)
+ * entry.
+ * Failure to clear an existing entry could result in false dnlc
+ * lookup (negative/stale entry).
+ */
+ DNLCHASH(name, dp, hash, namlen);
+ if ((ncp = dnlc_get(namlen)) == NULL) {
+ dnlc_remove(dp, name);
+ return;
+ }
+ ncp->dp = dp;
+ VN_HOLD(dp);
+ ncp->vp = vp;
+ VN_HOLD(vp);
+ bcopy(name, ncp->name, namlen + 1); /* name and null */
+ ncp->hash = hash;
+ hp = &nc_hash[hash & nc_hashmask];
+
+ mutex_enter(&hp->hash_lock);
+ if ((tcp = dnlc_search(dp, name, namlen, hash)) != NULL) {
+ if (tcp->vp != vp) {
+ tvp = tcp->vp;
+ tcp->vp = vp;
+ mutex_exit(&hp->hash_lock);
+ VN_RELE(tvp);
+ ncstats.enters++;
+ ncs.ncs_enters.value.ui64++;
+ TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
+ "dnlc_update_end:(%S) %d", "done", ncstats.enters);
+ } else {
+ mutex_exit(&hp->hash_lock);
+ VN_RELE(vp);
+ ncstats.dbl_enters++;
+ ncs.ncs_dbl_enters.value.ui64++;
+ TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
+ "dnlc_update_end:(%S) %d",
+ "dbl enter", ncstats.dbl_enters);
+ }
+ VN_RELE(dp);
+ dnlc_free(ncp); /* crfree done here */
+ return;
+ }
+ /*
+ * insert the new entry, since it is not in dnlc yet
+ */
+ nc_inshash(ncp, hp);
+ mutex_exit(&hp->hash_lock);
+ ncstats.enters++;
+ ncs.ncs_enters.value.ui64++;
+ TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
+ "dnlc_update_end:(%S) %d", "done", ncstats.enters);
+}
+
+/*
+ * Look up a name in the directory name cache.
+ *
+ * Return a doubly-held vnode if found: one hold so that it may
+ * remain in the cache for other users, the other hold so that
+ * the cache is not re-cycled and the identity of the vnode is
+ * lost before the caller can use the vnode.
+ */
+vnode_t *
+dnlc_lookup(vnode_t *dp, char *name)
+{
+ ncache_t *ncp;
+ nc_hash_t *hp;
+ vnode_t *vp;
+ int hash, depth;
+ uchar_t namlen;
+
+ TRACE_2(TR_FAC_NFS, TR_DNLC_LOOKUP_START,
+ "dnlc_lookup_start:dp %x name %s", dp, name);
+
+ if (!doingcache) {
+ TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
+ "dnlc_lookup_end:%S %d vp %x name %s",
+ "not_caching", 0, NULL, name);
+ return (NULL);
+ }
+
+ DNLCHASH(name, dp, hash, namlen);
+ depth = 1;
+ hp = &nc_hash[hash & nc_hashmask];
+ mutex_enter(&hp->hash_lock);
+
+ for (ncp = hp->hash_next; ncp != (ncache_t *)hp;
+ ncp = ncp->hash_next) {
+ if (ncp->hash == hash && /* fast signature check */
+ ncp->dp == dp &&
+ ncp->namlen == namlen &&
+ bcmp(ncp->name, name, namlen) == 0) {
+ /*
+ * Move this entry to the head of its hash chain
+ * if it's not already close.
+ */
+ if (depth > NC_MOVETOFRONT) {
+ ncache_t *next = ncp->hash_next;
+ ncache_t *prev = ncp->hash_prev;
+
+ prev->hash_next = next;
+ next->hash_prev = prev;
+ ncp->hash_next = next = hp->hash_next;
+ ncp->hash_prev = (ncache_t *)hp;
+ next->hash_prev = ncp;
+ hp->hash_next = ncp;
+
+ ncstats.move_to_front++;
+ }
+
+ /*
+ * Put a hold on the vnode now so its identity
+ * can't change before the caller has a chance to
+ * put a hold on it.
+ */
+ vp = ncp->vp;
+ VN_HOLD(vp);
+ mutex_exit(&hp->hash_lock);
+ ncstats.hits++;
+ ncs.ncs_hits.value.ui64++;
+ if (vp == DNLC_NO_VNODE) {
+ ncs.ncs_neg_hits.value.ui64++;
+ }
+ TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
+ "dnlc_lookup_end:%S %d vp %x name %s",
+ "hit", ncstats.hits, vp, name);
+ return (vp);
+ }
+ depth++;
+ }
+
+ mutex_exit(&hp->hash_lock);
+ ncstats.misses++;
+ ncs.ncs_misses.value.ui64++;
+ TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
+ "dnlc_lookup_end:%S %d vp %x name %s", "miss", ncstats.misses,
+ NULL, name);
+ return (NULL);
+}
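+
+/*
+ * Illustrative consumer sketch (not part of this interface;
+ * fs_lookup_on_disk() is a hypothetical stand-in for the filesystem's
+ * real directory lookup routine):
+ *
+ *	vp = dnlc_lookup(dvp, name);
+ *	if (vp == DNLC_NO_VNODE) {
+ *		VN_RELE(vp);
+ *		error = ENOENT;
+ *	} else if (vp == NULL) {
+ *		error = fs_lookup_on_disk(dvp, name, &vp);
+ *		if (error == 0)
+ *			dnlc_update(dvp, name, vp);
+ *	} else {
+ *		error = 0;
+ *	}
+ */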
+
+/*
+ * Remove an entry in the directory name cache.
+ */
+void
+dnlc_remove(vnode_t *dp, char *name)
+{
+ ncache_t *ncp;
+ nc_hash_t *hp;
+ uchar_t namlen;
+ int hash;
+
+ if (!doingcache)
+ return;
+ DNLCHASH(name, dp, hash, namlen);
+ hp = &nc_hash[hash & nc_hashmask];
+
+ mutex_enter(&hp->hash_lock);
+ if (ncp = dnlc_search(dp, name, namlen, hash)) {
+ /*
+ * Free up the entry
+ */
+ nc_rmhash(ncp);
+ mutex_exit(&hp->hash_lock);
+ VN_RELE(ncp->vp);
+ VN_RELE(ncp->dp);
+ dnlc_free(ncp);
+ return;
+ }
+ mutex_exit(&hp->hash_lock);
+}
+
+/*
+ * Purge cache entries referencing a vfsp. Caller supplies a count
+ * of entries to purge; up to that many will be freed. A count of
+ * zero indicates that all such entries should be purged. Returns
+ * the number of entries that were purged.
+ */
+int
+dnlc_purge_vfsp(vfs_t *vfsp, int count)
+{
+ nc_hash_t *nch;
+ ncache_t *ncp;
+ int n = 0;
+ int index;
+ int i;
+ vnode_t *nc_rele[DNLC_MAX_RELE];
+
+ if (!doingcache)
+ return (0);
+
+ ncstats.purges++;
+ ncs.ncs_purge_vfs.value.ui64++;
+
+ for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
+ index = 0;
+ mutex_enter(&nch->hash_lock);
+ ncp = nch->hash_next;
+ while (ncp != (ncache_t *)nch) {
+ ncache_t *np;
+
+ np = ncp->hash_next;
+ ASSERT(ncp->dp != NULL);
+ ASSERT(ncp->vp != NULL);
+ if ((ncp->dp->v_vfsp == vfsp) ||
+ (ncp->vp->v_vfsp == vfsp)) {
+ n++;
+ nc_rele[index++] = ncp->vp;
+ nc_rele[index++] = ncp->dp;
+ nc_rmhash(ncp);
+ dnlc_free(ncp);
+ ncs.ncs_purge_total.value.ui64++;
+ if (index == DNLC_MAX_RELE) {
+ ncp = np;
+ break;
+ }
+ if (count != 0 && n >= count) {
+ break;
+ }
+ }
+ ncp = np;
+ }
+ mutex_exit(&nch->hash_lock);
+ /* Release holds on all the vnodes now that we have no locks */
+ for (i = 0; i < index; i++) {
+ VN_RELE(nc_rele[i]);
+ }
+ if (count != 0 && n >= count) {
+ return (n);
+ }
+ if (ncp != (ncache_t *)nch) {
+ nch--; /* Do current hash chain again */
+ }
+ }
+ return (n);
+}
+
+/*
+ * Utility routine to search for a cache entry. Return the
+ * ncache entry if found, NULL otherwise.
+ */
+static ncache_t *
+dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash)
+{
+ nc_hash_t *hp;
+ ncache_t *ncp;
+
+ hp = &nc_hash[hash & nc_hashmask];
+
+ for (ncp = hp->hash_next; ncp != (ncache_t *)hp; ncp = ncp->hash_next) {
+ if (ncp->hash == hash &&
+ ncp->dp == dp &&
+ ncp->namlen == namlen &&
+ bcmp(ncp->name, name, namlen) == 0)
+ return (ncp);
+ }
+ return (NULL);
+}
+
+#if ((1 << NBBY) - 1) < (MAXNAMELEN - 1)
+#error ncache_t name length representation is too small
+#endif
+
+void
+dnlc_reduce_cache(void *reduce_percent)
+{
+ if (dnlc_reduce_idle && (dnlc_nentries >= ncsize || reduce_percent)) {
+ dnlc_reduce_idle = 0;
+ if ((taskq_dispatch(system_taskq, do_dnlc_reduce_cache,
+ reduce_percent, TQ_NOSLEEP)) == 0)
+ dnlc_reduce_idle = 1;
+ }
+}
+
+/*
+ * Get a new name cache entry.
+ * If the dnlc_reduce_cache() taskq isn't keeping up with demand, or memory
+ * is short, then just return NULL. If we're over ncsize, then kick off a
+ * thread to free some in-use entries down to dnlc_nentries_low_water.
+ * Caller must initialise all fields except namlen.
+ * Component names are defined to be less than MAXNAMELEN
+ * which includes a null.
+ */
+static ncache_t *
+dnlc_get(uchar_t namlen)
+{
+ ncache_t *ncp;
+
+ if (dnlc_nentries > dnlc_max_nentries) {
+ dnlc_max_nentries_cnt++; /* keep a statistic */
+ return (NULL);
+ }
+ ncp = kmem_alloc(sizeof (ncache_t) + namlen, KM_NOSLEEP);
+ if (ncp == NULL) {
+ return (NULL);
+ }
+ ncp->namlen = namlen;
+ atomic_add_32(&dnlc_nentries, 1);
+ dnlc_reduce_cache(NULL);
+ return (ncp);
+}
+
+/*
+ * Taskq routine to free up name cache entries to reduce the
+ * cache size to the low water mark if "reduce_percent" is not provided.
+ * If "reduce_percent" is provided, reduce cache size by
+ * (ncsize_onepercent * reduce_percent).
+ */
+/*ARGSUSED*/
+static void
+do_dnlc_reduce_cache(void *reduce_percent)
+{
+ nc_hash_t *hp = dnlc_free_rotor, *start_hp = hp;
+ vnode_t *vp;
+ ncache_t *ncp;
+ int cnt;
+ uint_t low_water = dnlc_nentries_low_water;
+
+ if (reduce_percent) {
+ uint_t reduce_cnt;
+
+ /*
+ * Never try to reduce the current number
+ * of cache entries below 3% of ncsize.
+ */
+ if (dnlc_nentries <= ncsize_min_percent) {
+ dnlc_reduce_idle = 1;
+ return;
+ }
+ reduce_cnt = ncsize_onepercent *
+ (uint_t)(uintptr_t)reduce_percent;
+
+ if (reduce_cnt > dnlc_nentries ||
+ dnlc_nentries - reduce_cnt < ncsize_min_percent)
+ low_water = ncsize_min_percent;
+ else
+ low_water = dnlc_nentries - reduce_cnt;
+ }
+
+ do {
+ /*
+ * Find the first non empty hash queue without locking.
+ * Only look at each hash queue once to avoid an infinite loop.
+ */
+ do {
+ if (++hp == &nc_hash[nc_hashsz])
+ hp = nc_hash;
+ } while (hp->hash_next == (ncache_t *)hp && hp != start_hp);
+
+ /* return if all hash queues are empty. */
+ if (hp->hash_next == (ncache_t *)hp) {
+ dnlc_reduce_idle = 1;
+ return;
+ }
+
+ mutex_enter(&hp->hash_lock);
+ for (cnt = 0, ncp = hp->hash_prev; ncp != (ncache_t *)hp;
+ ncp = ncp->hash_prev, cnt++) {
+ vp = ncp->vp;
+ /*
+ * A name cache entry with a reference count
+ * of one is only referenced by the dnlc.
+ * Also negative cache entries are purged first.
+ */
+ if (!vn_has_cached_data(vp) &&
+ ((vp->v_count == 1) || (vp == DNLC_NO_VNODE))) {
+ ncs.ncs_pick_heur.value.ui64++;
+ goto found;
+ }
+ /*
+ * Remove from the end of the chain if the
+ * chain is too long
+ */
+ if (cnt > dnlc_long_chain) {
+ ncp = hp->hash_prev;
+ ncs.ncs_pick_last.value.ui64++;
+ vp = ncp->vp;
+ goto found;
+ }
+ }
+ /* check for race and continue */
+ if (hp->hash_next == (ncache_t *)hp) {
+ mutex_exit(&hp->hash_lock);
+ continue;
+ }
+
+ ncp = hp->hash_prev; /* pick the last one in the hash queue */
+ ncs.ncs_pick_last.value.ui64++;
+ vp = ncp->vp;
+found:
+ /*
+ * Remove from hash chain.
+ */
+ nc_rmhash(ncp);
+ mutex_exit(&hp->hash_lock);
+ VN_RELE(vp);
+ VN_RELE(ncp->dp);
+ dnlc_free(ncp);
+ } while (dnlc_nentries > low_water);
+
+ dnlc_free_rotor = hp;
+ dnlc_reduce_idle = 1;
+}
+
+SYSINIT(dnlc, SI_SUB_DRIVERS, SI_ORDER_ANY, dnlc_init, NULL);
+SYSUNINIT(dnlc, SI_SUB_DRIVERS, SI_ORDER_ANY, dnlc_fini, NULL);
diff --git a/sys/contrib/opensolaris/uts/common/fs/gfs.c b/sys/contrib/opensolaris/uts/common/fs/gfs.c
new file mode 100644
index 000000000000..7e6d6891fa55
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/gfs.c
@@ -0,0 +1,882 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Portions Copyright 2007 Shivakumar GN */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/dirent.h>
+#include <sys/kmem.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/uio.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/cred.h>
+#include <sys/kdb.h>
+
+#include <sys/gfs.h>
+
+/*
+ * Generic pseudo-filesystem routines.
+ *
+ * There are significant similarities between the implementation of certain file
+ * system entry points across different filesystems. While one could attempt to
+ * "choke up on the bat" and incorporate common functionality into a VOP
+ * preamble or postamble, such an approach is limited in the benefit it can
+ * provide. In this file we instead define a toolkit of routines which can be
+ * called from a filesystem (with in-kernel pseudo-filesystems being the focus
+ * of the exercise) in a more component-like fashion.
+ *
+ * There are three basic classes of routines:
+ *
+ * 1) Lowlevel support routines
+ *
+ * These routines are designed to play a support role for existing
+ * pseudo-filesystems (such as procfs). They simplify common tasks,
+ * without forcing the filesystem to hand over management to GFS. The
+ * routines covered are:
+ *
+ * gfs_readdir_init()
+ * gfs_readdir_emit()
+ * gfs_readdir_emitn()
+ * gfs_readdir_pred()
+ * gfs_readdir_fini()
+ * gfs_lookup_dot()
+ *
+ * 2) Complete GFS management
+ *
+ * These routines take a more active role in management of the
+ * pseudo-filesystem. They handle the relationship between vnode private
+ * data and VFS data, as well as the relationship between vnodes in the
+ * directory hierarchy.
+ *
+ * In order to use these interfaces, the first member of every private
+ * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control
+ * to GFS.
+ *
+ * gfs_file_create()
+ * gfs_dir_create()
+ * gfs_root_create()
+ *
+ * gfs_file_inactive()
+ * gfs_dir_inactive()
+ * gfs_dir_lookup()
+ * gfs_dir_readdir()
+ *
+ * gfs_vop_inactive()
+ * gfs_vop_lookup()
+ * gfs_vop_readdir()
+ * gfs_vop_map()
+ *
+ * 3) Single File pseudo-filesystems
+ *
+ * This routine creates a rooted file to be overlaid on top of another
+ * file in the physical filespace.
+ *
+ * Note that the parent is NULL (actually the vfs), but there is nothing
+ * technically keeping such a file from utilizing the "Complete GFS
+ * management" set of routines.
+ *
+ * gfs_root_create_file()
+ */
+
+/*
+ * Low level directory routines
+ *
+ * These routines provide some simple abstractions for reading directories.
+ * They are designed to be used by existing pseudo filesystems (namely procfs)
+ * that already have a complicated management infrastructure.
+ */
+
+/*
+ * gfs_readdir_init: initiate a generic readdir
+ * st - a pointer to an uninitialized gfs_readdir_state_t structure
+ * name_max - the directory's maximum file name length
+ * ureclen - the exported file-space record length (1 for non-legacy FSs)
+ * uiop - the uiop passed to readdir
+ * parent - the parent directory's inode
+ * self - this directory's inode
+ *
+ * Returns 0 or a non-zero errno.
+ *
+ * Typical VOP_READDIR usage of gfs_readdir_*:
+ *
+ * if ((error = gfs_readdir_init(...)) != 0)
+ * return (error);
+ * eof = 0;
+ * while ((error = gfs_readdir_pred(..., &voffset)) != 0) {
+ * if (!consumer_entry_at(voffset))
+ * voffset = consumer_next_entry(voffset);
+ * if (consumer_eof(voffset)) {
+ * eof = 1
+ * break;
+ * }
+ * if ((error = gfs_readdir_emit(..., voffset,
+ * consumer_ino(voffset), consumer_name(voffset))) != 0)
+ * break;
+ * }
+ * return (gfs_readdir_fini(..., error, eofp, eof));
+ *
+ * As you can see, a zero result from gfs_readdir_pred() or
+ * gfs_readdir_emit() indicates that processing should continue,
+ * whereas a non-zero result indicates that the loop should terminate.
+ * Most consumers need do nothing more than let gfs_readdir_fini()
+ * determine what the cause of failure was and return the appropriate
+ * value.
+ */
+int
+gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
+ uio_t *uiop, ino64_t parent, ino64_t self)
+{
+ if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
+ (uiop->uio_loffset % ureclen) != 0)
+ return (EINVAL);
+
+ st->grd_ureclen = ureclen;
+ st->grd_oresid = uiop->uio_resid;
+ st->grd_namlen = name_max;
+ st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP);
+ st->grd_parent = parent;
+ st->grd_self = self;
+
+ return (0);
+}
+
+/*
+ * gfs_readdir_emit_int: internal routine to emit directory entry
+ *
+ * st - the current readdir state, which must have d_ino and d_name
+ * set
+ * uiop - caller-supplied uio pointer
+ * next - the offset of the next entry
+ */
+static int
+gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
+ int *ncookies, u_long **cookies)
+{
+ int reclen, namlen;
+
+ namlen = strlen(st->grd_dirent->d_name);
+ reclen = DIRENT64_RECLEN(namlen);
+
+ if (reclen > uiop->uio_resid) {
+ /*
+ * Error if no entries were returned yet
+ */
+ if (uiop->uio_resid == st->grd_oresid)
+ return (EINVAL);
+ return (-1);
+ }
+
+ st->grd_dirent->d_reclen = (ushort_t)reclen;
+ st->grd_dirent->d_namlen = namlen;
+
+ if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
+ return (EFAULT);
+
+ uiop->uio_loffset = next;
+ if (*cookies != NULL) {
+ **cookies = next;
+ (*cookies)++;
+ (*ncookies)--;
+ KASSERT(*ncookies >= 0, ("ncookies=%d", *ncookies));
+ }
+
+ return (0);
+}
+
+/*
+ * gfs_readdir_emit: emit a directory entry
+ * voff - the virtual offset (obtained from gfs_readdir_pred)
+ * ino - the entry's inode
+ * name - the entry's name
+ *
+ * Returns a 0 on success, a non-zero errno on failure, or -1 if the
+ * readdir loop should terminate. A non-zero result (either errno or
+ * -1) from this function is typically passed directly to
+ * gfs_readdir_fini().
+ */
+int
+gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
+ ino64_t ino, const char *name, int *ncookies, u_long **cookies)
+{
+ offset_t off = (voff + 2) * st->grd_ureclen;
+
+ st->grd_dirent->d_ino = ino;
+ (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen);
+
+ /*
+ * Inter-entry offsets are invalid, so we assume a record size of
+ * grd_ureclen and explicitly set the offset appropriately.
+ */
+ return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen, ncookies,
+ cookies));
+}
+
+/*
+ * gfs_readdir_pred: readdir loop predicate
+ * voffp - a pointer in which the next virtual offset should be stored
+ *
+ * Returns a 0 on success, a non-zero errno on failure, or -1 if the
+ * readdir loop should terminate. A non-zero result (either errno or
+ * -1) from this function is typically passed directly to
+ * gfs_readdir_fini().
+ */
+int
+gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp,
+ int *ncookies, u_long **cookies)
+{
+ offset_t off, voff;
+ int error;
+
+top:
+ if (uiop->uio_resid <= 0)
+ return (-1);
+
+ off = uiop->uio_loffset / st->grd_ureclen;
+ voff = off - 2;
+ if (off == 0) {
+ if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
+ ".", ncookies, cookies)) == 0)
+ goto top;
+ } else if (off == 1) {
+ if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
+ "..", ncookies, cookies)) == 0)
+ goto top;
+ } else {
+ *voffp = voff;
+ return (0);
+ }
+
+ return (error);
+}
+
+/*
+ * gfs_readdir_fini: generic readdir cleanup
+ * error - if positive, an error to return
+ * eofp - the eofp passed to readdir
+ * eof - the eof value
+ *
+ * Returns a 0 on success, a non-zero errno on failure. This result
+ * should be returned from readdir.
+ */
+int
+gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
+{
+ kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen));
+ if (error > 0)
+ return (error);
+ if (eofp)
+ *eofp = eof;
+ return (0);
+}
+
+/*
+ * gfs_lookup_dot
+ *
+ * Performs a basic check for "." and ".." directory entries.
+ */
+int
+gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
+{
+ if (*nm == '\0' || strcmp(nm, ".") == 0) {
+ VN_HOLD(dvp);
+ *vpp = dvp;
+ return (0);
+ } else if (strcmp(nm, "..") == 0) {
+ if (pvp == NULL) {
+ ASSERT(dvp->v_flag & VROOT);
+ VN_HOLD(dvp);
+ *vpp = dvp;
+ } else {
+ VN_HOLD(pvp);
+ *vpp = pvp;
+ }
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ return (0);
+ }
+
+ return (-1);
+}
+
+/*
+ * gfs_file_create(): create a new GFS file
+ *
+ * size - size of private data structure (v_data)
+ * pvp - parent vnode (GFS directory)
+ * ops - vnode operations vector
+ *
+ * In order to use this interface, the parent vnode must have been created by
+ * gfs_dir_create(), and the private data stored in v_data must have a
+ * 'gfs_file_t' as its first field.
+ *
+ * Given these constraints, this routine will automatically:
+ *
+ * - Allocate v_data for the vnode
+ * - Initialize necessary fields in the vnode
+ * - Hold the parent
+ */
+vnode_t *
+gfs_file_create(size_t size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops)
+{
+ gfs_file_t *fp;
+ vnode_t *vp;
+ int error;
+
+ /*
+ * Allocate vnode and internal data structure
+ */
+ fp = kmem_zalloc(size, KM_SLEEP);
+ error = getnewvnode("zfs", vfsp, ops, &vp);
+ ASSERT(error == 0);
+ vp->v_data = (caddr_t)fp;
+
+ /*
+ * Set up various pointers
+ */
+ fp->gfs_vnode = vp;
+ fp->gfs_parent = pvp;
+ fp->gfs_size = size;
+ fp->gfs_type = GFS_FILE;
+
+ error = insmntque(vp, vfsp);
+ KASSERT(error == 0, ("insmntque() failed: error %d", error));
+
+ /*
+ * Initialize vnode and hold parent.
+ */
+ if (pvp)
+ VN_HOLD(pvp);
+
+ return (vp);
+}
+
+/*
+ * gfs_dir_create: creates a new directory in the parent
+ *
+ * size - size of private data structure (v_data)
+ * pvp - parent vnode (GFS directory)
+ * ops - vnode operations vector
+ * entries - NULL-terminated list of static entries (if any)
+ * maxlen - maximum length of a directory entry
+ * readdir_cb - readdir callback (see gfs_dir_readdir)
+ * inode_cb - inode callback (see gfs_dir_readdir)
+ * lookup_cb - lookup callback (see gfs_dir_lookup)
+ *
+ * In order to use this function, the first member of the private vnode
+ * structure (v_data) must be a gfs_dir_t. For each directory, there are
+ * static entries, defined when the structure is initialized, and dynamic
+ * entries, retrieved through callbacks.
+ *
+ * If a directory has static entries, then it must supply an inode callback,
+ * which will compute the inode number based on the parent and the index.
+ * For a directory with dynamic entries, the caller must supply a readdir
+ * callback and a lookup callback. If a static lookup fails, we fall back to
+ * the supplied lookup callback, if any.
+ *
+ * This function also performs the same initialization as gfs_file_create().
+ */
+vnode_t *
+gfs_dir_create(size_t struct_size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops,
+ gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
+ gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
+{
+ vnode_t *vp;
+ gfs_dir_t *dp;
+ gfs_dirent_t *de;
+
+ vp = gfs_file_create(struct_size, pvp, vfsp, ops);
+ vp->v_type = VDIR;
+
+ dp = vp->v_data;
+ dp->gfsd_file.gfs_type = GFS_DIR;
+ dp->gfsd_maxlen = maxlen;
+
+ if (entries != NULL) {
+ for (de = entries; de->gfse_name != NULL; de++)
+ dp->gfsd_nstatic++;
+
+ dp->gfsd_static = kmem_alloc(
+ dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP);
+ bcopy(entries, dp->gfsd_static,
+ dp->gfsd_nstatic * sizeof (gfs_dirent_t));
+ }
+
+ dp->gfsd_readdir = readdir_cb;
+ dp->gfsd_lookup = lookup_cb;
+ dp->gfsd_inode = inode_cb;
+
+ mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (vp);
+}
+
+/*
+ * gfs_root_create(): create a root vnode for a GFS filesystem
+ *
+ * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The
+ * only difference is that it takes a vfs_t instead of a vnode_t as its parent.
+ */
+vnode_t *
+gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
+ gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
+ gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
+{
+ vnode_t *vp;
+
+ VFS_HOLD(vfsp);
+ vp = gfs_dir_create(size, NULL, vfsp, ops, entries, inode_cb,
+ maxlen, readdir_cb, lookup_cb);
+ /* Manually set the inode */
+ ((gfs_file_t *)vp->v_data)->gfs_ino = ino;
+ vp->v_flag |= VROOT;
+
+ return (vp);
+}
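+
+/*
+ * Illustrative sketch of how a pseudo-filesystem might use the above
+ * (the myfs_* names and MYFS_ROOT_INO are hypothetical, not defined here):
+ *
+ *	static gfs_dirent_t myfs_entries[] = {
+ *		{ "status", myfs_status_ctor, GFS_CACHE_VNODE },
+ *		{ "ctl", myfs_ctl_ctor, 0 },
+ *		{ NULL }
+ *	};
+ *
+ *	vp = gfs_root_create(sizeof (myfs_dir_t), vfsp, myfs_dir_ops,
+ *	    MYFS_ROOT_INO, myfs_entries, myfs_inode_cb, MAXNAMELEN,
+ *	    NULL, NULL);
+ *
+ * gfs_dir_lookup() then resolves the static names via their constructors,
+ * and gfs_dir_readdir() lists them using the supplied inode callback.
+ */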
+
+/*
+ * gfs_file_inactive()
+ *
+ * Called from the VOP_INACTIVE() routine. If necessary, this routine will
+ * remove the given vnode from the parent directory and clean up any references
+ * in the VFS layer.
+ *
+ * If the vnode was not removed (due to a race with vget), then NULL is
+ * returned. Otherwise, a pointer to the private data is returned.
+ */
+void *
+gfs_file_inactive(vnode_t *vp)
+{
+ int i;
+ gfs_dirent_t *ge = NULL;
+ gfs_file_t *fp = vp->v_data;
+ gfs_dir_t *dp = NULL;
+ void *data;
+
+ if (fp->gfs_parent == NULL)
+ goto found;
+
+ dp = fp->gfs_parent->v_data;
+
+ /*
+ * First, see if this vnode is cached in the parent.
+ */
+ gfs_dir_lock(dp);
+
+ /*
+ * Find it in the set of static entries.
+ */
+ for (i = 0; i < dp->gfsd_nstatic; i++) {
+ ge = &dp->gfsd_static[i];
+
+ if (ge->gfse_vnode == vp)
+ goto found;
+ }
+
+ /*
+ * If 'ge' is NULL, then it is a dynamic entry.
+ */
+ ge = NULL;
+
+found:
+ VI_LOCK(vp);
+ ASSERT(vp->v_count < 2);
+ /*
+ * Really remove this vnode
+ */
+ data = vp->v_data;
+ if (ge != NULL) {
+ /*
+ * If this was a statically cached entry, simply set the
+ * cached vnode to NULL.
+ */
+ ge->gfse_vnode = NULL;
+ }
+ if (vp->v_count == 1) {
+ vp->v_usecount--;
+ vdropl(vp);
+ } else {
+ VI_UNLOCK(vp);
+ }
+
+ /*
+ * Free vnode and release parent
+ */
+ if (fp->gfs_parent) {
+ gfs_dir_unlock(dp);
+ VI_LOCK(fp->gfs_parent);
+ fp->gfs_parent->v_usecount--;
+ VI_UNLOCK(fp->gfs_parent);
+ } else {
+ ASSERT(vp->v_vfsp != NULL);
+ VFS_RELE(vp->v_vfsp);
+ }
+
+ return (data);
+}
+
+/*
+ * gfs_dir_inactive()
+ *
+ * Same as above, but for directories.
+ */
+void *
+gfs_dir_inactive(vnode_t *vp)
+{
+ gfs_dir_t *dp;
+
+ ASSERT(vp->v_type == VDIR);
+
+ if ((dp = gfs_file_inactive(vp)) != NULL) {
+ mutex_destroy(&dp->gfsd_lock);
+ if (dp->gfsd_nstatic)
+ kmem_free(dp->gfsd_static,
+ dp->gfsd_nstatic * sizeof (gfs_dirent_t));
+ }
+
+ return (dp);
+}
+
+/*
+ * gfs_dir_lookup()
+ *
+ * Looks up the given name in the directory and returns the corresponding vnode,
+ * if found.
+ *
+ * First, we search statically defined entries, if any. If a match is found,
+ * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the
+ * existing vnode. Otherwise, we call the static entry's callback routine,
+ * caching the result if necessary.
+ *
+ * If no static entry is found, we invoke the lookup callback, if any. The
+ * arguments to this callback are:
+ *
+ * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp);
+ *
+ * pvp - parent vnode
+ * nm - name of entry
+ * vpp - pointer to resulting vnode
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+int
+gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
+{
+ int i;
+ gfs_dirent_t *ge;
+ vnode_t *vp;
+ gfs_dir_t *dp = dvp->v_data;
+ int ret = 0;
+
+ ASSERT(dvp->v_type == VDIR);
+
+ if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
+ return (0);
+
+ gfs_dir_lock(dp);
+
+ /*
+ * Search static entries.
+ */
+ for (i = 0; i < dp->gfsd_nstatic; i++) {
+ ge = &dp->gfsd_static[i];
+
+ if (strcmp(ge->gfse_name, nm) == 0) {
+ if (ge->gfse_vnode) {
+ ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
+ vp = ge->gfse_vnode;
+ VN_HOLD(vp);
+ goto out;
+ }
+
+ /*
+ * We drop the directory lock, as the constructor will
+ * need to do KM_SLEEP allocations. If we return from
+ * the constructor only to find that a parallel
+ * operation has completed, and GFS_CACHE_VNODE is set
+ * for this entry, we discard the result in favor of the
+ * cached vnode.
+ */
+ gfs_dir_unlock(dp);
+ vp = ge->gfse_ctor(dvp);
+ gfs_dir_lock(dp);
+
+ ((gfs_file_t *)vp->v_data)->gfs_index = i;
+
+ /* Set the inode according to the callback. */
+ ((gfs_file_t *)vp->v_data)->gfs_ino =
+ dp->gfsd_inode(dvp, i);
+
+ if (ge->gfse_flags & GFS_CACHE_VNODE) {
+ if (ge->gfse_vnode == NULL) {
+ ge->gfse_vnode = vp;
+ } else {
+ /*
+ * A parallel constructor beat us to it;
+ * return existing vnode. We have to be
+ * careful because we can't release the
+ * current vnode while holding the
+ * directory lock; its inactive routine
+ * will try to lock this directory.
+ */
+ vnode_t *oldvp = vp;
+ vp = ge->gfse_vnode;
+ VN_HOLD(vp);
+
+ gfs_dir_unlock(dp);
+ VN_RELE(oldvp);
+ gfs_dir_lock(dp);
+ }
+ }
+
+ goto out;
+ }
+ }
+
+ /*
+ * See if there is a dynamic constructor.
+ */
+ if (dp->gfsd_lookup) {
+ ino64_t ino;
+ gfs_file_t *fp;
+
+ /*
+ * Once again, drop the directory lock, as the lookup routine
+ * will need to allocate memory, or otherwise deadlock on this
+ * directory.
+ */
+ gfs_dir_unlock(dp);
+ ret = dp->gfsd_lookup(dvp, nm, &vp, &ino);
+ gfs_dir_lock(dp);
+ if (ret != 0)
+ goto out;
+
+ fp = (gfs_file_t *)vp->v_data;
+ fp->gfs_index = -1;
+ fp->gfs_ino = ino;
+ } else {
+ /*
+ * No static entry found, and there is no lookup callback, so
+ * return ENOENT.
+ */
+ ret = ENOENT;
+ }
+
+out:
+ gfs_dir_unlock(dp);
+
+ if (ret == 0)
+ *vpp = vp;
+ else
+ *vpp = NULL;
+
+ return (ret);
+}
+
+/*
+ * gfs_dir_readdir: does a readdir() on the given directory
+ *
+ * dvp - directory vnode
+ * uiop - uio structure
+ * eofp - eof pointer
+ * data - arbitrary data passed to readdir callback
+ *
+ * This routine does all the readdir() dirty work. Even so, the caller must
+ * supply two callbacks in order to get full compatibility.
+ *
+ * If the directory contains static entries, an inode callback must be
+ * specified. This avoids having to create every vnode and call VOP_GETATTR()
+ * when reading the directory. This function has the following arguments:
+ *
+ * ino_t gfs_inode_cb(vnode_t *vp, int index);
+ *
+ * vp - vnode for the directory
+ * index - index in original gfs_dirent_t array
+ *
+ * Returns the inode number for the given entry.
+ *
+ * For directories with dynamic entries, a readdir callback must be provided.
+ * This is significantly more complex, thanks to the particulars of
+ * VOP_READDIR().
+ *
+ * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
+ * offset_t *off, offset_t *nextoff, void *data)
+ *
+ * vp - directory vnode
+ * dp - directory entry, sized according to maxlen given to
+ * gfs_dir_create(). callback must fill in d_name and
+ * d_ino.
+ * eofp - callback must set to 1 when EOF has been reached
+ * off - on entry, the last offset read from the directory. Callback
+ * must set to the offset of the current entry, typically left
+ * untouched.
+ * nextoff - callback must set to offset of next entry. Typically
+ * (off + 1)
+ * data - caller-supplied data
+ *
+ * Return 0 on success, or error on failure.
+ */
+int
+gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
+ u_long **cookies, void *data)
+{
+ gfs_readdir_state_t gstate;
+ int error, eof = 0;
+ ino64_t ino, pino;
+ offset_t off, next;
+ gfs_dir_t *dp = dvp->v_data;
+
+ ino = dp->gfsd_file.gfs_ino;
+
+ if (dp->gfsd_file.gfs_parent == NULL)
+ pino = ino; /* root of filesystem */
+ else
+ pino = ((gfs_file_t *)
+ (dp->gfsd_file.gfs_parent->v_data))->gfs_ino;
+
+ if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
+ pino, ino)) != 0)
+ return (error);
+
+ while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies,
+ cookies)) == 0 && !eof) {
+
+ if (off >= 0 && off < dp->gfsd_nstatic) {
+ ino = dp->gfsd_inode(dvp, off);
+
+ if ((error = gfs_readdir_emit(&gstate, uiop,
+ off, ino, dp->gfsd_static[off].gfse_name, ncookies,
+ cookies)) != 0)
+ break;
+
+ } else if (dp->gfsd_readdir) {
+ off -= dp->gfsd_nstatic;
+
+ if ((error = dp->gfsd_readdir(dvp,
+ gstate.grd_dirent, &eof, &off, &next,
+ data)) != 0 || eof)
+ break;
+
+ off += dp->gfsd_nstatic + 2;
+ next += dp->gfsd_nstatic + 2;
+
+ if ((error = gfs_readdir_emit_int(&gstate, uiop,
+ next, ncookies, cookies)) != 0)
+ break;
+ } else {
+ /*
+ * Offset is beyond the end of the static entries, and
+ * we have no dynamic entries. Set EOF.
+ */
+ eof = 1;
+ }
+ }
+
+ return (gfs_readdir_fini(&gstate, error, eofp, eof));
+}
+
+/*
+ * gfs_vop_readdir: VOP_READDIR() entry point
+ *
+ * For use directly in vnode ops table. Given a GFS directory, calls
+ * gfs_dir_readdir() as necessary.
+ */
+/* ARGSUSED */
+int
+gfs_vop_readdir(ap)
+ struct vop_readdir_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *ncookies;
+ u_long **a_cookies;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ uio_t *uiop = ap->a_uio;
+ int *eofp = ap->a_eofflag;
+ int ncookies = 0;
+ u_long *cookies = NULL;
+ int error;
+
+ if (ap->a_ncookies) {
+ /*
+ * Minimum entry size is dirent size and 1 byte for a file name.
+ */
+ ncookies = uiop->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
+ cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK);
+ *ap->a_cookies = cookies;
+ *ap->a_ncookies = ncookies;
+ }
+
+ error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL);
+
+ if (error == 0) {
+ /* Subtract unused cookies */
+ if (ap->a_ncookies)
+ *ap->a_ncookies -= ncookies;
+ } else if (ap->a_ncookies) {
+ free(*ap->a_cookies, M_TEMP);
+ *ap->a_cookies = NULL;
+ *ap->a_ncookies = 0;
+ }
+
+ return (error);
+}
+
+/*
+ * gfs_vop_inactive: VOP_INACTIVE() entry point
+ *
+ * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or
+ * gfs_dir_inactive() as necessary, and kmem_free()s associated private data.
+ */
+/* ARGSUSED */
+int
+gfs_vop_inactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ gfs_file_t *fp = vp->v_data;
+ void *data;
+
+ if (fp->gfs_type == GFS_DIR)
+ data = gfs_dir_inactive(vp);
+ else
+ data = gfs_file_inactive(vp);
+
+ if (data != NULL)
+ kmem_free(data, fp->gfs_size);
+ vp->v_data = NULL;
+ return (0);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c
new file mode 100644
index 000000000000..59a376a1347f
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -0,0 +1,2829 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * DVA-based Adjustable Replacement Cache
+ *
+ * While much of the theory of operation used here is
+ * based on the self-tuning, low overhead replacement cache
+ * presented by Megiddo and Modha at FAST 2003, there are some
+ * significant differences:
+ *
+ * 1. The Megiddo and Modha model assumes any page is evictable.
+ * Pages in its cache cannot be "locked" into memory. This makes
+ * the eviction algorithm simple: evict the last page in the list.
+ * This also makes the performance characteristics easy to reason
+ * about. Our cache is not so simple. At any given moment, some
+ * subset of the blocks in the cache are un-evictable because we
+ * have handed out a reference to them. Blocks are only evictable
+ * when there are no external references active. This makes
+ * eviction far more problematic: we choose to evict the evictable
+ * blocks that are the "lowest" in the list.
+ *
+ * There are times when it is not possible to evict the requested
+ * space. In these circumstances we are unable to adjust the cache
+ * size. To prevent the cache growing unbounded at these times we
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
+ *
+ * 2. The Megiddo and Modha model assumes a fixed cache size.
+ * Pages are evicted when the cache is full and there is a cache
+ * miss. Our model has a variable sized cache. It grows with
+ * high use, but also tries to react to memory pressure from the
+ * operating system: decreasing its size when system memory is
+ * tight.
+ *
+ * 3. The Megiddo and Modha model assumes a fixed page size. All
+ * elements of the cache are therefore exactly the same size. So
+ * when adjusting the cache size following a cache miss, it's simply
+ * a matter of choosing a single page to evict. In our model, we
+ * have variable sized cache blocks (ranging from 512 bytes to
+ * 128K bytes). We therefore choose a set of blocks to evict to make
+ * space for a cache miss that approximates as closely as possible
+ * the space used by the new block.
+ *
+ * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
+ * by N. Megiddo & D. Modha, FAST 2003
+ */
+
+/*
+ * The locking model:
+ *
+ * A new reference to a cache buffer can be obtained in two
+ * ways: 1) via a hash table lookup using the DVA as a key,
+ * or 2) via one of the ARC lists. The arc_read() inerface
+ * uses method 1, while the internal arc algorithms for
+ * adjusting the cache use method 2. We therefor provide two
+ * types of locks: 1) the hash table lock array, and 2) the
+ * arc list locks.
+ *
+ * Buffers do not have their own mutexes; rather, they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
+ *
+ * buf_hash_find() returns the appropriate mutex (held) when it
+ * locates the requested buffer in the hash table. It returns
+ * NULL for the mutex if the buffer was not in the table.
+ *
+ * buf_hash_remove() expects the appropriate hash mutex to be
+ * already held before it is invoked.
+ *
+ * Each arc state also has a mutex which is used to protect the
+ * buffer list associated with the state. When attempting to
+ * obtain a hash table lock while holding an arc list lock, you
+ * must use mutex_tryenter() to avoid deadlock. Also note that
+ * the active state mutex must be held before the ghost state mutex.
+ *
+ * Arc buffers may have an associated eviction callback function.
+ * This function will be invoked prior to removing the buffer (e.g.
+ * in arc_do_user_evicts()). Note however that the data associated
+ * with the buffer may be evicted prior to the callback. The callback
+ * must be made with *no locks held* (to prevent deadlock). Additionally,
+ * the users of callbacks must ensure that their private data is
+ * protected from simultaneous callbacks from arc_buf_evict()
+ * and arc_do_user_evicts().
+ *
+ * Note that the majority of the performance stats are manipulated
+ * with atomic operations.
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/refcount.h>
+#ifdef _KERNEL
+#include <sys/dnlc.h>
+#endif
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <sys/sdt.h>
+
+#define ARC_FREE_AT_ONCE 4194304
+
+static kmutex_t arc_reclaim_thr_lock;
+static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
+static uint8_t arc_thread_exit;
+
+#define ARC_REDUCE_DNLC_PERCENT 3
+uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+
+typedef enum arc_reclaim_strategy {
+ ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
+ ARC_RECLAIM_CONS /* Conservative reclaim strategy */
+} arc_reclaim_strategy_t;
+
+/* number of seconds before growing cache again */
+static int arc_grow_retry = 60;
+
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+static int arc_min_prefetch_lifespan;
+
+static int arc_dead;
+
+/*
+ * These tunables are for performance analysis.
+ */
+uint64_t zfs_arc_max;
+uint64_t zfs_arc_min;
+
+/*
+ * Note that buffers can be in one of 5 states:
+ * ARC_anon - anonymous (discussed below)
+ * ARC_mru - recently used, currently cached
+ *	ARC_mru_ghost	- recently used, no longer in cache
+ * ARC_mfu - frequently used, currently cached
+ * ARC_mfu_ghost - frequently used, no longer in cache
+ * When there are no active references to a buffer, it is
+ * linked onto one of the lists in the arc. These are the
+ * only buffers that can be evicted or deleted.
+ *
+ * Anonymous buffers are buffers that are not associated with
+ * a DVA. These are buffers that hold dirty block copies
+ * before they are written to stable storage. By definition,
+ * they are "ref'd" and are considered part of arc_mru
+ * that cannot be freed. Generally, they will acquire a DVA
+ * as they are written and migrate onto the arc_mru list.
+ */
+
+typedef struct arc_state {
+ list_t arcs_list; /* linked list of evictable buffer in state */
+ uint64_t arcs_lsize; /* total size of buffers in the linked list */
+ uint64_t arcs_size; /* total size of all buffers in this state */
+ kmutex_t arcs_mtx;
+} arc_state_t;
+
+/* The 5 states: */
+static arc_state_t ARC_anon;
+static arc_state_t ARC_mru;
+static arc_state_t ARC_mru_ghost;
+static arc_state_t ARC_mfu;
+static arc_state_t ARC_mfu_ghost;
+
+typedef struct arc_stats {
+ kstat_named_t arcstat_hits;
+ kstat_named_t arcstat_misses;
+ kstat_named_t arcstat_demand_data_hits;
+ kstat_named_t arcstat_demand_data_misses;
+ kstat_named_t arcstat_demand_metadata_hits;
+ kstat_named_t arcstat_demand_metadata_misses;
+ kstat_named_t arcstat_prefetch_data_hits;
+ kstat_named_t arcstat_prefetch_data_misses;
+ kstat_named_t arcstat_prefetch_metadata_hits;
+ kstat_named_t arcstat_prefetch_metadata_misses;
+ kstat_named_t arcstat_mru_hits;
+ kstat_named_t arcstat_mru_ghost_hits;
+ kstat_named_t arcstat_mfu_hits;
+ kstat_named_t arcstat_mfu_ghost_hits;
+ kstat_named_t arcstat_deleted;
+ kstat_named_t arcstat_recycle_miss;
+ kstat_named_t arcstat_mutex_miss;
+ kstat_named_t arcstat_evict_skip;
+ kstat_named_t arcstat_hash_elements;
+ kstat_named_t arcstat_hash_elements_max;
+ kstat_named_t arcstat_hash_collisions;
+ kstat_named_t arcstat_hash_chains;
+ kstat_named_t arcstat_hash_chain_max;
+ kstat_named_t arcstat_p;
+ kstat_named_t arcstat_c;
+ kstat_named_t arcstat_c_min;
+ kstat_named_t arcstat_c_max;
+ kstat_named_t arcstat_size;
+} arc_stats_t;
+
+static arc_stats_t arc_stats = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+ { "demand_data_hits", KSTAT_DATA_UINT64 },
+ { "demand_data_misses", KSTAT_DATA_UINT64 },
+ { "demand_metadata_hits", KSTAT_DATA_UINT64 },
+ { "demand_metadata_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_data_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_data_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
+ { "mru_hits", KSTAT_DATA_UINT64 },
+ { "mru_ghost_hits", KSTAT_DATA_UINT64 },
+ { "mfu_hits", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
+ { "deleted", KSTAT_DATA_UINT64 },
+ { "recycle_miss", KSTAT_DATA_UINT64 },
+ { "mutex_miss", KSTAT_DATA_UINT64 },
+ { "evict_skip", KSTAT_DATA_UINT64 },
+ { "hash_elements", KSTAT_DATA_UINT64 },
+ { "hash_elements_max", KSTAT_DATA_UINT64 },
+ { "hash_collisions", KSTAT_DATA_UINT64 },
+ { "hash_chains", KSTAT_DATA_UINT64 },
+ { "hash_chain_max", KSTAT_DATA_UINT64 },
+ { "p", KSTAT_DATA_UINT64 },
+ { "c", KSTAT_DATA_UINT64 },
+ { "c_min", KSTAT_DATA_UINT64 },
+ { "c_max", KSTAT_DATA_UINT64 },
+ { "size", KSTAT_DATA_UINT64 }
+};
+
+#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
+
+#define ARCSTAT_INCR(stat, val) \
+ atomic_add_64(&arc_stats.stat.value.ui64, (val));
+
+#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
+#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
+
+#define ARCSTAT_MAX(stat, val) { \
+ uint64_t m; \
+ while ((val) > (m = arc_stats.stat.value.ui64) && \
+ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
+ continue; \
+}
+
+#define ARCSTAT_MAXSTAT(stat) \
+ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
+
+/*
+ * We define a macro to allow ARC hits/misses to be easily broken down by
+ * two separate conditions, giving a total of four different subtypes for
+ * each of hits and misses (so eight statistics total).
+ */
+#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
+ if (cond1) { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
+ } \
+ } else { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
+ } \
+ }
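+
+/*
+ * For example, the hit path in arc_buf_add_ref() and arc_read() invokes
+ *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ *	    data, metadata, hits);
+ * which bumps arcstat_demand_data_hits for a non-prefetch hit on data,
+ * arcstat_prefetch_metadata_hits for a prefetched metadata hit, and so on.
+ */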
+
+kstat_t *arc_ksp;
+static arc_state_t *arc_anon;
+static arc_state_t *arc_mru;
+static arc_state_t *arc_mru_ghost;
+static arc_state_t *arc_mfu;
+static arc_state_t *arc_mfu_ghost;
+
+/*
+ * There are several ARC variables that are critical to export as kstats --
+ * but we don't want to have to grovel around in the kstat whenever we wish to
+ * manipulate them. For these variables, we therefore define them to be in
+ * terms of the statistic variable. This assures that we are not introducing
+ * the possibility of inconsistency by having shadow copies of the variables,
+ * while still allowing the code to be readable.
+ */
+#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
+#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
+#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
+#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
+#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
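+
+/*
+ * For example, arc_size above is simply arc_stats.arcstat_size.value.ui64,
+ * so reading or updating arc_size touches the exported kstat directly and
+ * no separate copy needs to be kept in sync.
+ */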
+
+static int arc_no_grow; /* Don't try to grow cache size */
+static uint64_t arc_tempreserve;
+
+typedef struct arc_callback arc_callback_t;
+
+struct arc_callback {
+ void *acb_private;
+ arc_done_func_t *acb_done;
+ arc_byteswap_func_t *acb_byteswap;
+ arc_buf_t *acb_buf;
+ zio_t *acb_zio_dummy;
+ arc_callback_t *acb_next;
+};
+
+typedef struct arc_write_callback arc_write_callback_t;
+
+struct arc_write_callback {
+ void *awcb_private;
+ arc_done_func_t *awcb_ready;
+ arc_done_func_t *awcb_done;
+ arc_buf_t *awcb_buf;
+};
+
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+ uint64_t b_cksum0;
+
+ kmutex_t b_freeze_lock;
+ zio_cksum_t *b_freeze_cksum;
+
+ arc_buf_hdr_t *b_hash_next;
+ arc_buf_t *b_buf;
+ uint32_t b_flags;
+ uint32_t b_datacnt;
+
+ arc_callback_t *b_acb;
+ kcondvar_t b_cv;
+
+ /* immutable */
+ arc_buf_contents_t b_type;
+ uint64_t b_size;
+ spa_t *b_spa;
+
+ /* protected by arc state mutex */
+ arc_state_t *b_state;
+ list_node_t b_arc_node;
+
+ /* updated atomically */
+ clock_t b_arc_access;
+
+ /* self protecting */
+ refcount_t b_refcnt;
+};
+
+static arc_buf_t *arc_eviction_list;
+static kmutex_t arc_eviction_mtx;
+static arc_buf_hdr_t arc_eviction_hdr;
+static void arc_get_data_buf(arc_buf_t *buf);
+static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
+
+#define GHOST_STATE(state) \
+ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost)
+
+/*
+ * Private ARC flags.  These are flags internal to the ARC that will show up
+ * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
+ * be passed in as arc_flags in things like arc_read. However, these flags
+ * should never be passed and should only be set by ARC code. When adding new
+ * public flags, make sure not to smash the private ones.
+ */
+
+#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
+#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
+#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
+#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
+#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
+#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
+
+#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
+#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
+#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
+#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
+#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
+
+/*
+ * Hash table routines
+ */
+
+#define HT_LOCK_PAD 128
+
+struct ht_lock {
+ kmutex_t ht_lock;
+#ifdef _KERNEL
+ unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+};
+
+#define BUF_LOCKS 256
+typedef struct buf_hash_table {
+ uint64_t ht_mask;
+ arc_buf_hdr_t **ht_table;
+ struct ht_lock ht_locks[BUF_LOCKS];
+} buf_hash_table_t;
+
+static buf_hash_table_t buf_hash_table;
+
+#define BUF_HASH_INDEX(spa, dva, birth) \
+ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
+#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
+#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define HDR_LOCK(buf) \
+ (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+
+uint64_t zfs_crc64_table[256];
+
+static uint64_t
+buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
+{
+ uintptr_t spav = (uintptr_t)spa;
+ uint8_t *vdva = (uint8_t *)dva;
+ uint64_t crc = -1ULL;
+ int i;
+
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+ for (i = 0; i < sizeof (dva_t); i++)
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
+
+ crc ^= (spav>>8) ^ birth;
+
+ return (crc);
+}
+
+#define BUF_EMPTY(buf) \
+ ((buf)->b_dva.dva_word[0] == 0 && \
+ (buf)->b_dva.dva_word[1] == 0 && \
+ (buf)->b_birth == 0)
+
+#define BUF_EQUAL(spa, dva, birth, buf) \
+ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
+ ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
+ ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+
+static arc_buf_hdr_t *
+buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *buf;
+
+ mutex_enter(hash_lock);
+ for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
+ buf = buf->b_hash_next) {
+ if (BUF_EQUAL(spa, dva, birth, buf)) {
+ *lockp = hash_lock;
+ return (buf);
+ }
+ }
+ mutex_exit(hash_lock);
+ *lockp = NULL;
+ return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table.  If there is already an element
+ * equal to the new one in the hash table, then the existing element will
+ * be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
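+ *
+ * A typical caller (see arc_read() below) inserts a freshly allocated
+ * header and, if an existing header is returned instead, drops its own
+ * copy and restarts the lookup, since another thread won the race.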
+ */
+static arc_buf_hdr_t *
+buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *fbuf;
+ uint32_t i;
+
+ ASSERT(!HDR_IN_HASH_TABLE(buf));
+ *lockp = hash_lock;
+ mutex_enter(hash_lock);
+ for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
+ fbuf = fbuf->b_hash_next, i++) {
+ if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
+ return (fbuf);
+ }
+
+ buf->b_hash_next = buf_hash_table.ht_table[idx];
+ buf_hash_table.ht_table[idx] = buf;
+ buf->b_flags |= ARC_IN_HASH_TABLE;
+
+ /* collect some hash table performance data */
+ if (i > 0) {
+ ARCSTAT_BUMP(arcstat_hash_collisions);
+ if (i == 1)
+ ARCSTAT_BUMP(arcstat_hash_chains);
+
+ ARCSTAT_MAX(arcstat_hash_chain_max, i);
+ }
+
+ ARCSTAT_BUMP(arcstat_hash_elements);
+ ARCSTAT_MAXSTAT(arcstat_hash_elements);
+
+ return (NULL);
+}
+
+static void
+buf_hash_remove(arc_buf_hdr_t *buf)
+{
+ arc_buf_hdr_t *fbuf, **bufp;
+ uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+
+ ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+ ASSERT(HDR_IN_HASH_TABLE(buf));
+
+ bufp = &buf_hash_table.ht_table[idx];
+ while ((fbuf = *bufp) != buf) {
+ ASSERT(fbuf != NULL);
+ bufp = &fbuf->b_hash_next;
+ }
+ *bufp = buf->b_hash_next;
+ buf->b_hash_next = NULL;
+ buf->b_flags &= ~ARC_IN_HASH_TABLE;
+
+ /* collect some hash table performance data */
+ ARCSTAT_BUMPDOWN(arcstat_hash_elements);
+
+ if (buf_hash_table.ht_table[idx] &&
+ buf_hash_table.ht_table[idx]->b_hash_next == NULL)
+ ARCSTAT_BUMPDOWN(arcstat_hash_chains);
+}
+
+/*
+ * Global data structures and functions for the buf kmem cache.
+ */
+static kmem_cache_t *hdr_cache;
+static kmem_cache_t *buf_cache;
+
+static void
+buf_fini(void)
+{
+ int i;
+
+ kmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+ for (i = 0; i < BUF_LOCKS; i++)
+ mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+ kmem_cache_destroy(hdr_cache);
+ kmem_cache_destroy(buf_cache);
+}
+
+/*
+ * Constructor callback - called when the cache is empty
+ * and a new buf is requested.
+ */
+/* ARGSUSED */
+static int
+hdr_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *buf = vbuf;
+
+ bzero(buf, sizeof (arc_buf_hdr_t));
+ refcount_create(&buf->b_refcnt);
+ cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
+ return (0);
+}
+
+/*
+ * Destructor callback - called when a cached buf is
+ * no longer required.
+ */
+/* ARGSUSED */
+static void
+hdr_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *buf = vbuf;
+
+ refcount_destroy(&buf->b_refcnt);
+ cv_destroy(&buf->b_cv);
+}
+
+/*
+ * Reclaim callback -- invoked when memory is low.
+ */
+/* ARGSUSED */
+static void
+hdr_recl(void *unused)
+{
+ dprintf("hdr_recl called\n");
+ /*
+ * umem calls the reclaim func when we destroy the buf cache,
+ * which is after we do arc_fini().
+ */
+ if (!arc_dead)
+ cv_signal(&arc_reclaim_thr_cv);
+}
+
+static void
+buf_init(void)
+{
+ uint64_t *ct;
+ uint64_t hsize = 1ULL << 12;
+ int i, j;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average 64K block size. The table will take up
+	 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
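+	 * For example, with 4GB of physical memory the loop below settles on
+	 * 65536 buckets, i.e. a 512KB table of 8-byte pointers.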
+ */
+ while (hsize * 65536 < physmem * PAGESIZE)
+ hsize <<= 1;
+retry:
+ buf_hash_table.ht_mask = hsize - 1;
+ buf_hash_table.ht_table =
+ kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+ if (buf_hash_table.ht_table == NULL) {
+ ASSERT(hsize > (1ULL << 8));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
+ 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+ buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < 256; i++)
+ for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
+ *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
+
+ for (i = 0; i < BUF_LOCKS; i++) {
+ mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ }
+}
+
+#define ARC_MINTIME (hz>>4) /* 62 ms */
+
+static void
+arc_cksum_verify(arc_buf_t *buf)
+{
+ zio_cksum_t zc;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ if (buf->b_hdr->b_freeze_cksum == NULL ||
+ (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+ return;
+ }
+ fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
+ if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
+ panic("buffer modified while frozen!");
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+}
+
+static void
+arc_cksum_compute(arc_buf_t *buf)
+{
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ if (buf->b_hdr->b_freeze_cksum != NULL) {
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+ return;
+ }
+ buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+ fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
+ buf->b_hdr->b_freeze_cksum);
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+}
+
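+/*
+ * The freeze/thaw routines below are a debugging aid: when ZFS_DEBUG_MODIFY
+ * is set in zfs_flags, a consumer is expected to thaw a buffer before
+ * modifying it and freeze it again once the modification is done, so that
+ * arc_cksum_verify() can panic if a buffer changes while it is frozen.
+ */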
+void
+arc_buf_thaw(arc_buf_t *buf)
+{
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ if (buf->b_hdr->b_state != arc_anon)
+ panic("modifying non-anon buffer!");
+ if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
+ panic("modifying buffer while i/o in progress!");
+ arc_cksum_verify(buf);
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ if (buf->b_hdr->b_freeze_cksum != NULL) {
+ kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ buf->b_hdr->b_freeze_cksum = NULL;
+ }
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+}
+
+void
+arc_buf_freeze(arc_buf_t *buf)
+{
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
+ buf->b_hdr->b_state == arc_anon);
+ arc_cksum_compute(buf);
+}
+
+static void
+add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
+ (ab->b_state != arc_anon)) {
+ uint64_t delta = ab->b_size * ab->b_datacnt;
+
+ ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
+ mutex_enter(&ab->b_state->arcs_mtx);
+ ASSERT(list_link_active(&ab->b_arc_node));
+ list_remove(&ab->b_state->arcs_list, ab);
+ if (GHOST_STATE(ab->b_state)) {
+ ASSERT3U(ab->b_datacnt, ==, 0);
+ ASSERT3P(ab->b_buf, ==, NULL);
+ delta = ab->b_size;
+ }
+ ASSERT(delta > 0);
+ ASSERT3U(ab->b_state->arcs_lsize, >=, delta);
+ atomic_add_64(&ab->b_state->arcs_lsize, -delta);
+ mutex_exit(&ab->b_state->arcs_mtx);
+		/* remove the prefetch flag if we get a reference */
+ if (ab->b_flags & ARC_PREFETCH)
+ ab->b_flags &= ~ARC_PREFETCH;
+ }
+}
+
+static int
+remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+ int cnt;
+ arc_state_t *state = ab->b_state;
+
+ ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
+ ASSERT(!GHOST_STATE(state));
+
+ if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
+ (state != arc_anon)) {
+ ASSERT(!MUTEX_HELD(&state->arcs_mtx));
+ mutex_enter(&state->arcs_mtx);
+ ASSERT(!list_link_active(&ab->b_arc_node));
+ list_insert_head(&state->arcs_list, ab);
+ ASSERT(ab->b_datacnt > 0);
+ atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt);
+ ASSERT3U(state->arcs_size, >=, state->arcs_lsize);
+ mutex_exit(&state->arcs_mtx);
+ }
+ return (cnt);
+}
+
+/*
+ * Move the supplied buffer to the indicated state. The mutex
+ * for the buffer must be held by the caller.
+ */
+static void
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
+{
+ arc_state_t *old_state = ab->b_state;
+ int64_t refcnt = refcount_count(&ab->b_refcnt);
+ uint64_t from_delta, to_delta;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(new_state != old_state);
+ ASSERT(refcnt == 0 || ab->b_datacnt > 0);
+ ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+
+ from_delta = to_delta = ab->b_datacnt * ab->b_size;
+
+ /*
+ * If this buffer is evictable, transfer it from the
+ * old state list to the new state list.
+ */
+ if (refcnt == 0) {
+ if (old_state != arc_anon) {
+ int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
+
+ if (use_mutex)
+ mutex_enter(&old_state->arcs_mtx);
+
+ ASSERT(list_link_active(&ab->b_arc_node));
+ list_remove(&old_state->arcs_list, ab);
+
+ /*
+ * If prefetching out of the ghost cache,
+ * we will have a non-null datacnt.
+ */
+ if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
+ /* ghost elements have a ghost size */
+ ASSERT(ab->b_buf == NULL);
+ from_delta = ab->b_size;
+ }
+ ASSERT3U(old_state->arcs_lsize, >=, from_delta);
+ atomic_add_64(&old_state->arcs_lsize, -from_delta);
+
+ if (use_mutex)
+ mutex_exit(&old_state->arcs_mtx);
+ }
+ if (new_state != arc_anon) {
+ int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
+
+ if (use_mutex)
+ mutex_enter(&new_state->arcs_mtx);
+
+ list_insert_head(&new_state->arcs_list, ab);
+
+ /* ghost elements have a ghost size */
+ if (GHOST_STATE(new_state)) {
+ ASSERT(ab->b_datacnt == 0);
+ ASSERT(ab->b_buf == NULL);
+ to_delta = ab->b_size;
+ }
+ atomic_add_64(&new_state->arcs_lsize, to_delta);
+ ASSERT3U(new_state->arcs_size + to_delta, >=,
+ new_state->arcs_lsize);
+
+ if (use_mutex)
+ mutex_exit(&new_state->arcs_mtx);
+ }
+ }
+
+ ASSERT(!BUF_EMPTY(ab));
+ if (new_state == arc_anon && old_state != arc_anon) {
+ buf_hash_remove(ab);
+ }
+
+ /* adjust state sizes */
+ if (to_delta)
+ atomic_add_64(&new_state->arcs_size, to_delta);
+ if (from_delta) {
+ ASSERT3U(old_state->arcs_size, >=, from_delta);
+ atomic_add_64(&old_state->arcs_size, -from_delta);
+ }
+ ab->b_state = new_state;
+}
+
+arc_buf_t *
+arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+
+ ASSERT3U(size, >, 0);
+ hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ ASSERT(BUF_EMPTY(hdr));
+ hdr->b_size = size;
+ hdr->b_type = type;
+ hdr->b_spa = spa;
+ hdr->b_state = arc_anon;
+ hdr->b_arc_access = 0;
+ mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = NULL;
+ hdr->b_buf = buf;
+ arc_get_data_buf(buf);
+ hdr->b_datacnt = 1;
+ hdr->b_flags = 0;
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ (void) refcount_add(&hdr->b_refcnt, tag);
+
+ return (buf);
+}
+
+static arc_buf_t *
+arc_buf_clone(arc_buf_t *from)
+{
+ arc_buf_t *buf;
+ arc_buf_hdr_t *hdr = from->b_hdr;
+ uint64_t size = hdr->b_size;
+
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = hdr->b_buf;
+ hdr->b_buf = buf;
+ arc_get_data_buf(buf);
+ bcopy(from->b_data, buf->b_data, size);
+ hdr->b_datacnt += 1;
+ return (buf);
+}
+
+void
+arc_buf_add_ref(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+
+ /*
+ * Check to see if this buffer is currently being evicted via
+ * arc_do_user_evicts().
+ */
+ mutex_enter(&arc_eviction_mtx);
+ hdr = buf->b_hdr;
+ if (hdr == NULL) {
+ mutex_exit(&arc_eviction_mtx);
+ return;
+ }
+ hash_lock = HDR_LOCK(hdr);
+ mutex_exit(&arc_eviction_mtx);
+
+ mutex_enter(hash_lock);
+ if (buf->b_data == NULL) {
+ /*
+ * This buffer is evicted.
+ */
+ mutex_exit(hash_lock);
+ return;
+ }
+
+ ASSERT(buf->b_hdr == hdr);
+ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+ add_reference(hdr, hash_lock, tag);
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ data, metadata, hits);
+}
+
+static void
+arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
+{
+ arc_buf_t **bufp;
+
+ /* free up data associated with the buf */
+ if (buf->b_data) {
+ arc_state_t *state = buf->b_hdr->b_state;
+ uint64_t size = buf->b_hdr->b_size;
+ arc_buf_contents_t type = buf->b_hdr->b_type;
+
+ arc_cksum_verify(buf);
+ if (!recycle) {
+ if (type == ARC_BUFC_METADATA) {
+ zio_buf_free(buf->b_data, size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ zio_data_buf_free(buf->b_data, size);
+ }
+ atomic_add_64(&arc_size, -size);
+ }
+ if (list_link_active(&buf->b_hdr->b_arc_node)) {
+ ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
+ ASSERT(state != arc_anon);
+ ASSERT3U(state->arcs_lsize, >=, size);
+ atomic_add_64(&state->arcs_lsize, -size);
+ }
+ ASSERT3U(state->arcs_size, >=, size);
+ atomic_add_64(&state->arcs_size, -size);
+ buf->b_data = NULL;
+ ASSERT(buf->b_hdr->b_datacnt > 0);
+ buf->b_hdr->b_datacnt -= 1;
+ }
+
+ /* only remove the buf if requested */
+ if (!all)
+ return;
+
+ /* remove the buf from the hdr list */
+ for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
+ continue;
+ *bufp = buf->b_next;
+
+ ASSERT(buf->b_efunc == NULL);
+
+ /* clean up the buf */
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+}
+
+static void
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+
+ if (!BUF_EMPTY(hdr)) {
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ }
+ while (hdr->b_buf) {
+ arc_buf_t *buf = hdr->b_buf;
+
+ if (buf->b_efunc) {
+ mutex_enter(&arc_eviction_mtx);
+ ASSERT(buf->b_hdr != NULL);
+ arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
+ hdr->b_buf = buf->b_next;
+ buf->b_hdr = &arc_eviction_hdr;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&arc_eviction_mtx);
+ } else {
+ arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
+ }
+ }
+ if (hdr->b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_freeze_cksum = NULL;
+ }
+ mutex_destroy(&hdr->b_freeze_lock);
+
+ ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+ ASSERT3P(hdr->b_acb, ==, NULL);
+ kmem_cache_free(hdr_cache, hdr);
+}
+
+void
+arc_buf_free(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ int hashed = hdr->b_state != arc_anon;
+
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(buf->b_data != NULL);
+
+ if (hashed) {
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+ mutex_enter(hash_lock);
+ (void) remove_reference(hdr, hash_lock, tag);
+ if (hdr->b_datacnt > 1)
+ arc_buf_destroy(buf, FALSE, TRUE);
+ else
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
+ mutex_exit(hash_lock);
+ } else if (HDR_IO_IN_PROGRESS(hdr)) {
+ int destroy_hdr;
+ /*
+ * We are in the middle of an async write. Don't destroy
+ * this buffer unless the write completes before we finish
+ * decrementing the reference count.
+ */
+ mutex_enter(&arc_eviction_mtx);
+ (void) remove_reference(hdr, NULL, tag);
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
+ mutex_exit(&arc_eviction_mtx);
+ if (destroy_hdr)
+ arc_hdr_destroy(hdr);
+ } else {
+ if (remove_reference(hdr, NULL, tag) > 0) {
+ ASSERT(HDR_IO_ERROR(hdr));
+ arc_buf_destroy(buf, FALSE, TRUE);
+ } else {
+ arc_hdr_destroy(hdr);
+ }
+ }
+}
+
+int
+arc_buf_remove_ref(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ int no_callback = (buf->b_efunc == NULL);
+
+ if (hdr->b_state == arc_anon) {
+ arc_buf_free(buf, tag);
+ return (no_callback);
+ }
+
+ mutex_enter(hash_lock);
+ ASSERT(hdr->b_state != arc_anon);
+ ASSERT(buf->b_data != NULL);
+
+ (void) remove_reference(hdr, hash_lock, tag);
+ if (hdr->b_datacnt > 1) {
+ if (no_callback)
+ arc_buf_destroy(buf, FALSE, TRUE);
+ } else if (no_callback) {
+ ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
+ ASSERT(no_callback || hdr->b_datacnt > 1 ||
+ refcount_is_zero(&hdr->b_refcnt));
+ mutex_exit(hash_lock);
+ return (no_callback);
+}
+
+int
+arc_buf_size(arc_buf_t *buf)
+{
+ return (buf->b_hdr->b_size);
+}
+
+/*
+ * Evict buffers from list until we've removed the specified number of
+ * bytes. Move the removed buffers to the appropriate evict state.
+ * If the recycle flag is set, then attempt to "recycle" a buffer:
+ * - look for a buffer to evict that is `bytes' long.
+ * - return the data block from this buffer rather than freeing it.
+ * This flag is used by callers that are trying to make space for a
+ * new buffer in a full arc cache.
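+ * For example, arc_get_data_buf() passes recycle=TRUE with `bytes' equal
+ * to the size of the buffer it is allocating, so that an evicted buffer's
+ * data block of matching size and type can be handed back to it directly.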
+ */
+static void *
+arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
+ arc_buf_contents_t type)
+{
+ arc_state_t *evicted_state;
+ uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
+ arc_buf_hdr_t *ab, *ab_prev = NULL;
+ kmutex_t *hash_lock;
+ boolean_t have_lock;
+ void *stolen = NULL;
+
+ ASSERT(state == arc_mru || state == arc_mfu);
+
+ evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ mutex_enter(&state->arcs_mtx);
+ mutex_enter(&evicted_state->arcs_mtx);
+
+ for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
+ ab_prev = list_prev(&state->arcs_list, ab);
+ /* prefetch buffers have a minimum lifespan */
+ if (HDR_IO_IN_PROGRESS(ab) ||
+ (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
+ lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
+ skipped++;
+ continue;
+ }
+ /* "lookahead" for better eviction candidate */
+ if (recycle && ab->b_size != bytes &&
+ ab_prev && ab_prev->b_size == bytes)
+ continue;
+ hash_lock = HDR_LOCK(ab);
+ have_lock = MUTEX_HELD(hash_lock);
+ if (have_lock || mutex_tryenter(hash_lock)) {
+ ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+ ASSERT(ab->b_datacnt > 0);
+ while (ab->b_buf) {
+ arc_buf_t *buf = ab->b_buf;
+ if (buf->b_data) {
+ bytes_evicted += ab->b_size;
+ if (recycle && ab->b_type == type &&
+ ab->b_size == bytes) {
+ stolen = buf->b_data;
+ recycle = FALSE;
+ }
+ }
+ if (buf->b_efunc) {
+ mutex_enter(&arc_eviction_mtx);
+ arc_buf_destroy(buf,
+ buf->b_data == stolen, FALSE);
+ ab->b_buf = buf->b_next;
+ buf->b_hdr = &arc_eviction_hdr;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&arc_eviction_mtx);
+ } else {
+ arc_buf_destroy(buf,
+ buf->b_data == stolen, TRUE);
+ }
+ }
+ ASSERT(ab->b_datacnt == 0);
+ arc_change_state(evicted_state, ab, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(ab));
+ ab->b_flags = ARC_IN_HASH_TABLE;
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+ if (!have_lock)
+ mutex_exit(hash_lock);
+ if (bytes >= 0 && bytes_evicted >= bytes)
+ break;
+ } else {
+ missed += 1;
+ }
+ }
+
+ mutex_exit(&evicted_state->arcs_mtx);
+ mutex_exit(&state->arcs_mtx);
+
+ if (bytes_evicted < bytes)
+		dprintf("only evicted %lld bytes from %p",
+ (longlong_t)bytes_evicted, state);
+
+ if (skipped)
+ ARCSTAT_INCR(arcstat_evict_skip, skipped);
+
+ if (missed)
+ ARCSTAT_INCR(arcstat_mutex_miss, missed);
+
+ return (stolen);
+}
+
+/*
+ * Remove buffers from list until we've removed the specified number of
+ * bytes. Destroy the buffers that are removed.
+ */
+static void
+arc_evict_ghost(arc_state_t *state, int64_t bytes)
+{
+ arc_buf_hdr_t *ab, *ab_prev;
+ kmutex_t *hash_lock;
+ uint64_t bytes_deleted = 0;
+ uint64_t bufs_skipped = 0;
+
+ ASSERT(GHOST_STATE(state));
+top:
+ mutex_enter(&state->arcs_mtx);
+ for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
+ ab_prev = list_prev(&state->arcs_list, ab);
+ hash_lock = HDR_LOCK(ab);
+ if (mutex_tryenter(hash_lock)) {
+ ASSERT(!HDR_IO_IN_PROGRESS(ab));
+ ASSERT(ab->b_buf == NULL);
+ arc_change_state(arc_anon, ab, hash_lock);
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_deleted);
+ bytes_deleted += ab->b_size;
+ arc_hdr_destroy(ab);
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
+ if (bytes >= 0 && bytes_deleted >= bytes)
+ break;
+ } else {
+ if (bytes < 0) {
+ mutex_exit(&state->arcs_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+ bufs_skipped += 1;
+ }
+ }
+ mutex_exit(&state->arcs_mtx);
+
+ if (bufs_skipped) {
+ ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
+ ASSERT(bytes >= 0);
+ }
+
+ if (bytes_deleted < bytes)
+ dprintf("only deleted %lld bytes from %p",
+ (longlong_t)bytes_deleted, state);
+}
+
+static void
+arc_adjust(void)
+{
+ int64_t top_sz, mru_over, arc_over, todelete;
+
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+
+ if (top_sz > arc_p && arc_mru->arcs_lsize > 0) {
+ int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p);
+ (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF);
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+ }
+
+ mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
+
+ if (mru_over > 0) {
+ if (arc_mru_ghost->arcs_lsize > 0) {
+ todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over);
+ arc_evict_ghost(arc_mru_ghost, todelete);
+ }
+ }
+
+ if ((arc_over = arc_size - arc_c) > 0) {
+ int64_t tbl_over;
+
+ if (arc_mfu->arcs_lsize > 0) {
+ int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over);
+ (void) arc_evict(arc_mfu, toevict, FALSE,
+ ARC_BUFC_UNDEF);
+ }
+
+ tbl_over = arc_size + arc_mru_ghost->arcs_lsize +
+ arc_mfu_ghost->arcs_lsize - arc_c*2;
+
+ if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) {
+ todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over);
+ arc_evict_ghost(arc_mfu_ghost, todelete);
+ }
+ }
+}
+
+static void
+arc_do_user_evicts(void)
+{
+ mutex_enter(&arc_eviction_mtx);
+ while (arc_eviction_list != NULL) {
+ arc_buf_t *buf = arc_eviction_list;
+ arc_eviction_list = buf->b_next;
+ buf->b_hdr = NULL;
+ mutex_exit(&arc_eviction_mtx);
+
+ if (buf->b_efunc != NULL)
+ VERIFY(buf->b_efunc(buf) == 0);
+
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ kmem_cache_free(buf_cache, buf);
+ mutex_enter(&arc_eviction_mtx);
+ }
+ mutex_exit(&arc_eviction_mtx);
+}
+
+/*
+ * Flush all *evictable* data from the cache.
+ * NOTE: this will not touch "active" (i.e. referenced) data.
+ */
+void
+arc_flush(void)
+{
+ while (list_head(&arc_mru->arcs_list))
+ (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF);
+ while (list_head(&arc_mfu->arcs_list))
+ (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF);
+
+ arc_evict_ghost(arc_mru_ghost, -1);
+ arc_evict_ghost(arc_mfu_ghost, -1);
+
+ mutex_enter(&arc_reclaim_thr_lock);
+ arc_do_user_evicts();
+ mutex_exit(&arc_reclaim_thr_lock);
+ ASSERT(arc_eviction_list == NULL);
+}
+
+int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */
+
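+/*
+ * With the default shift of 5, each call to arc_shrink() lowers the target
+ * cache size by arc_c >> 5, i.e. 1/32nd of its current value (for example,
+ * roughly 32MB off a 1GB target), but never below arc_c_min.
+ */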
+void
+arc_shrink(void)
+{
+ if (arc_c > arc_c_min) {
+ uint64_t to_free;
+
+#ifdef _KERNEL
+ to_free = arc_c >> arc_shrink_shift;
+#else
+ to_free = arc_c >> arc_shrink_shift;
+#endif
+ if (arc_c > arc_c_min + to_free)
+ atomic_add_64(&arc_c, -to_free);
+ else
+ arc_c = arc_c_min;
+
+ atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
+ if (arc_c > arc_size)
+ arc_c = MAX(arc_size, arc_c_min);
+ if (arc_p > arc_c)
+ arc_p = (arc_c >> 1);
+ ASSERT(arc_c >= arc_c_min);
+ ASSERT((int64_t)arc_p >= 0);
+ }
+
+ if (arc_size > arc_c)
+ arc_adjust();
+}
+
+static int zfs_needfree = 0;
+
+static int
+arc_reclaim_needed(void)
+{
+#if 0
+ uint64_t extra;
+#endif
+
+#ifdef _KERNEL
+
+ if (zfs_needfree)
+ return (1);
+
+#if 0
+ /*
+ * check to make sure that swapfs has enough space so that anon
+	 * reservations can still succeed. anon_resvmem() checks that the
+	 * availrmem is greater than swapfs_minfree plus the number of reserved
+ * swap pages. We also add a bit of extra here just to prevent
+ * circumstances from getting really dire.
+ */
+ if (availrmem < swapfs_minfree + swapfs_reserve + extra)
+ return (1);
+
+ /*
+ * If zio data pages are being allocated out of a separate heap segment,
+ * then check that the size of available vmem for this area remains
+ * above 1/4th free. This needs to be done when the size of the
+ * non-default segment is smaller than physical memory, so we could
+ * conceivably run out of VA in that segment before running out of
+ * physical memory.
+ */
+ if (zio_arena != NULL) {
+ size_t arc_ziosize =
+ btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC));
+
+ if ((physmem > arc_ziosize) &&
+ (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2))
+ return (1);
+ }
+
+#if defined(__i386)
+ /*
+ * If we're on an i386 platform, it's possible that we'll exhaust the
+ * kernel heap space before we ever run out of available physical
+ * memory. Most checks of the size of the heap_area compare against
+ * tune.t_minarmem, which is the minimum available real memory that we
+ * can have in the system. However, this is generally fixed at 25 pages
+ * which is so low that it's useless. In this comparison, we seek to
+ * calculate the total heap-size, and reclaim if more than 3/4ths of the
+	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
+ * free)
+ */
+ if (btop(vmem_size(heap_arena, VMEM_FREE)) <
+ (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+ return (1);
+#endif
+#else
+ if (kmem_map->size > (vm_kmem_size * 3) / 4)
+ return (1);
+#endif
+
+#else
+ if (spa_get_random(100) == 0)
+ return (1);
+#endif
+ return (0);
+}
+
+static void
+arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+{
+#ifdef ZIO_USE_UMA
+ size_t i;
+ kmem_cache_t *prev_cache = NULL;
+ kmem_cache_t *prev_data_cache = NULL;
+ extern kmem_cache_t *zio_buf_cache[];
+ extern kmem_cache_t *zio_data_buf_cache[];
+#endif
+
+#ifdef _KERNEL
+ /*
+ * First purge some DNLC entries, in case the DNLC is using
+ * up too much memory.
+ */
+ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
+
+#if defined(__i386)
+ /*
+ * Reclaim unused memory from all kmem caches.
+ */
+ kmem_reap();
+#endif
+#endif
+
+ /*
+	 * An aggressive reclamation will shrink the cache size as well as
+ * reap free buffers from the arc kmem caches.
+ */
+ if (strat == ARC_RECLAIM_AGGR)
+ arc_shrink();
+
+#ifdef ZIO_USE_UMA
+ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+ if (zio_buf_cache[i] != prev_cache) {
+ prev_cache = zio_buf_cache[i];
+ kmem_cache_reap_now(zio_buf_cache[i]);
+ }
+ if (zio_data_buf_cache[i] != prev_data_cache) {
+ prev_data_cache = zio_data_buf_cache[i];
+ kmem_cache_reap_now(zio_data_buf_cache[i]);
+ }
+ }
+#endif
+ kmem_cache_reap_now(buf_cache);
+ kmem_cache_reap_now(hdr_cache);
+}
+
+static void
+arc_reclaim_thread(void *dummy __unused)
+{
+ clock_t growtime = 0;
+ arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&arc_reclaim_thr_lock);
+ while (arc_thread_exit == 0) {
+ if (arc_reclaim_needed()) {
+
+ if (arc_no_grow) {
+ if (last_reclaim == ARC_RECLAIM_CONS) {
+ last_reclaim = ARC_RECLAIM_AGGR;
+ } else {
+ last_reclaim = ARC_RECLAIM_CONS;
+ }
+ } else {
+ arc_no_grow = TRUE;
+ last_reclaim = ARC_RECLAIM_AGGR;
+ membar_producer();
+ }
+
+ /* reset the growth delay for every reclaim */
+ growtime = lbolt + (arc_grow_retry * hz);
+ ASSERT(growtime > 0);
+
+ if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) {
+ /*
+ * If zfs_needfree is TRUE our vm_lowmem hook
+ * was called and in that case we must free some
+ * memory, so switch to aggressive mode.
+ */
+ arc_no_grow = TRUE;
+ last_reclaim = ARC_RECLAIM_AGGR;
+ }
+ arc_kmem_reap_now(last_reclaim);
+ } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
+ arc_no_grow = FALSE;
+ }
+
+ if (zfs_needfree ||
+ (2 * arc_c < arc_size +
+ arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
+ arc_adjust();
+
+ if (arc_eviction_list != NULL)
+ arc_do_user_evicts();
+
+ if (arc_reclaim_needed()) {
+ zfs_needfree = 0;
+#ifdef _KERNEL
+ wakeup(&zfs_needfree);
+#endif
+ }
+
+ /* block until needed, or one second, whichever is shorter */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&arc_reclaim_thr_cv,
+ &arc_reclaim_thr_lock, hz);
+ CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+ }
+
+ arc_thread_exit = 0;
+ cv_broadcast(&arc_reclaim_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
+ thread_exit();
+}
+
+/*
+ * Adapt arc info given the number of bytes we are trying to add and
+ * the state that we are coming from. This function is only called
+ * when we are adding new content to the cache.
+ */
+static void
+arc_adapt(int bytes, arc_state_t *state)
+{
+ int mult;
+
+ ASSERT(bytes > 0);
+ /*
+ * Adapt the target size of the MRU list:
+ * - if we just hit in the MRU ghost list, then increase
+ * the target size of the MRU list.
+ * - if we just hit in the MFU ghost list, then increase
+ * the target size of the MFU list by decreasing the
+ * target size of the MRU list.
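+	 *
+	 * For example, if the MFU ghost list is four times the size of the
+	 * MRU ghost list, a hit in the MRU ghost list grows arc_p by
+	 * 4 * bytes (capped at arc_c), shifting the balance toward the MRU.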
+ */
+ if (state == arc_mru_ghost) {
+ mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
+ 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+
+ arc_p = MIN(arc_c, arc_p + bytes * mult);
+ } else if (state == arc_mfu_ghost) {
+ mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
+ 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+
+ arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
+ }
+ ASSERT((int64_t)arc_p >= 0);
+
+ if (arc_reclaim_needed()) {
+ cv_signal(&arc_reclaim_thr_cv);
+ return;
+ }
+
+ if (arc_no_grow)
+ return;
+
+ if (arc_c >= arc_c_max)
+ return;
+
+ /*
+ * If we're within (2 * maxblocksize) bytes of the target
+ * cache size, increment the target cache size
+ */
+ if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+ atomic_add_64(&arc_c, (int64_t)bytes);
+ if (arc_c > arc_c_max)
+ arc_c = arc_c_max;
+ else if (state == arc_anon)
+ atomic_add_64(&arc_p, (int64_t)bytes);
+ if (arc_p > arc_c)
+ arc_p = arc_c;
+ }
+ ASSERT((int64_t)arc_p >= 0);
+}
+
+/*
+ * Check if the cache has reached its limits and eviction is required
+ * prior to insert.
+ */
+static int
+arc_evict_needed()
+{
+ if (arc_reclaim_needed())
+ return (1);
+
+ return (arc_size > arc_c);
+}
+
+/*
+ * The buffer, supplied as the first argument, needs a data block.
+ * So, if we are at cache max, determine which cache should be victimized.
+ * We have the following cases:
+ *
+ * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
+ * In this situation if we're out of space, but the resident size of the MFU is
+ * under the limit, victimize the MFU cache to satisfy this insertion request.
+ *
+ * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
+ * Here, we've used up all of the available space for the MRU, so we need to
+ * evict from our own cache instead. Evict from the set of resident MRU
+ * entries.
+ *
+ * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
+ * c minus p represents the MFU space in the cache, since p is the size of the
+ * cache that is dedicated to the MRU. In this situation there's still space on
+ * the MFU side, so the MRU side needs to be victimized.
+ *
+ * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
+ * MFU's resident set is consuming more space than it has been allotted. In
+ * this situation, we must victimize our own cache, the MFU, for this insertion.
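+ *
+ * For example, with c = 1GB and p = 600MB, an insert for the MRU made while
+ * arc_anon and arc_mru together already hold 700MB falls under case 2, so
+ * the space for the new block is reclaimed from the resident MRU list.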
+ */
+static void
+arc_get_data_buf(arc_buf_t *buf)
+{
+ arc_state_t *state = buf->b_hdr->b_state;
+ uint64_t size = buf->b_hdr->b_size;
+ arc_buf_contents_t type = buf->b_hdr->b_type;
+
+ arc_adapt(size, state);
+
+ /*
+ * We have not yet reached cache maximum size,
+ * just allocate a new buffer.
+ */
+ if (!arc_evict_needed()) {
+ if (type == ARC_BUFC_METADATA) {
+ buf->b_data = zio_buf_alloc(size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ buf->b_data = zio_data_buf_alloc(size);
+ }
+ atomic_add_64(&arc_size, size);
+ goto out;
+ }
+
+ /*
+ * If we are prefetching from the mfu ghost list, this buffer
+ * will end up on the mru list; so steal space from there.
+ */
+ if (state == arc_mfu_ghost)
+ state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
+ else if (state == arc_mru_ghost)
+ state = arc_mru;
+
+ if (state == arc_mru || state == arc_anon) {
+ uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
+ state = (arc_p > mru_used) ? arc_mfu : arc_mru;
+ } else {
+ /* MFU cases */
+ uint64_t mfu_space = arc_c - arc_p;
+ state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+ }
+ if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
+ if (type == ARC_BUFC_METADATA) {
+ buf->b_data = zio_buf_alloc(size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ buf->b_data = zio_data_buf_alloc(size);
+ }
+ atomic_add_64(&arc_size, size);
+ ARCSTAT_BUMP(arcstat_recycle_miss);
+ }
+ ASSERT(buf->b_data != NULL);
+out:
+ /*
+ * Update the state size. Note that ghost states have a
+ * "ghost size" and so don't need to be updated.
+ */
+ if (!GHOST_STATE(buf->b_hdr->b_state)) {
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ atomic_add_64(&hdr->b_state->arcs_size, size);
+ if (list_link_active(&hdr->b_arc_node)) {
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ atomic_add_64(&hdr->b_state->arcs_lsize, size);
+ }
+ /*
+ * If we are growing the cache, and we are adding anonymous
+ * data, and we have outgrown arc_p, update arc_p
+ */
+ if (arc_size < arc_c && hdr->b_state == arc_anon &&
+ arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
+ arc_p = MIN(arc_c, arc_p + size);
+ }
+}
+
+/*
+ * This routine is called whenever a buffer is accessed.
+ * NOTE: the caller must hold the hash lock; it is not dropped here.
+ */
+static void
+arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+{
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ if (buf->b_state == arc_anon) {
+ /*
+ * This buffer is not in the cache, and does not
+ * appear in our "ghost" list. Add the new buffer
+ * to the MRU state.
+ */
+
+ ASSERT(buf->b_arc_access == 0);
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mru, buf, hash_lock);
+
+ } else if (buf->b_state == arc_mru) {
+ /*
+ * If this buffer is here because of a prefetch, then either:
+ * - clear the flag if this is a "referencing" read
+ * (any subsequent access will bump this into the MFU state).
+ * or
+ * - move the buffer to the head of the list if this is
+ * another prefetch (to make it less likely to be evicted).
+ */
+ if ((buf->b_flags & ARC_PREFETCH) != 0) {
+ if (refcount_count(&buf->b_refcnt) == 0) {
+ ASSERT(list_link_active(&buf->b_arc_node));
+ mutex_enter(&arc_mru->arcs_mtx);
+ list_remove(&arc_mru->arcs_list, buf);
+ list_insert_head(&arc_mru->arcs_list, buf);
+ mutex_exit(&arc_mru->arcs_mtx);
+ } else {
+ buf->b_flags &= ~ARC_PREFETCH;
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ }
+ buf->b_arc_access = lbolt;
+ return;
+ }
+
+ /*
+ * This buffer has been "accessed" only once so far,
+ * but it is still in the cache. Move it to the MFU
+ * state.
+ */
+ if (lbolt > buf->b_arc_access + ARC_MINTIME) {
+ /*
+			 * More than ARC_MINTIME (about 62ms) has passed since we
+ * instantiated this buffer. Move it to the
+ * most frequently used state.
+ */
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mfu, buf, hash_lock);
+ }
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ } else if (buf->b_state == arc_mru_ghost) {
+ arc_state_t *new_state;
+ /*
+ * This buffer has been "accessed" recently, but
+ * was evicted from the cache. Move it to the
+ * MFU state.
+ */
+
+ if (buf->b_flags & ARC_PREFETCH) {
+ new_state = arc_mru;
+ if (refcount_count(&buf->b_refcnt) > 0)
+ buf->b_flags &= ~ARC_PREFETCH;
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+ } else {
+ new_state = arc_mfu;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ }
+
+ buf->b_arc_access = lbolt;
+ arc_change_state(new_state, buf, hash_lock);
+
+ ARCSTAT_BUMP(arcstat_mru_ghost_hits);
+ } else if (buf->b_state == arc_mfu) {
+ /*
+ * This buffer has been accessed more than once and is
+ * still in the cache. Keep it in the MFU state.
+ *
+ * NOTE: an add_reference() that occurred when we did
+ * the arc_read() will have kicked this off the list.
+ * If it was a prefetch, we will explicitly move it to
+ * the head of the list now.
+ */
+ if ((buf->b_flags & ARC_PREFETCH) != 0) {
+ ASSERT(refcount_count(&buf->b_refcnt) == 0);
+ ASSERT(list_link_active(&buf->b_arc_node));
+ mutex_enter(&arc_mfu->arcs_mtx);
+ list_remove(&arc_mfu->arcs_list, buf);
+ list_insert_head(&arc_mfu->arcs_list, buf);
+ mutex_exit(&arc_mfu->arcs_mtx);
+ }
+ ARCSTAT_BUMP(arcstat_mfu_hits);
+ buf->b_arc_access = lbolt;
+ } else if (buf->b_state == arc_mfu_ghost) {
+ arc_state_t *new_state = arc_mfu;
+ /*
+ * This buffer has been accessed more than once but has
+ * been evicted from the cache. Move it back to the
+ * MFU state.
+ */
+
+ if (buf->b_flags & ARC_PREFETCH) {
+ /*
+ * This is a prefetch access...
+ * move this block back to the MRU state.
+ */
+ ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+ new_state = arc_mru;
+ }
+
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(new_state, buf, hash_lock);
+
+ ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ } else {
+ ASSERT(!"invalid arc state");
+ }
+}
+
+/* a generic arc_done_func_t which you can use */
+/* ARGSUSED */
+void
+arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+ bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+ VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+}
+
+/* a generic arc_done_func_t which you can use */
+void
+arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+ arc_buf_t **bufp = arg;
+ if (zio && zio->io_error) {
+ VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+ *bufp = NULL;
+ } else {
+ *bufp = buf;
+ }
+}
+
+static void
+arc_read_done(zio_t *zio)
+{
+ arc_buf_hdr_t *hdr, *found;
+ arc_buf_t *buf;
+ arc_buf_t *abuf; /* buffer we're assigning to callback */
+ kmutex_t *hash_lock;
+ arc_callback_t *callback_list, *acb;
+ int freeable = FALSE;
+
+ buf = zio->io_private;
+ hdr = buf->b_hdr;
+
+ /*
+ * The hdr was inserted into hash-table and removed from lists
+ * prior to starting I/O. We should find this header, since
+ * it's in the hash table, and it should be legit since it's
+ * not possible to evict it during the I/O. The only possible
+ * reason for it not to be found is if we were freed during the
+ * read.
+ */
+ found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
+ &hash_lock);
+
+ ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
+ (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
+
+ /* byteswap if necessary */
+ callback_list = hdr->b_acb;
+ ASSERT(callback_list != NULL);
+ if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
+ callback_list->acb_byteswap(buf->b_data, hdr->b_size);
+
+ arc_cksum_compute(buf);
+
+ /* create copies of the data buffer for the callers */
+ abuf = buf;
+ for (acb = callback_list; acb; acb = acb->acb_next) {
+ if (acb->acb_done) {
+ if (abuf == NULL)
+ abuf = arc_buf_clone(buf);
+ acb->acb_buf = abuf;
+ abuf = NULL;
+ }
+ }
+ hdr->b_acb = NULL;
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ ASSERT(!HDR_BUF_AVAILABLE(hdr));
+ if (abuf == buf)
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
+
+ ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
+
+ if (zio->io_error != 0) {
+ hdr->b_flags |= ARC_IO_ERROR;
+ if (hdr->b_state != arc_anon)
+ arc_change_state(arc_anon, hdr, hash_lock);
+ if (HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ /* convert checksum errors into IO errors */
+ if (zio->io_error == ECKSUM)
+ zio->io_error = EIO;
+ }
+
+ /*
+ * Broadcast before we drop the hash_lock to avoid the possibility
+ * that the hdr (and hence the cv) might be freed before we get to
+ * the cv_broadcast().
+ */
+ cv_broadcast(&hdr->b_cv);
+
+ if (hash_lock) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ if (zio->io_error == 0 && hdr->b_state == arc_anon)
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ /*
+ * This block was freed while we waited for the read to
+ * complete. It has been removed from the hash table and
+ * moved to the anonymous state (so that it won't show up
+ * in the cache).
+ */
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ }
+
+ /* execute each callback and free its structure */
+ while ((acb = callback_list) != NULL) {
+ if (acb->acb_done)
+ acb->acb_done(zio, acb->acb_buf, acb->acb_private);
+
+ if (acb->acb_zio_dummy != NULL) {
+ acb->acb_zio_dummy->io_error = zio->io_error;
+ zio_nowait(acb->acb_zio_dummy);
+ }
+
+ callback_list = acb->acb_next;
+ kmem_free(acb, sizeof (arc_callback_t));
+ }
+
+ if (freeable)
+ arc_hdr_destroy(hdr);
+}
+
+/*
+ * "Read" the block block at the specified DVA (in bp) via the
+ * cache. If the block is found in the cache, invoke the provided
+ * callback immediately and return. Note that the `zio' parameter
+ * in the callback will be NULL in this case, since no IO was
+ * required. If the block is not in the cache pass the read request
+ * on to the spa with a substitute callback function, so that the
+ * requested block will be added to the cache.
+ *
+ * If a read request arrives for a block that has a read in-progress,
+ * either wait for the in-progress read to complete (and return the
+ * results); or, if this is a read with a "done" func, add a record
+ * to the read to invoke the "done" func when the read completes,
+ * and return; or just return.
+ *
+ * arc_read_done() will invoke all the requested "done" functions
+ * for readers of this block.
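+ *
+ * For example, a caller that needs the data synchronously would pass
+ * ARC_WAIT in *arc_flags so the miss is serviced with zio_wait(), while a
+ * prefetch would pass ARC_NOWAIT | ARC_PREFETCH with a NULL `done'
+ * callback, caching the block without holding a reference on it.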
+ */
+int
+arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+ arc_done_func_t *done, void *private, int priority, int flags,
+ uint32_t *arc_flags, zbookmark_t *zb)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ kmutex_t *hash_lock;
+ zio_t *rzio;
+
+top:
+ hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ if (hdr && hdr->b_datacnt > 0) {
+
+ *arc_flags |= ARC_CACHED;
+
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+
+ if (*arc_flags & ARC_WAIT) {
+ cv_wait(&hdr->b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+ ASSERT(*arc_flags & ARC_NOWAIT);
+
+ if (done) {
+ arc_callback_t *acb = NULL;
+
+ acb = kmem_zalloc(sizeof (arc_callback_t),
+ KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_byteswap = swap;
+ if (pio != NULL)
+ acb->acb_zio_dummy = zio_null(pio,
+ spa, NULL, NULL, flags);
+
+ ASSERT(acb->acb_done != NULL);
+ acb->acb_next = hdr->b_acb;
+ hdr->b_acb = acb;
+ add_reference(hdr, hash_lock, private);
+ mutex_exit(hash_lock);
+ return (0);
+ }
+ mutex_exit(hash_lock);
+ return (0);
+ }
+
+ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+
+ if (done) {
+ add_reference(hdr, hash_lock, private);
+ /*
+ * If this block is already in use, create a new
+ * copy of the data so that we will be guaranteed
+ * that arc_release() will always succeed.
+ */
+ buf = hdr->b_buf;
+ ASSERT(buf);
+ ASSERT(buf->b_data);
+ if (HDR_BUF_AVAILABLE(hdr)) {
+ ASSERT(buf->b_efunc == NULL);
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+ } else {
+ buf = arc_buf_clone(buf);
+ }
+ } else if (*arc_flags & ARC_PREFETCH &&
+ refcount_count(&hdr->b_refcnt) == 0) {
+ hdr->b_flags |= ARC_PREFETCH;
+ }
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ data, metadata, hits);
+
+ if (done)
+ done(NULL, buf, private);
+ } else {
+ uint64_t size = BP_GET_LSIZE(bp);
+ arc_callback_t *acb;
+
+ if (hdr == NULL) {
+ /* this block is not in the cache */
+ arc_buf_hdr_t *exists;
+ arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
+ buf = arc_buf_alloc(spa, size, private, type);
+ hdr = buf->b_hdr;
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = bp->blk_birth;
+ hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* somebody beat us to the hash insert */
+ mutex_exit(hash_lock);
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ (void) arc_buf_remove_ref(buf, private);
+ goto top; /* restart the IO request */
+ }
+ /* if this is a prefetch, we don't have a reference */
+ if (*arc_flags & ARC_PREFETCH) {
+ (void) remove_reference(hdr, hash_lock,
+ private);
+ hdr->b_flags |= ARC_PREFETCH;
+ }
+ if (BP_GET_LEVEL(bp) > 0)
+ hdr->b_flags |= ARC_INDIRECT;
+ } else {
+ /* this block is in the ghost cache */
+ ASSERT(GHOST_STATE(hdr->b_state));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
+ ASSERT(hdr->b_buf == NULL);
+
+ /* if this is a prefetch, we don't have a reference */
+ if (*arc_flags & ARC_PREFETCH)
+ hdr->b_flags |= ARC_PREFETCH;
+ else
+ add_reference(hdr, hash_lock, private);
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = NULL;
+ hdr->b_buf = buf;
+ arc_get_data_buf(buf);
+ ASSERT(hdr->b_datacnt == 0);
+ hdr->b_datacnt = 1;
+
+ }
+
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_byteswap = swap;
+
+ ASSERT(hdr->b_acb == NULL);
+ hdr->b_acb = acb;
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
+
+ /*
+ * If the buffer has been evicted, migrate it to a present state
+ * before issuing the I/O. Once we drop the hash-table lock,
+ * the header will be marked as I/O in progress and have an
+ * attached buffer. At this point, anybody who finds this
+ * buffer ought to notice that it's legit but has a pending I/O.
+ */
+
+ if (GHOST_STATE(hdr->b_state))
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+
+ ASSERT3U(hdr->b_size, ==, size);
+ DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
+ zbookmark_t *, zb);
+ ARCSTAT_BUMP(arcstat_misses);
+ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ data, metadata, misses);
+
+ rzio = zio_read(pio, spa, bp, buf->b_data, size,
+ arc_read_done, buf, priority, flags, zb);
+
+ if (*arc_flags & ARC_WAIT)
+ return (zio_wait(rzio));
+
+ ASSERT(*arc_flags & ARC_NOWAIT);
+ zio_nowait(rzio);
+ }
+ return (0);
+}
+
+/*
+ * arc_read() variant to support pool traversal. If the block is already
+ * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
+ * The idea is that we don't want pool traversal filling up memory, but
+ * if the ARC already has the data anyway, we shouldn't pay for the I/O.
+ */
+int
+arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_mtx;
+ int rc = 0;
+
+ hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
+
+ if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
+ arc_buf_t *buf = hdr->b_buf;
+
+ ASSERT(buf);
+ while (buf->b_data == NULL) {
+ buf = buf->b_next;
+ ASSERT(buf);
+ }
+ bcopy(buf->b_data, data, hdr->b_size);
+ } else {
+ rc = ENOENT;
+ }
+
+ if (hash_mtx)
+ mutex_exit(hash_mtx);
+
+ return (rc);
+}
+
+void
+arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
+{
+ ASSERT(buf->b_hdr != NULL);
+ ASSERT(buf->b_hdr->b_state != arc_anon);
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+ buf->b_efunc = func;
+ buf->b_private = private;
+}
+
+/*
+ * This is used by the DMU to let the ARC know that a buffer is
+ * being evicted, so the ARC should clean up. If this arc buf
+ * is not yet in the evicted state, it will be put there.
+ */
+int
+arc_buf_evict(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ arc_buf_t **bufp;
+
+ mutex_enter(&arc_eviction_mtx);
+ hdr = buf->b_hdr;
+ if (hdr == NULL) {
+ /*
+ * We are in arc_do_user_evicts().
+ */
+ ASSERT(buf->b_data == NULL);
+ mutex_exit(&arc_eviction_mtx);
+ return (0);
+ }
+ hash_lock = HDR_LOCK(hdr);
+ mutex_exit(&arc_eviction_mtx);
+
+ mutex_enter(hash_lock);
+
+ if (buf->b_data == NULL) {
+ /*
+ * We are on the eviction list.
+ */
+ mutex_exit(hash_lock);
+ mutex_enter(&arc_eviction_mtx);
+ if (buf->b_hdr == NULL) {
+ /*
+ * We are already in arc_do_user_evicts().
+ */
+ mutex_exit(&arc_eviction_mtx);
+ return (0);
+ } else {
+ arc_buf_t copy = *buf; /* structure assignment */
+ /*
+ * Process this buffer now
+ * but let arc_do_user_evicts() do the reaping.
+ */
+ buf->b_efunc = NULL;
+ mutex_exit(&arc_eviction_mtx);
+ VERIFY(copy.b_efunc(&copy) == 0);
+ return (1);
+ }
+ }
+
+ ASSERT(buf->b_hdr == hdr);
+ ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
+ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+
+ /*
+ * Pull this buffer off of the hdr
+ */
+ bufp = &hdr->b_buf;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = buf->b_next;
+
+ ASSERT(buf->b_data != NULL);
+ arc_buf_destroy(buf, FALSE, FALSE);
+
+ if (hdr->b_datacnt == 0) {
+ arc_state_t *old_state = hdr->b_state;
+ arc_state_t *evicted_state;
+
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+ evicted_state =
+ (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ mutex_enter(&old_state->arcs_mtx);
+ mutex_enter(&evicted_state->arcs_mtx);
+
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ hdr->b_flags = ARC_IN_HASH_TABLE;
+
+ mutex_exit(&evicted_state->arcs_mtx);
+ mutex_exit(&old_state->arcs_mtx);
+ }
+ mutex_exit(hash_lock);
+
+ VERIFY(buf->b_efunc(buf) == 0);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+ return (1);
+}
+
+/*
+ * Release this buffer from the cache. This must be done
+ * after a read and prior to modifying the buffer contents.
+ * If the buffer has more than one reference, we must make
+ * a new hdr for the buffer.
+ */
+void
+arc_release(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+ /* this buffer is not on any list */
+ ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+
+ if (hdr->b_state == arc_anon) {
+ /* this buffer is already released */
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+ ASSERT(BUF_EMPTY(hdr));
+ ASSERT(buf->b_efunc == NULL);
+ arc_buf_thaw(buf);
+ return;
+ }
+
+ mutex_enter(hash_lock);
+
+ /*
+ * Do we have more than one buf?
+ */
+ if (hdr->b_buf != buf || buf->b_next != NULL) {
+ arc_buf_hdr_t *nhdr;
+ arc_buf_t **bufp;
+ uint64_t blksz = hdr->b_size;
+ spa_t *spa = hdr->b_spa;
+ arc_buf_contents_t type = hdr->b_type;
+
+ ASSERT(hdr->b_datacnt > 1);
+ /*
+ * Pull the data off of this buf and attach it to
+ * a new anonymous buf.
+ */
+ (void) remove_reference(hdr, hash_lock, tag);
+ bufp = &hdr->b_buf;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = (*bufp)->b_next;
+ buf->b_next = NULL;
+
+ ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
+ atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
+ if (refcount_is_zero(&hdr->b_refcnt)) {
+ ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size);
+ atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
+ }
+ hdr->b_datacnt -= 1;
+ arc_cksum_verify(buf);
+
+ mutex_exit(hash_lock);
+
+ nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ nhdr->b_size = blksz;
+ nhdr->b_spa = spa;
+ nhdr->b_type = type;
+ nhdr->b_buf = buf;
+ nhdr->b_state = arc_anon;
+ nhdr->b_arc_access = 0;
+ nhdr->b_flags = 0;
+ nhdr->b_datacnt = 1;
+ nhdr->b_freeze_cksum = NULL;
+ (void) refcount_add(&nhdr->b_refcnt, tag);
+ buf->b_hdr = nhdr;
+ atomic_add_64(&arc_anon->arcs_size, blksz);
+
+ hdr = nhdr;
+ } else {
+ ASSERT(refcount_count(&hdr->b_refcnt) == 1);
+ ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ arc_change_state(arc_anon, hdr, hash_lock);
+ hdr->b_arc_access = 0;
+ mutex_exit(hash_lock);
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ arc_buf_thaw(buf);
+ }
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+}
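As a usage note, the read-then-modify discipline described above is the pattern dbuf_free_range() in dbuf.c follows: release the buffer (splitting it onto a new anonymous hdr if it is shared), modify the data, then freeze it again. A minimal sketch, assuming a held arc_buf_t *buf tagged with tag and hypothetical newdata/newsize values:

/* Detach buf from the shared, cached copy before overwriting it. */
arc_release(buf, tag);
bcopy(newdata, buf->b_data, newsize);
arc_buf_freeze(buf);	/* re-freeze the checksum/debug state */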
+
+int
+arc_released(arc_buf_t *buf)
+{
+ return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+}
+
+int
+arc_has_callback(arc_buf_t *buf)
+{
+ return (buf->b_efunc != NULL);
+}
+
+#ifdef ZFS_DEBUG
+int
+arc_referenced(arc_buf_t *buf)
+{
+ return (refcount_count(&buf->b_hdr->b_refcnt));
+}
+#endif
+
+static void
+arc_write_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+
+ if (callback->awcb_ready) {
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+ }
+ arc_cksum_compute(buf);
+}
+
+static void
+arc_write_done(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ hdr->b_acb = NULL;
+
+ /* this buffer is on no lists and is not in the hash table */
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = zio->io_bp->blk_birth;
+ hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+ /*
+ * If the block to be written was all-zero, we may have
+ * compressed it away. In this case no write was performed
+ * so there will be no dva/birth-date/checksum. The buffer
+ * must therefore remain anonymous (and uncached).
+ */
+ if (!BUF_EMPTY(hdr)) {
+ arc_buf_hdr_t *exists;
+ kmutex_t *hash_lock;
+
+ arc_cksum_verify(buf);
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /*
+ * This can only happen if we overwrite for
+ * sync-to-convergence, because we remove
+ * buffers from the hash table when we arc_free().
+ */
+ ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
+ BP_IDENTITY(zio->io_bp)));
+ ASSERT3U(zio->io_bp_orig.blk_birth, ==,
+ zio->io_bp->blk_birth);
+
+ ASSERT(refcount_is_zero(&exists->b_refcnt));
+ arc_change_state(arc_anon, exists, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(exists);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ }
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ } else if (callback->awcb_done == NULL) {
+ int destroy_hdr;
+ /*
+ * This is an anonymous buffer with no user callback,
+ * destroy it if there are no active references.
+ */
+ mutex_enter(&arc_eviction_mtx);
+ destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ mutex_exit(&arc_eviction_mtx);
+ if (destroy_hdr)
+ arc_hdr_destroy(hdr);
+ } else {
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ }
+
+ if (callback->awcb_done) {
+ ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ callback->awcb_done(zio, buf, callback->awcb_private);
+ }
+
+ kmem_free(callback, sizeof (arc_write_callback_t));
+}
+
+zio_t *
+arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
+ uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
+ int flags, zbookmark_t *zb)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ arc_write_callback_t *callback;
+ zio_t *zio;
+
+ /* this is a private buffer - no locking required */
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+ ASSERT(BUF_EMPTY(hdr));
+ ASSERT(!HDR_IO_ERROR(hdr));
+ ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
+ ASSERT(hdr->b_acb == 0);
+ callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+ callback->awcb_ready = ready;
+ callback->awcb_done = done;
+ callback->awcb_private = private;
+ callback->awcb_buf = buf;
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
+ zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
+ buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
+ priority, flags, zb);
+
+ return (zio);
+}
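The write path mirrors arc_read(): the caller hands in a filled anonymous buffer and gets back a zio it can wait on or fire and forget. A minimal synchronous sketch; my_write_done and my_state are placeholders, and the checksum, compress, ncopies, priority and flags values are assumed to come from the caller:

zio_t *wzio;

wzio = arc_write(pio, spa, checksum, compress, ncopies, txg, bp, buf,
    NULL, my_write_done, my_state, priority, flags, &zb);
(void) zio_wait(wzio);	/* or zio_nowait(wzio) for async completion */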
+
+int
+arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private, uint32_t arc_flags)
+{
+ arc_buf_hdr_t *ab;
+ kmutex_t *hash_lock;
+ zio_t *zio;
+
+ /*
+ * If this buffer is in the cache, release it, so it
+ * can be re-used.
+ */
+ ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ if (ab != NULL) {
+ /*
+ * The checksum of blocks to free is not always
+ * preserved (eg. on the deadlist). However, if it is
+ * nonzero, it should match what we have in the cache.
+ */
+ ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
+ ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
+ if (ab->b_state != arc_anon)
+ arc_change_state(arc_anon, ab, hash_lock);
+ if (HDR_IO_IN_PROGRESS(ab)) {
+ /*
+ * This should only happen when we prefetch.
+ */
+ ASSERT(ab->b_flags & ARC_PREFETCH);
+ ASSERT3U(ab->b_datacnt, ==, 1);
+ ab->b_flags |= ARC_FREED_IN_READ;
+ if (HDR_IN_HASH_TABLE(ab))
+ buf_hash_remove(ab);
+ ab->b_arc_access = 0;
+ bzero(&ab->b_dva, sizeof (dva_t));
+ ab->b_birth = 0;
+ ab->b_cksum0 = 0;
+ ab->b_buf->b_efunc = NULL;
+ ab->b_buf->b_private = NULL;
+ mutex_exit(hash_lock);
+ } else if (refcount_is_zero(&ab->b_refcnt)) {
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(ab);
+ ARCSTAT_BUMP(arcstat_deleted);
+ } else {
+ /*
+ * We still have an active reference on this
+ * buffer. This can happen, e.g., from
+ * dbuf_unoverride().
+ */
+ ASSERT(!HDR_IN_HASH_TABLE(ab));
+ ab->b_arc_access = 0;
+ bzero(&ab->b_dva, sizeof (dva_t));
+ ab->b_birth = 0;
+ ab->b_cksum0 = 0;
+ ab->b_buf->b_efunc = NULL;
+ ab->b_buf->b_private = NULL;
+ mutex_exit(hash_lock);
+ }
+ }
+
+ zio = zio_free(pio, spa, txg, bp, done, private);
+
+ if (arc_flags & ARC_WAIT)
+ return (zio_wait(zio));
+
+ ASSERT(arc_flags & ARC_NOWAIT);
+ zio_nowait(zio);
+
+ return (0);
+}
+
+void
+arc_tempreserve_clear(uint64_t tempreserve)
+{
+ atomic_add_64(&arc_tempreserve, -tempreserve);
+ ASSERT((int64_t)arc_tempreserve >= 0);
+}
+
+int
+arc_tempreserve_space(uint64_t tempreserve)
+{
+#ifdef ZFS_DEBUG
+ /*
+ * Once in a while, fail for no reason. Everything should cope.
+ */
+ if (spa_get_random(10000) == 0) {
+ dprintf("forcing random failure\n");
+ return (ERESTART);
+ }
+#endif
+ if (tempreserve > arc_c/4 && !arc_no_grow)
+ arc_c = MIN(arc_c_max, tempreserve * 4);
+ if (tempreserve > arc_c)
+ return (ENOMEM);
+
+ /*
+ * Throttle writes when the amount of dirty data in the cache
+ * gets too large. We try to keep the cache less than half full
+ * of dirty blocks so that our sync times don't grow too large.
+ * Note: if two requests come in concurrently, we might let them
+ * both succeed, when one of them should fail. Not a huge deal.
+ *
+ * XXX The limit should be adjusted dynamically to keep the time
+ * to sync a dataset fixed (around 1-5 seconds?).
+ */
+
+ if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
+ arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
+ dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
+ "tempreserve=%lluK arc_c=%lluK\n",
+ arc_tempreserve>>10, arc_anon->arcs_lsize>>10,
+ tempreserve>>10, arc_c>>10);
+ return (ERESTART);
+ }
+ atomic_add_64(&arc_tempreserve, tempreserve);
+ return (0);
+}
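Callers are expected to pair each reservation with a matching clear once the dirty data has been handed off, and to back off and retry on ERESTART. A minimal sketch, with the dirty-data generation left as a placeholder:

static int
dirty_with_reservation(uint64_t towrite)
{
	int err;

	if ((err = arc_tempreserve_space(towrite)) != 0)
		return (err);	/* ERESTART or ENOMEM: caller backs off */

	/* ... generate up to 'towrite' bytes of dirty data here ... */

	arc_tempreserve_clear(towrite);
	return (0);
}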
+
+#ifdef _KERNEL
+static eventhandler_tag zfs_event_lowmem = NULL;
+
+static void
+zfs_lowmem(void *arg __unused, int howto __unused)
+{
+
+ zfs_needfree = 1;
+ cv_signal(&arc_reclaim_thr_cv);
+ while (zfs_needfree)
+ tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5);
+}
+#endif
+
+void
+arc_init(void)
+{
+ mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+
+ /* Convert seconds to clock ticks */
+ arc_min_prefetch_lifespan = 1 * hz;
+
+ /* Start out with 1/8 of all memory */
+ arc_c = physmem * PAGESIZE / 8;
+#if 0
+#ifdef _KERNEL
+ /*
+ * On architectures where the physical memory can be larger
+ * than the addressable space (intel in 32-bit mode), we may
+ * need to limit the cache to 1/8 of VM size.
+ */
+ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+#endif
+#endif
+ /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
+ arc_c_min = MAX(arc_c / 4, 64<<20);
+ /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
+ if (arc_c * 8 >= 1<<30)
+ arc_c_max = (arc_c * 8) - (1<<30);
+ else
+ arc_c_max = arc_c_min;
+ arc_c_max = MAX(arc_c * 6, arc_c_max);
+#ifdef notyet
+ /*
+ * Allow the tunables to override our calculations if they are
+ * reasonable (i.e. over 64MB)
+ */
+ if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
+ arc_c_max = zfs_arc_max;
+ if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
+ arc_c_min = zfs_arc_min;
+#endif
+ arc_c = arc_c_max;
+ arc_p = (arc_c >> 1);
+
+ /* if kmem_flags are set, let's try to use less memory */
+ if (kmem_debugging())
+ arc_c = arc_c / 2;
+ if (arc_c < arc_c_min)
+ arc_c = arc_c_min;
+
+ arc_anon = &ARC_anon;
+ arc_mru = &ARC_mru;
+ arc_mru_ghost = &ARC_mru_ghost;
+ arc_mfu = &ARC_mfu;
+ arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_size = 0;
+
+ mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+
+ buf_init();
+
+ arc_thread_exit = 0;
+ arc_eviction_list = NULL;
+ mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
+ bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
+
+ arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+
+ if (arc_ksp != NULL) {
+ arc_ksp->ks_data = &arc_stats;
+ kstat_install(arc_ksp);
+ }
+
+ (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+
+#ifdef _KERNEL
+ zfs_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, zfs_lowmem, NULL,
+ EVENTHANDLER_PRI_FIRST);
+#endif
+
+ arc_dead = FALSE;
+}
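To make the sizing above concrete: on a machine with 4GB of physical memory, arc_c starts at 512MB, arc_c_min becomes MAX(128MB, 64MB) = 128MB, and since arc_c * 8 = 4GB exceeds 1GB, arc_c_max is 4GB - 1GB = 3GB (the subsequent MAX() with arc_c * 6 = 3GB leaves it unchanged); arc_c is then raised to arc_c_max and arc_p starts at half of that, 1.5GB.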
+
+void
+arc_fini(void)
+{
+ mutex_enter(&arc_reclaim_thr_lock);
+ arc_thread_exit = 1;
+ cv_signal(&arc_reclaim_thr_cv);
+ while (arc_thread_exit != 0)
+ cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
+ mutex_exit(&arc_reclaim_thr_lock);
+
+ arc_flush();
+
+ arc_dead = TRUE;
+
+ if (arc_ksp != NULL) {
+ kstat_delete(arc_ksp);
+ arc_ksp = NULL;
+ }
+
+ mutex_destroy(&arc_eviction_mtx);
+ mutex_destroy(&arc_reclaim_thr_lock);
+ cv_destroy(&arc_reclaim_thr_cv);
+
+ list_destroy(&arc_mru->arcs_list);
+ list_destroy(&arc_mru_ghost->arcs_list);
+ list_destroy(&arc_mfu->arcs_list);
+ list_destroy(&arc_mfu_ghost->arcs_list);
+
+ mutex_destroy(&arc_anon->arcs_mtx);
+ mutex_destroy(&arc_mru->arcs_mtx);
+ mutex_destroy(&arc_mru_ghost->arcs_mtx);
+ mutex_destroy(&arc_mfu->arcs_mtx);
+ mutex_destroy(&arc_mfu_ghost->arcs_mtx);
+
+ buf_fini();
+
+#ifdef _KERNEL
+ if (zfs_event_lowmem != NULL)
+ EVENTHANDLER_DEREGISTER(vm_lowmem, zfs_event_lowmem);
+#endif
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c
new file mode 100644
index 000000000000..4442b1f28ac8
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c
@@ -0,0 +1,312 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+static int
+bplist_hold(bplist_t *bpl)
+{
+ ASSERT(MUTEX_HELD(&bpl->bpl_lock));
+ if (bpl->bpl_dbuf == NULL) {
+ int err = dmu_bonus_hold(bpl->bpl_mos,
+ bpl->bpl_object, bpl, &bpl->bpl_dbuf);
+ if (err)
+ return (err);
+ bpl->bpl_phys = bpl->bpl_dbuf->db_data;
+ }
+ return (0);
+}
+
+uint64_t
+bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
+{
+ int size;
+
+ size = spa_version(dmu_objset_spa(mos)) < ZFS_VERSION_BPLIST_ACCOUNT ?
+ BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
+
+ return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
+ DMU_OT_BPLIST_HDR, size, tx));
+}
+
+void
+bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+{
+ VERIFY(dmu_object_free(mos, object, tx) == 0);
+}
+
+int
+bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
+{
+ dmu_object_info_t doi;
+ int err;
+
+ err = dmu_object_info(mos, object, &doi);
+ if (err)
+ return (err);
+
+ mutex_enter(&bpl->bpl_lock);
+
+ ASSERT(bpl->bpl_dbuf == NULL);
+ ASSERT(bpl->bpl_phys == NULL);
+ ASSERT(bpl->bpl_cached_dbuf == NULL);
+ ASSERT(bpl->bpl_queue == NULL);
+ ASSERT(object != 0);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
+
+ bpl->bpl_mos = mos;
+ bpl->bpl_object = object;
+ bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
+ bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
+ bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
+
+ mutex_exit(&bpl->bpl_lock);
+ return (0);
+}
+
+void
+bplist_close(bplist_t *bpl)
+{
+ mutex_enter(&bpl->bpl_lock);
+
+ ASSERT(bpl->bpl_queue == NULL);
+
+ if (bpl->bpl_cached_dbuf) {
+ dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
+ bpl->bpl_cached_dbuf = NULL;
+ }
+ if (bpl->bpl_dbuf) {
+ dmu_buf_rele(bpl->bpl_dbuf, bpl);
+ bpl->bpl_dbuf = NULL;
+ bpl->bpl_phys = NULL;
+ }
+
+ mutex_exit(&bpl->bpl_lock);
+}
+
+boolean_t
+bplist_empty(bplist_t *bpl)
+{
+ boolean_t rv;
+
+ if (bpl->bpl_object == 0)
+ return (B_TRUE);
+
+ mutex_enter(&bpl->bpl_lock);
+ VERIFY(0 == bplist_hold(bpl)); /* XXX */
+ rv = (bpl->bpl_phys->bpl_entries == 0);
+ mutex_exit(&bpl->bpl_lock);
+
+ return (rv);
+}
+
+static int
+bplist_cache(bplist_t *bpl, uint64_t blkid)
+{
+ int err = 0;
+
+ if (bpl->bpl_cached_dbuf == NULL ||
+ bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
+ if (bpl->bpl_cached_dbuf != NULL)
+ dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
+ err = dmu_buf_hold(bpl->bpl_mos,
+ bpl->bpl_object, blkid << bpl->bpl_blockshift,
+ bpl, &bpl->bpl_cached_dbuf);
+ ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
+ 1ULL << bpl->bpl_blockshift);
+ }
+ return (err);
+}
+
+int
+bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
+{
+ uint64_t blk, off;
+ blkptr_t *bparray;
+ int err;
+
+ mutex_enter(&bpl->bpl_lock);
+
+ err = bplist_hold(bpl);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ if (*itorp >= bpl->bpl_phys->bpl_entries) {
+ mutex_exit(&bpl->bpl_lock);
+ return (ENOENT);
+ }
+
+ blk = *itorp >> bpl->bpl_bpshift;
+ off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
+
+ err = bplist_cache(bpl, blk);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ bparray = bpl->bpl_cached_dbuf->db_data;
+ *bp = bparray[off];
+ (*itorp)++;
+ mutex_exit(&bpl->bpl_lock);
+ return (0);
+}
+
+int
+bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
+{
+ uint64_t blk, off;
+ blkptr_t *bparray;
+ int err;
+
+ ASSERT(!BP_IS_HOLE(bp));
+ mutex_enter(&bpl->bpl_lock);
+ err = bplist_hold(bpl);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
+ off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
+
+ err = bplist_cache(bpl, blk);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
+ bparray = bpl->bpl_cached_dbuf->db_data;
+ bparray[off] = *bp;
+
+ /* We never need the fill count. */
+ bparray[off].blk_fill = 0;
+
+ /* The bplist will compress better if we can leave off the checksum */
+ bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
+
+ dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
+ bpl->bpl_phys->bpl_entries++;
+ bpl->bpl_phys->bpl_bytes +=
+ bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
+ if (bpl->bpl_havecomp) {
+ bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
+ bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
+ }
+ mutex_exit(&bpl->bpl_lock);
+
+ return (0);
+}
+
+/*
+ * Deferred entry; will be written later by bplist_sync().
+ */
+void
+bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp)
+{
+ bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
+
+ ASSERT(!BP_IS_HOLE(bp));
+ mutex_enter(&bpl->bpl_lock);
+ bpq->bpq_blk = *bp;
+ bpq->bpq_next = bpl->bpl_queue;
+ bpl->bpl_queue = bpq;
+ mutex_exit(&bpl->bpl_lock);
+}
+
+void
+bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
+{
+ bplist_q_t *bpq;
+
+ mutex_enter(&bpl->bpl_lock);
+ while ((bpq = bpl->bpl_queue) != NULL) {
+ bpl->bpl_queue = bpq->bpq_next;
+ mutex_exit(&bpl->bpl_lock);
+ VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
+ kmem_free(bpq, sizeof (*bpq));
+ mutex_enter(&bpl->bpl_lock);
+ }
+ mutex_exit(&bpl->bpl_lock);
+}
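The deferred path is the open-context counterpart of bplist_enqueue(): callers queue block pointers without a transaction in hand, and the syncing context later flushes the queue. A minimal sketch of that pairing, where bpl, bp and tx are assumed to be the pool's deferred-free list, a block to free, and the syncing transaction:

/* open context: no transaction is required */
bplist_enqueue_deferred(bpl, bp);

/* later, in syncing context, write the queued entries out */
bplist_sync(bpl, tx);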
+
+void
+bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
+{
+ mutex_enter(&bpl->bpl_lock);
+ ASSERT3P(bpl->bpl_queue, ==, NULL);
+ VERIFY(0 == bplist_hold(bpl));
+ dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
+ VERIFY(0 == dmu_free_range(bpl->bpl_mos,
+ bpl->bpl_object, 0, -1ULL, tx));
+ bpl->bpl_phys->bpl_entries = 0;
+ bpl->bpl_phys->bpl_bytes = 0;
+ if (bpl->bpl_havecomp) {
+ bpl->bpl_phys->bpl_comp = 0;
+ bpl->bpl_phys->bpl_uncomp = 0;
+ }
+ mutex_exit(&bpl->bpl_lock);
+}
+
+int
+bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ uint64_t itor = 0, comp = 0, uncomp = 0;
+ int err;
+ blkptr_t bp;
+
+ mutex_enter(&bpl->bpl_lock);
+
+ err = bplist_hold(bpl);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ *usedp = bpl->bpl_phys->bpl_bytes;
+ if (bpl->bpl_havecomp) {
+ *compp = bpl->bpl_phys->bpl_comp;
+ *uncompp = bpl->bpl_phys->bpl_uncomp;
+ }
+ mutex_exit(&bpl->bpl_lock);
+
+ if (!bpl->bpl_havecomp) {
+ while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
+ comp += BP_GET_PSIZE(&bp);
+ uncomp += BP_GET_UCSIZE(&bp);
+ }
+ if (err == ENOENT)
+ err = 0;
+ *compp = comp;
+ *uncompp = uncomp;
+ }
+
+ return (err);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
new file mode 100644
index 000000000000..1fde66f6f2a1
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -0,0 +1,2240 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+
+static void dbuf_destroy(dmu_buf_impl_t *db);
+static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
+ int compress, dmu_tx_t *tx);
+static arc_done_func_t dbuf_write_ready;
+static arc_done_func_t dbuf_write_done;
+
+int zfs_mdcomp_disable = 0;
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
+ &zfs_mdcomp_disable, 0, "Disable metadata compression");
+
+/*
+ * Global data structures and functions for the dbuf cache.
+ */
+static kmem_cache_t *dbuf_cache;
+
+/* ARGSUSED */
+static int
+dbuf_cons(void *vdb, void *unused, int kmflag)
+{
+ dmu_buf_impl_t *db = vdb;
+ bzero(db, sizeof (dmu_buf_impl_t));
+
+ mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+ refcount_create(&db->db_holds);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dbuf_dest(void *vdb, void *unused)
+{
+ dmu_buf_impl_t *db = vdb;
+ mutex_destroy(&db->db_mtx);
+ cv_destroy(&db->db_changed);
+ refcount_destroy(&db->db_holds);
+}
+
+/*
+ * dbuf hash table routines
+ */
+static dbuf_hash_table_t dbuf_hash_table;
+
+static uint64_t dbuf_hash_count;
+
+static uint64_t
+dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
+{
+ uintptr_t osv = (uintptr_t)os;
+ uint64_t crc = -1ULL;
+
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
+
+ crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
+
+ return (crc);
+}
+
+#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
+
+#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
+ ((dbuf)->db.db_object == (obj) && \
+ (dbuf)->db_objset == (os) && \
+ (dbuf)->db_level == (level) && \
+ (dbuf)->db_blkid == (blkid))
+
+dmu_buf_impl_t *
+dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ objset_impl_t *os = dn->dn_objset;
+ uint64_t obj = dn->dn_object;
+ uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ uint64_t idx = hv & h->hash_table_mask;
+ dmu_buf_impl_t *db;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
+ if (DBUF_EQUAL(db, os, obj, level, blkid)) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (db);
+ }
+ mutex_exit(&db->db_mtx);
+ }
+ }
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ */
+static dmu_buf_impl_t *
+dbuf_hash_insert(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ objset_impl_t *os = db->db_objset;
+ uint64_t obj = db->db.db_object;
+ int level = db->db_level;
+ uint64_t blkid = db->db_blkid;
+ uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ uint64_t idx = hv & h->hash_table_mask;
+ dmu_buf_impl_t *dbf;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
+ if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
+ mutex_enter(&dbf->db_mtx);
+ if (dbf->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (dbf);
+ }
+ mutex_exit(&dbf->db_mtx);
+ }
+ }
+
+ mutex_enter(&db->db_mtx);
+ db->db_hash_next = h->hash_table[idx];
+ h->hash_table[idx] = db;
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_add_64(&dbuf_hash_count, 1);
+
+ return (NULL);
+}
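The intended usage is a check-and-insert race: the caller constructs a new dbuf, tries to insert it, and adopts the winner if someone else got there first. A simplified sketch of how dbuf_create() (later in this file) consumes the return value; the loser's cleanup is abbreviated here:

dmu_buf_impl_t *odb;

if ((odb = dbuf_hash_insert(db)) != NULL) {
	/* Lost the race: drop our copy and use the existing dbuf. */
	kmem_cache_free(dbuf_cache, db);	/* abbreviated cleanup */
	db = odb;
}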
+
+/*
+ * Remove an entry from the hash table. This operation will
+ * fail if there are any existing holds on the db.
+ */
+static void
+dbuf_hash_remove(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
+ db->db_level, db->db_blkid);
+ uint64_t idx = hv & h->hash_table_mask;
+ dmu_buf_impl_t *dbf, **dbp;
+
+ /*
+ * We must not hold db_mtx, to maintain lock ordering:
+ * DBUF_HASH_MUTEX > db_mtx.
+ */
+ ASSERT(refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_state == DB_EVICTING);
+ ASSERT(!MUTEX_HELD(&db->db_mtx));
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ dbp = &h->hash_table[idx];
+ while ((dbf = *dbp) != db) {
+ dbp = &dbf->db_hash_next;
+ ASSERT(dbf != NULL);
+ }
+ *dbp = db->db_hash_next;
+ db->db_hash_next = NULL;
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_add_64(&dbuf_hash_count, -1);
+}
+
+static arc_evict_func_t dbuf_do_evict;
+
+static void
+dbuf_evict_user(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_level != 0 || db->db_evict_func == NULL)
+ return;
+
+ if (db->db_user_data_ptr_ptr)
+ *db->db_user_data_ptr_ptr = db->db.db_data;
+ db->db_evict_func(&db->db, db->db_user_ptr);
+ db->db_user_ptr = NULL;
+ db->db_user_data_ptr_ptr = NULL;
+ db->db_evict_func = NULL;
+}
+
+void
+dbuf_evict(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db_data_pending == NULL);
+
+ dbuf_clear(db);
+ dbuf_destroy(db);
+}
+
+void
+dbuf_init(void)
+{
+ uint64_t hsize = 1ULL << 16;
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average 4K block size. The table will take up
+ * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
+ */
+ while (hsize * 4096 < physmem * PAGESIZE)
+ hsize <<= 1;
+
+retry:
+ h->hash_table_mask = hsize - 1;
+ h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+ if (h->hash_table == NULL) {
+ /* XXX - we should really return an error instead of assert */
+ ASSERT(hsize > (1ULL << 10));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
+ sizeof (dmu_buf_impl_t),
+ 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+}
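For example, with 8GB of physical memory the loop stops once hsize reaches 2^21 entries (2^21 * 4096 = 8GB), so the table consumes 2^21 * 8 = 16MB with 8-byte pointers, which matches the 2MB per GB figure quoted in the comment above.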
+
+void
+dbuf_fini(void)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_destroy(&h->hash_mutexes[i]);
+ kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+ kmem_cache_destroy(dbuf_cache);
+}
+
+/*
+ * Other stuff.
+ */
+
+#ifdef ZFS_DEBUG
+static void
+dbuf_verify(dmu_buf_impl_t *db)
+{
+ dnode_t *dn = db->db_dnode;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
+ return;
+
+ ASSERT(db->db_objset != NULL);
+ if (dn == NULL) {
+ ASSERT(db->db_parent == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ } else {
+ ASSERT3U(db->db.db_object, ==, dn->dn_object);
+ ASSERT3P(db->db_objset, ==, dn->dn_objset);
+ ASSERT3U(db->db_level, <, dn->dn_nlevels);
+ ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ list_head(&dn->dn_dbufs));
+ }
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
+ } else {
+ ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
+ }
+
+ if (db->db_level == 0) {
+ /* we can be momentarily larger in dnode_set_blksz() */
+ if (db->db_blkid != DB_BONUS_BLKID && dn) {
+ ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
+ }
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ /*
+ * it should only be modified in syncing
+ * context, so make sure we only have
+ * one copy of the data.
+ */
+ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
+ }
+ }
+
+ /* verify db->db_blkptr */
+ if (db->db_blkptr) {
+ if (db->db_parent == dn->dn_dbuf) {
+ /* db is pointed to by the dnode */
+ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
+ if (db->db.db_object == DMU_META_DNODE_OBJECT)
+ ASSERT(db->db_parent == NULL);
+ else
+ ASSERT(db->db_parent != NULL);
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ } else {
+ /* db is pointed to by an indirect block */
+ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
+ ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
+ ASSERT3U(db->db_parent->db.db_object, ==,
+ db->db.db_object);
+ /*
+ * dnode_grow_indblksz() can make this fail if we don't
+ * have the struct_rwlock. XXX indblksz no longer
+ * grows. safe to do this now?
+ */
+ if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+ ASSERT3P(db->db_blkptr, ==,
+ ((blkptr_t *)db->db_parent->db.db_data +
+ db->db_blkid % epb));
+ }
+ }
+ }
+ if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+ db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
+ db->db_state != DB_FILL && !dn->dn_free_txg) {
+ /*
+ * If the blkptr isn't set but they have nonzero data,
+ * it had better be dirty, otherwise we'll lose that
+ * data when we evict this buffer.
+ */
+ if (db->db_dirtycnt == 0) {
+ uint64_t *buf = db->db.db_data;
+ int i;
+
+ for (i = 0; i < db->db.db_size >> 3; i++) {
+ ASSERT(buf[i] == 0);
+ }
+ }
+ }
+}
+#endif
+
+static void
+dbuf_update_data(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ *db->db_user_data_ptr_ptr = db->db.db_data;
+ }
+}
+
+static void
+dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
+ db->db_buf = buf;
+ if (buf != NULL) {
+ ASSERT(buf->b_data != NULL);
+ db->db.db_data = buf->b_data;
+ if (!arc_released(buf))
+ arc_set_callback(buf, dbuf_do_evict, db);
+ dbuf_update_data(db);
+ } else {
+ dbuf_evict_user(db);
+ db->db.db_data = NULL;
+ db->db_state = DB_UNCACHED;
+ }
+}
+
+uint64_t
+dbuf_whichblock(dnode_t *dn, uint64_t offset)
+{
+ if (dn->dn_datablkshift) {
+ return (offset >> dn->dn_datablkshift);
+ } else {
+ ASSERT3U(offset, <, dn->dn_datablksz);
+ return (0);
+ }
+}
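For example, a dnode using 128KB data blocks has dn_datablkshift == 17, so an offset of 300000 bytes maps to block 300000 >> 17 = 2; in the single-block case (dn_datablkshift == 0) every valid offset maps to block 0, as the else branch shows.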
+
+static void
+dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+
+ mutex_enter(&db->db_mtx);
+ ASSERT3U(db->db_state, ==, DB_READ);
+ /*
+ * All reads are synchronous, so we must have a hold on the dbuf
+ */
+ ASSERT(refcount_count(&db->db_holds) > 0);
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ if (db->db_level == 0 && db->db_freed_in_flight) {
+ /* we were freed in flight; disregard any error */
+ arc_release(buf, db);
+ bzero(buf->b_data, db->db.db_size);
+ arc_buf_freeze(buf);
+ db->db_freed_in_flight = FALSE;
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ } else if (zio == NULL || zio->io_error == 0) {
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ } else {
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT3P(db->db_buf, ==, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ db->db_state = DB_UNCACHED;
+ }
+ cv_broadcast(&db->db_changed);
+ mutex_exit(&db->db_mtx);
+ dbuf_rele(db, NULL);
+}
+
+static void
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
+{
+ blkptr_t *bp;
+ zbookmark_t zb;
+ uint32_t aflags = ARC_NOWAIT;
+
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ /* We need the struct_rwlock to prevent db_blkptr from changing. */
+ ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_buf == NULL);
+
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
+ db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ if (db->db.db_size < DN_MAX_BONUSLEN)
+ bzero(db->db.db_data, DN_MAX_BONUSLEN);
+ bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
+ db->db.db_size);
+ dbuf_update_data(db);
+ db->db_state = DB_CACHED;
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
+ bp = NULL;
+ else
+ bp = db->db_blkptr;
+
+ if (bp == NULL)
+ dprintf_dbuf(db, "blkptr: %s\n", "NULL");
+ else
+ dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
+
+ if (bp == NULL || BP_IS_HOLE(bp)) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ ASSERT(bp == NULL || BP_IS_HOLE(bp));
+ dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ db->db.db_size, db, type));
+ bzero(db->db.db_data, db->db.db_size);
+ db->db_state = DB_CACHED;
+ *flags |= DB_RF_CACHED;
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ db->db_state = DB_READ;
+ mutex_exit(&db->db_mtx);
+
+ zb.zb_objset = db->db_objset->os_dsl_dataset ?
+ db->db_objset->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+
+ dbuf_add_ref(db, NULL);
+ /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
+ ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
+ (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
+ db->db_level > 0 ? byteswap_uint64_array :
+ dmu_ot[db->db_dnode->dn_type].ot_byteswap,
+ dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
+ (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+ &aflags, &zb);
+ if (aflags & ARC_CACHED)
+ *flags |= DB_RF_CACHED;
+}
+
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+ int err = 0;
+ int havepzio = (zio != NULL);
+ int prefetch;
+
+ /*
+ * We don't have to hold the mutex to check db_state because it
+ * can't be freed while we have a hold on the buffer.
+ */
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+
+ prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_CACHED) {
+ mutex_exit(&db->db_mtx);
+ if (prefetch)
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size, TRUE);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+ } else if (db->db_state == DB_UNCACHED) {
+ if (zio == NULL) {
+ zio = zio_root(db->db_dnode->dn_objset->os_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+ }
+ dbuf_read_impl(db, zio, &flags);
+
+ /* dbuf_read_impl has dropped db_mtx for us */
+
+ if (prefetch)
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size, flags & DB_RF_CACHED);
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+
+ if (!havepzio)
+ err = zio_wait(zio);
+ } else {
+ mutex_exit(&db->db_mtx);
+ if (prefetch)
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size, TRUE);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+
+ mutex_enter(&db->db_mtx);
+ if ((flags & DB_RF_NEVERWAIT) == 0) {
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL) {
+ ASSERT(db->db_state == DB_READ ||
+ (flags & DB_RF_HAVESTRUCT) == 0);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ }
+ if (db->db_state == DB_UNCACHED)
+ err = EIO;
+ }
+ mutex_exit(&db->db_mtx);
+ }
+
+ ASSERT(err || havepzio || db->db_state == DB_CACHED);
+ return (err);
+}
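For reference, the simplest caller pattern appears in dbuf_will_dirty() further down: no parent zio is passed, so dbuf_read() creates and waits on its own root zio. A minimal sketch for a dbuf that the caller already holds:

/* Errors are not expected with DB_RF_MUST_SUCCEED. */
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH);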
+
+static void
+dbuf_noread(dmu_buf_impl_t *db)
+{
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ || db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ db->db.db_size, db, type));
+ db->db_state = DB_FILL;
+ } else {
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ }
+ mutex_exit(&db->db_mtx);
+}
+
+/*
+ * This is our just-in-time copy function. It makes a copy of
+ * buffers, that have been modified in a previous transaction
+ * group, before we modify them in the current active group.
+ *
+ * This function is used in two places: when we are dirtying a
+ * buffer for the first time in a txg, and when we are freeing
+ * a range in a dnode that includes this buffer.
+ *
+ * Note that when we are called from dbuf_free_range() we do
+ * not put a hold on the buffer, we just traverse the active
+ * dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_level == 0);
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
+
+ if (dr == NULL ||
+ (dr->dt.dl.dr_data !=
+ ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ return;
+
+ /*
+ * If the last dirty record for this dbuf has not yet synced
+ * and it still references the dbuf data, either
+ * reset the reference to point to a new copy,
+ * or (if there are no active holders)
+ * just null out the current db_data pointer.
+ */
+ ASSERT(dr->dr_txg >= txg - 2);
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ /* Note that the data bufs here are zio_bufs */
+ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+ } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ int size = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ dr->dt.dl.dr_data = arc_buf_alloc(
+ db->db_dnode->dn_objset->os_spa, size, db, type);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+ } else {
+ dbuf_set_data(db, NULL);
+ }
+}
+
+void
+dbuf_unoverride(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ uint64_t txg = dr->dr_txg;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
+ ASSERT(db->db_level == 0);
+
+ if (db->db_blkid == DB_BONUS_BLKID ||
+ dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
+ return;
+
+ /* free this block */
+ if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
+ /* XXX can get silent EIO here */
+ (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+ txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
+ }
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ /*
+ * Release the already-written buffer, so we leave it in
+ * a consistent dirty state. Note that all callers are
+ * modifying the buffer, so they will immediately do
+ * another (redundant) arc_release(). Therefore, leave
+ * the buf thawed to save the effort of freezing &
+ * immediately re-thawing it.
+ */
+ arc_release(dr->dt.dl.dr_data, db);
+}
+
+void
+dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db, *db_next;
+ uint64_t txg = tx->tx_txg;
+
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+ db_next = list_next(&dn->dn_dbufs, db);
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ if (db->db_level != 0)
+ continue;
+ dprintf_dbuf(db, "found buf %s\n", "");
+ if (db->db_blkid < blkid ||
+ db->db_blkid >= blkid+nblks)
+ continue;
+
+ /* found a level 0 buffer in the range */
+ if (dbuf_undirty(db, tx))
+ continue;
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_UNCACHED ||
+ db->db_state == DB_EVICTING) {
+ ASSERT(db->db.db_data == NULL);
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+ /* will be handled in dbuf_read_done or dbuf_rele */
+ db->db_freed_in_flight = TRUE;
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (refcount_count(&db->db_holds) == 0) {
+ ASSERT(db->db_buf);
+ dbuf_clear(db);
+ continue;
+ }
+ /* The dbuf is referenced */
+
+ if (db->db_last_dirty != NULL) {
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ if (dr->dr_txg == txg) {
+ /*
+ * This buffer is "in-use", re-adjust the file
+ * size to reflect that this buffer may
+ * contain new data when we sync.
+ */
+ if (db->db_blkid > dn->dn_maxblkid)
+ dn->dn_maxblkid = db->db_blkid;
+ dbuf_unoverride(dr);
+ } else {
+ /*
+ * This dbuf is not dirty in the open context.
+ * Either uncache it (if it is not referenced in
+ * the open context) or reset its contents to
+ * empty.
+ */
+ dbuf_fix_old_data(db, txg);
+ }
+ }
+ /* clear the contents if it is cached */
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
+ arc_release(db->db_buf, db);
+ bzero(db->db.db_data, db->db.db_size);
+ arc_buf_freeze(db->db_buf);
+ }
+
+ mutex_exit(&db->db_mtx);
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+static int
+dbuf_new_block(dmu_buf_impl_t *db)
+{
+ dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
+ uint64_t birth_txg = 0;
+
+ /* Don't count meta-objects */
+ if (ds == NULL)
+ return (FALSE);
+
+ /*
+ * We don't need any locking to protect db_blkptr:
+ * If it's syncing, then db_last_dirty will be set
+ * so we'll ignore db_blkptr.
+ */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ /* If we have been dirtied since the last snapshot, it's not new */
+ if (db->db_last_dirty)
+ birth_txg = db->db_last_dirty->dr_txg;
+ else if (db->db_blkptr)
+ birth_txg = db->db_blkptr->blk_birth;
+
+ if (birth_txg)
+ return (!dsl_dataset_block_freeable(ds, birth_txg));
+ else
+ return (TRUE);
+}
+
+void
+dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
+{
+ arc_buf_t *buf, *obuf;
+ int osize = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+ /* XXX does *this* func really need the lock? */
+ ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+
+ /*
+ * This call to dbuf_will_dirty() with the dn_struct_rwlock held
+ * is OK, because there can be no other references to the db
+ * when we are changing its size, so no concurrent DB_FILL can
+ * be happening.
+ */
+ /*
+ * XXX we should be doing a dbuf_read, checking the return
+ * value and returning that up to our callers
+ */
+ dbuf_will_dirty(db, tx);
+
+ /* create the data buffer for the new block */
+ buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
+
+ /* copy old block data to the new block */
+ obuf = db->db_buf;
+ bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
+ /* zero the remainder */
+ if (size > osize)
+ bzero((uint8_t *)buf->b_data + osize, size - osize);
+
+ mutex_enter(&db->db_mtx);
+ dbuf_set_data(db, buf);
+ VERIFY(arc_buf_remove_ref(obuf, db) == 1);
+ db->db.db_size = size;
+
+ if (db->db_level == 0) {
+ ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+ db->db_last_dirty->dt.dl.dr_data = buf;
+ }
+ mutex_exit(&db->db_mtx);
+
+ dnode_willuse_space(db->db_dnode, size-osize, tx);
+}
+
+dbuf_dirty_record_t *
+dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ dbuf_dirty_record_t **drp, *dr;
+ int drop_struct_lock = FALSE;
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ DMU_TX_DIRTY_BUF(tx, db);
+
+ /*
+ * Shouldn't dirty a regular buffer in syncing context. Private
+ * objects may be dirtied in syncing context, but only if they
+ * were already pre-dirtied in open context.
+ * XXX We may want to prohibit dirtying in syncing context even
+ * if they did pre-dirty.
+ */
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
+ dn->dn_object == DMU_META_DNODE_OBJECT ||
+ dn->dn_objset->os_dsl_dataset == NULL ||
+ dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
+
+ /*
+ * We make this assert for private objects as well, but after we
+ * check if we're already dirty. They are allowed to re-dirty
+ * in syncing context.
+ */
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * XXX make this true for indirects too? The problem is that
+ * transactions created with dmu_tx_create_assigned() from
+ * syncing context don't bother holding ahead.
+ */
+ ASSERT(db->db_level != 0 ||
+ db->db_state == DB_CACHED || db->db_state == DB_FILL);
+
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * Don't set dirtyctx to SYNC if we're just modifying this as we
+ * initialize the objset.
+ */
+ if (dn->dn_dirtyctx == DN_UNDIRTIED &&
+ !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+ dn->dn_dirtyctx =
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
+ ASSERT(dn->dn_dirtyctx_firstset == NULL);
+ dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ /*
+ * If this buffer is already dirty, we're done.
+ */
+ drp = &db->db_last_dirty;
+ ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
+ db->db.db_object == DMU_META_DNODE_OBJECT);
+ while (*drp && (*drp)->dr_txg > tx->tx_txg)
+ drp = &(*drp)->dr_next;
+ if (*drp && (*drp)->dr_txg == tx->tx_txg) {
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ /*
+ * If this buffer has already been written out,
+ * we now need to reset its state.
+ */
+ dbuf_unoverride(*drp);
+ if (db->db.db_object != DMU_META_DNODE_OBJECT)
+ arc_buf_thaw(db->db_buf);
+ }
+ mutex_exit(&db->db_mtx);
+ return (*drp);
+ }
+
+ /*
+ * Only valid if not already dirty.
+ */
+ ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ ASSERT3U(dn->dn_nlevels, >, db->db_level);
+ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
+ dn->dn_phys->dn_nlevels > db->db_level ||
+ dn->dn_next_nlevels[txgoff] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
+
+ /*
+ * We should only be dirtying in syncing context if it's the
+ * mos, a spa os, or we're initializing the os. However, we are
+ * allowed to dirty in syncing context provided we already
+ * dirtied it in open context. Hence we must make this
+ * assertion only if we're not already dirty.
+ */
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ os->os_dsl_dataset == NULL ||
+ !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
+ !BP_IS_HOLE(os->os_rootbp));
+ ASSERT(db->db.db_size != 0);
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ /*
+ * If this buffer is dirty in an old transaction group we need
+ * to make a copy of it so that the changes we make in this
+ * transaction group won't leak out when we sync the older txg.
+ */
+ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+ if (db->db_level == 0) {
+ void *data_old = db->db_buf;
+
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db.db_data;
+ } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * Release the data buffer from the cache so that we
+ * can modify it without impacting possible other users
+ * of this cached data block. Note that indirect
+ * blocks and private objects are not released until the
+ * syncing state (since they are only modified then).
+ */
+ arc_release(db->db_buf, db);
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db_buf;
+ }
+ ASSERT(data_old != NULL);
+ dr->dt.dl.dr_data = data_old;
+ } else {
+ mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&dr->dt.di.dr_children,
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ }
+ dr->dr_dbuf = db;
+ dr->dr_txg = tx->tx_txg;
+ dr->dr_next = *drp;
+ *drp = dr;
+
+ /*
+ * We could have been freed_in_flight between the dbuf_noread
+ * and dbuf_dirty. We win, as though the dbuf_noread() had
+ * happened after the free.
+ */
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ dnode_clear_range(dn, db->db_blkid, 1, tx);
+ mutex_exit(&dn->dn_mtx);
+ db->db_freed_in_flight = FALSE;
+ }
+
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ /*
+ * Update the accounting.
+ */
+ if (!dbuf_new_block(db) && db->db_blkptr) {
+ /*
+ * This is only a guess -- if the dbuf is dirty
+ * in a previous txg, we don't know how much
+ * space it will use on disk yet. We should
+ * really have the struct_rwlock to access
+ * db_blkptr, but since this is just a guess,
+ * it's OK if we get an odd answer.
+ */
+ dnode_willuse_space(dn,
+ -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
+ }
+ dnode_willuse_space(dn, db->db.db_size, tx);
+ }
+
+ /*
+ * This buffer is now part of this txg
+ */
+ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
+ db->db_dirtycnt += 1;
+ ASSERT3U(db->db_dirtycnt, <=, 3);
+
+ mutex_exit(&db->db_mtx);
+
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ dnode_setdirty(dn, tx);
+ return (dr);
+ }
+
+ if (db->db_level == 0) {
+ dnode_new_blkid(dn, db->db_blkid, tx);
+ ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ }
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ if (db->db_level+1 < dn->dn_nlevels) {
+ dmu_buf_impl_t *parent = db->db_parent;
+ dbuf_dirty_record_t *di;
+ int parent_held = FALSE;
+
+ if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ parent = dbuf_hold_level(dn, db->db_level+1,
+ db->db_blkid >> epbs, FTAG);
+ parent_held = TRUE;
+ }
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ ASSERT3U(db->db_level+1, ==, parent->db_level);
+ di = dbuf_dirty(parent, tx);
+ if (parent_held)
+ dbuf_rele(parent, FTAG);
+
+ mutex_enter(&db->db_mtx);
+ /* possible race with dbuf_undirty() */
+ if (db->db_last_dirty == dr ||
+ dn->dn_object == DMU_META_DNODE_OBJECT) {
+ mutex_enter(&di->dt.di.dr_mtx);
+ ASSERT3U(di->dr_txg, ==, tx->tx_txg);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&di->dt.di.dr_children, dr);
+ mutex_exit(&di->dt.di.dr_mtx);
+ dr->dr_parent = di;
+ }
+ mutex_exit(&db->db_mtx);
+ } else {
+ ASSERT(db->db_level+1 == dn->dn_nlevels);
+ ASSERT(db->db_blkid < dn->dn_nblkptr);
+ ASSERT(db->db_parent == NULL ||
+ db->db_parent == db->db_dnode->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ dnode_setdirty(dn, tx);
+ return (dr);
+}
+
+static int
+dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn = db->db_dnode;
+ uint64_t txg = tx->tx_txg;
+ dbuf_dirty_record_t *dr;
+
+ ASSERT(txg != 0);
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+ mutex_enter(&db->db_mtx);
+
+ /*
+ * If this buffer is not dirty, we're done.
+ */
+ for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
+ if (dr->dr_txg <= txg)
+ break;
+ if (dr == NULL || dr->dr_txg < txg) {
+ mutex_exit(&db->db_mtx);
+ return (0);
+ }
+ ASSERT(dr->dr_txg == txg);
+
+ /*
+ * If this buffer is currently held, we cannot undirty
+ * it, since one of the current holders may be in the
+ * middle of an update. Note that users of dbuf_undirty()
+ * should not place a hold on the dbuf before the call.
+ */
+ if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ mutex_exit(&db->db_mtx);
+ /* Make sure we don't toss this buffer at sync phase */
+ mutex_enter(&dn->dn_mtx);
+ dnode_clear_range(dn, db->db_blkid, 1, tx);
+ mutex_exit(&dn->dn_mtx);
+ return (0);
+ }
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ ASSERT(db->db.db_size != 0);
+
+ /* XXX would be nice to fix up dn_towrite_space[] */
+
+ db->db_last_dirty = dr->dr_next;
+
+ if (dr->dr_parent) {
+ mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
+ list_remove(&dr->dr_parent->dt.di.dr_children, dr);
+ mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
+ } else if (db->db_level+1 == dn->dn_nlevels) {
+ ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (db->db_level == 0) {
+ dbuf_unoverride(dr);
+
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+ } else {
+ ASSERT(db->db_buf != NULL);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ /* XXX - mutex and list destroy? */
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+
+ if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
+ arc_buf_t *buf = db->db_buf;
+
+ ASSERT(arc_released(buf));
+ dbuf_set_data(db, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ dbuf_evict(db);
+ return (1);
+ }
+
+ mutex_exit(&db->db_mtx);
+ return (0);
+}
+
+#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
+void
+dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ int rf = DB_RF_MUST_SUCCEED;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
+ rf |= DB_RF_HAVESTRUCT;
+ (void) dbuf_read(db, NULL, rf);
+ (void) dbuf_dirty(db, tx);
+}
+
+void
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(db->db_level == 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
+ dmu_tx_private_ok(tx));
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_fill_done = dbuf_fill_done
+/* ARGSUSED */
+void
+dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+
+ if (db->db_state == DB_FILL) {
+ if (db->db_level == 0 && db->db_freed_in_flight) {
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ /* we were freed while filling */
+ /* XXX dbuf_undirty? */
+ bzero(db->db.db_data, db->db.db_size);
+ db->db_freed_in_flight = FALSE;
+ }
+ db->db_state = DB_CACHED;
+ cv_broadcast(&db->db_changed);
+ }
+ mutex_exit(&db->db_mtx);
+}
+
+/*
+ * "Clear" the contents of this dbuf. This will mark the dbuf
+ * EVICTING and clear *most* of its references. Unfortunately,
+ * when we are not holding the dn_dbufs_mtx, we can't clear the
+ * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
+ * in this case. For callers from the DMU we will usually see:
+ * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
+ * For the arc callback, we will usually see:
+ * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * Sometimes, though, we will get a mix of these two:
+ * DMU: dbuf_clear()->arc_buf_evict()
+ * ARC: dbuf_do_evict()->dbuf_destroy()
+ */
+void
+dbuf_clear(dmu_buf_impl_t *db)
+{
+ dnode_t *dn = db->db_dnode;
+ dmu_buf_impl_t *parent = db->db_parent;
+ dmu_buf_impl_t *dndb = dn->dn_dbuf;
+ int dbuf_gone = FALSE;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(refcount_is_zero(&db->db_holds));
+
+ dbuf_evict_user(db);
+
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
+ if (db->db_blkid == DB_BONUS_BLKID)
+ zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
+ db->db.db_data = NULL;
+ db->db_state = DB_UNCACHED;
+ }
+
+ ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ ASSERT(db->db_data_pending == NULL);
+
+ db->db_state = DB_EVICTING;
+ db->db_blkptr = NULL;
+
+ if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+ list_remove(&dn->dn_dbufs, db);
+ dnode_rele(dn, db);
+ }
+
+ if (db->db_buf)
+ dbuf_gone = arc_buf_evict(db->db_buf);
+
+ if (!dbuf_gone)
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * If this dbuf is referenced from an indirect dbuf,
+ * decrement the ref count on the indirect dbuf.
+ */
+ if (parent && parent != dndb)
+ dbuf_rele(parent, db);
+}
+
+static int
+dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
+ dmu_buf_impl_t **parentp, blkptr_t **bpp)
+{
+ int nlevels, epbs;
+
+ *parentp = NULL;
+ *bpp = NULL;
+
+ ASSERT(blkid != DB_BONUS_BLKID);
+
+ if (dn->dn_phys->dn_nlevels == 0)
+ nlevels = 1;
+ else
+ nlevels = dn->dn_phys->dn_nlevels;
+
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT3U(level * epbs, <, 64);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ if (level >= nlevels ||
+ (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
+ /* the buffer has no parent yet */
+ return (ENOENT);
+ } else if (level < nlevels-1) {
+ /* this block is referenced from an indirect block */
+ int err = dbuf_hold_impl(dn, level+1,
+ blkid >> epbs, fail_sparse, NULL, parentp);
+ if (err)
+ return (err);
+ err = dbuf_read(*parentp, NULL,
+ (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+ if (err) {
+ dbuf_rele(*parentp, NULL);
+ *parentp = NULL;
+ return (err);
+ }
+ *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+ (blkid & ((1ULL << epbs) - 1));
+ return (0);
+ } else {
+ /* the block is referenced from the dnode */
+ ASSERT3U(level, ==, nlevels-1);
+ ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
+ blkid < dn->dn_phys->dn_nblkptr);
+ if (dn->dn_dbuf) {
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ }
+ *bpp = &dn->dn_phys->dn_blkptr[blkid];
+ return (0);
+ }
+}
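+
+/*
+ * Illustrative note (added editorially, not in the original OpenSolaris
+ * source): with the common 16K indirect block size, dn_indblkshift is 14
+ * and SPA_BLKPTRSHIFT is 7 (a blkptr_t is 128 bytes), so epbs above is
+ * 14 - 7 = 7 and each indirect block holds 2^7 = 128 block pointers.
+ * For a level-0 block with blkid 300, dbuf_findbp() therefore holds the
+ * level-1 dbuf at blkid 300 >> 7 == 2 and returns the block pointer at
+ * slot 300 & 127 == 44 within it.
+ */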
+
+static dmu_buf_impl_t *
+dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
+ dmu_buf_impl_t *parent, blkptr_t *blkptr)
+{
+ objset_impl_t *os = dn->dn_objset;
+ dmu_buf_impl_t *db, *odb;
+
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+
+ db->db_objset = os;
+ db->db.db_object = dn->dn_object;
+ db->db_level = level;
+ db->db_blkid = blkid;
+ db->db_last_dirty = NULL;
+ db->db_dirtycnt = 0;
+ db->db_dnode = dn;
+ db->db_parent = parent;
+ db->db_blkptr = blkptr;
+
+ db->db_user_ptr = NULL;
+ db->db_user_data_ptr_ptr = NULL;
+ db->db_evict_func = NULL;
+ db->db_immediate_evict = 0;
+ db->db_freed_in_flight = 0;
+
+ if (blkid == DB_BONUS_BLKID) {
+ ASSERT3P(parent, ==, dn->dn_dbuf);
+ db->db.db_size = dn->dn_bonuslen;
+ db->db.db_offset = DB_BONUS_BLKID;
+ db->db_state = DB_UNCACHED;
+ /* the bonus dbuf is not placed in the hash table */
+ return (db);
+ } else {
+ int blocksize =
+ db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
+ db->db.db_size = blocksize;
+ db->db.db_offset = db->db_blkid * blocksize;
+ }
+
+ /*
+ * Hold the dn_dbufs_mtx while we get the new dbuf
+ * in the hash table *and* added to the dbufs list.
+ * This prevents a possible deadlock with someone
+ * trying to look up this dbuf before it's added to the
+ * dn_dbufs list.
+ */
+ mutex_enter(&dn->dn_dbufs_mtx);
+ db->db_state = DB_EVICTING;
+ if ((odb = dbuf_hash_insert(db)) != NULL) {
+ /* someone else inserted it first */
+ kmem_cache_free(dbuf_cache, db);
+ mutex_exit(&dn->dn_dbufs_mtx);
+ return (odb);
+ }
+ list_insert_head(&dn->dn_dbufs, db);
+ db->db_state = DB_UNCACHED;
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ if (parent && parent != dn->dn_dbuf)
+ dbuf_add_ref(parent, db);
+
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ refcount_count(&dn->dn_holds) > 0);
+ (void) refcount_add(&dn->dn_holds, db);
+
+ dprintf_dbuf(db, "db=%p\n", db);
+
+ return (db);
+}
+
+static int
+dbuf_do_evict(void *private)
+{
+ arc_buf_t *buf = private;
+ dmu_buf_impl_t *db = buf->b_private;
+
+ if (!MUTEX_HELD(&db->db_mtx))
+ mutex_enter(&db->db_mtx);
+
+ ASSERT(refcount_is_zero(&db->db_holds));
+
+ if (db->db_state != DB_EVICTING) {
+ ASSERT(db->db_state == DB_CACHED);
+ DBUF_VERIFY(db);
+ db->db_buf = NULL;
+ dbuf_evict(db);
+ } else {
+ mutex_exit(&db->db_mtx);
+ dbuf_destroy(db);
+ }
+ return (0);
+}
+
+static void
+dbuf_destroy(dmu_buf_impl_t *db)
+{
+ ASSERT(refcount_is_zero(&db->db_holds));
+
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ dnode_t *dn = db->db_dnode;
+
+ /*
+ * If this dbuf is still on the dn_dbufs list,
+ * remove it from that list.
+ */
+ if (list_link_active(&db->db_link)) {
+ mutex_enter(&dn->dn_dbufs_mtx);
+ list_remove(&dn->dn_dbufs, db);
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ dnode_rele(dn, db);
+ }
+ dbuf_hash_remove(db);
+ }
+ db->db_parent = NULL;
+ db->db_dnode = NULL;
+ db->db_buf = NULL;
+
+ ASSERT(db->db.db_data == NULL);
+ ASSERT(db->db_hash_next == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ ASSERT(db->db_data_pending == NULL);
+
+ kmem_cache_free(dbuf_cache, db);
+}
+
+void
+dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+{
+ dmu_buf_impl_t *db = NULL;
+ blkptr_t *bp = NULL;
+
+ ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+ if (dnode_block_freed(dn, blkid))
+ return;
+
+ /* dbuf_find() returns with db_mtx held */
+ if (db = dbuf_find(dn, 0, blkid)) {
+ if (refcount_count(&db->db_holds) > 0) {
+ /*
+ * This dbuf is active. We assume that it is
+ * already CACHED, or else about to be either
+ * read or filled.
+ */
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+ mutex_exit(&db->db_mtx);
+ db = NULL;
+ }
+
+ if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
+ if (bp && !BP_IS_HOLE(bp)) {
+ uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
+ zbookmark_t zb;
+ zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
+ dn->dn_objset->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = dn->dn_object;
+ zb.zb_level = 0;
+ zb.zb_blkid = blkid;
+
+ (void) arc_read(NULL, dn->dn_objset->os_spa, bp,
+ dmu_ot[dn->dn_type].ot_byteswap,
+ NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &zb);
+ }
+ if (db)
+ dbuf_rele(db, NULL);
+ }
+}
+
+/*
+ * Returns with db_holds incremented, and db_mtx not held.
+ * Note: dn_struct_rwlock must be held.
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+ void *tag, dmu_buf_impl_t **dbp)
+{
+ dmu_buf_impl_t *db, *parent = NULL;
+
+ ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT3U(dn->dn_nlevels, >, level);
+
+ *dbp = NULL;
+top:
+ /* dbuf_find() returns with db_mtx held */
+ db = dbuf_find(dn, level, blkid);
+
+ if (db == NULL) {
+ blkptr_t *bp = NULL;
+ int err;
+
+ ASSERT3P(parent, ==, NULL);
+ err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
+ if (fail_sparse) {
+ if (err == 0 && bp && BP_IS_HOLE(bp))
+ err = ENOENT;
+ if (err) {
+ if (parent)
+ dbuf_rele(parent, NULL);
+ return (err);
+ }
+ }
+ if (err && err != ENOENT)
+ return (err);
+ db = dbuf_create(dn, level, blkid, parent, bp);
+ }
+
+ if (db->db_buf && refcount_is_zero(&db->db_holds)) {
+ arc_buf_add_ref(db->db_buf, db);
+ if (db->db_buf->b_data == NULL) {
+ dbuf_clear(db);
+ if (parent) {
+ dbuf_rele(parent, NULL);
+ parent = NULL;
+ }
+ goto top;
+ }
+ ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+ }
+
+ ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
+
+ /*
+ * If this buffer is currently syncing out, and we are
+ * still referencing it from db_data, we need to make a copy
+ * of it in case we decide we want to dirty it again in this txg.
+ */
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
+ db->db_state == DB_CACHED && db->db_data_pending) {
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+
+ if (dr->dt.dl.dr_data == db->db_buf) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ dbuf_set_data(db,
+ arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ db->db.db_size, db, type));
+ bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
+ db->db.db_size);
+ }
+ }
+
+ (void) refcount_add(&db->db_holds, tag);
+ dbuf_update_data(db);
+ DBUF_VERIFY(db);
+ mutex_exit(&db->db_mtx);
+
+ /* NOTE: we can't rele the parent until after we drop the db_mtx */
+ if (parent)
+ dbuf_rele(parent, NULL);
+
+ ASSERT3P(db->db_dnode, ==, dn);
+ ASSERT3U(db->db_blkid, ==, blkid);
+ ASSERT3U(db->db_level, ==, level);
+ *dbp = db;
+
+ return (0);
+}
+
+dmu_buf_impl_t *
+dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
+{
+ dmu_buf_impl_t *db;
+ int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
+ return (err ? NULL : db);
+}
+
+dmu_buf_impl_t *
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+{
+ dmu_buf_impl_t *db;
+ int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+ return (err ? NULL : db);
+}
+
+dmu_buf_impl_t *
+dbuf_create_bonus(dnode_t *dn)
+{
+ dmu_buf_impl_t *db = dn->dn_bonus;
+
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ ASSERT(dn->dn_bonus == NULL);
+ db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
+ return (db);
+}
+
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
+void
+dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+{
+ int64_t holds = refcount_add(&db->db_holds, tag);
+ ASSERT(holds > 1);
+}
+
+#pragma weak dmu_buf_rele = dbuf_rele
+void
+dbuf_rele(dmu_buf_impl_t *db, void *tag)
+{
+ int64_t holds;
+
+ mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+
+ holds = refcount_remove(&db->db_holds, tag);
+ ASSERT(holds >= 0);
+
+ /*
+ * We can't freeze indirects if there is a possibility that they
+ * may be modified in the current syncing context.
+ */
+ if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
+ arc_buf_freeze(db->db_buf);
+
+ if (holds == db->db_dirtycnt &&
+ db->db_level == 0 && db->db_immediate_evict)
+ dbuf_evict_user(db);
+
+ if (holds == 0) {
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ mutex_exit(&db->db_mtx);
+ dnode_rele(db->db_dnode, db);
+ } else if (db->db_buf == NULL) {
+ /*
+ * This is a special case: we never associated this
+ * dbuf with any data allocated from the ARC.
+ */
+ ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ dbuf_evict(db);
+ } else if (arc_released(db->db_buf)) {
+ arc_buf_t *buf = db->db_buf;
+ /*
+ * This dbuf has anonymous data associated with it.
+ */
+ dbuf_set_data(db, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ dbuf_evict(db);
+ } else {
+ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
+ mutex_exit(&db->db_mtx);
+ }
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+}
+
+#pragma weak dmu_buf_refcount = dbuf_refcount
+uint64_t
+dbuf_refcount(dmu_buf_impl_t *db)
+{
+ return (refcount_count(&db->db_holds));
+}
+
+void *
+dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *evict_func)
+{
+ return (dmu_buf_update_user(db_fake, NULL, user_ptr,
+ user_data_ptr_ptr, evict_func));
+}
+
+void *
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *evict_func)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_immediate_evict = TRUE;
+ return (dmu_buf_update_user(db_fake, NULL, user_ptr,
+ user_data_ptr_ptr, evict_func));
+}
+
+void *
+dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
+ void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT(db->db_level == 0);
+
+ ASSERT((user_ptr == NULL) == (evict_func == NULL));
+
+ mutex_enter(&db->db_mtx);
+
+ if (db->db_user_ptr == old_user_ptr) {
+ db->db_user_ptr = user_ptr;
+ db->db_user_data_ptr_ptr = user_data_ptr_ptr;
+ db->db_evict_func = evict_func;
+
+ dbuf_update_data(db);
+ } else {
+ old_user_ptr = db->db_user_ptr;
+ }
+
+ mutex_exit(&db->db_mtx);
+ return (old_user_ptr);
+}
+
+void *
+dmu_buf_get_user(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ return (db->db_user_ptr);
+}
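+
+/*
+ * Illustrative sketch (added editorially; my_node_t, n_phys and
+ * my_node_evict are hypothetical names, not part of this file): a consumer
+ * that caches an in-core structure on a held dbuf typically does
+ *
+ *	my_node_t *node = kmem_zalloc(sizeof (my_node_t), KM_SLEEP);
+ *	if (dmu_buf_set_user(db, node, &node->n_phys, my_node_evict) != NULL) {
+ *		(lost the race: another thread attached user data first)
+ *		kmem_free(node, sizeof (my_node_t));
+ *		node = dmu_buf_get_user(db);
+ *	}
+ *
+ * my_node_evict() is later called from dbuf_evict_user() when the dbuf is
+ * evicted, and is responsible for tearing down the cached structure.
+ */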
+
+static void
+dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
+{
+ /* ASSERT(dmu_tx_is_syncing(tx) */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_blkptr != NULL)
+ return;
+
+ if (db->db_level == dn->dn_phys->dn_nlevels-1) {
+ /*
+ * This buffer was allocated at a time when there were
+ * no available blkptrs from the dnode, or it was
+ * inappropriate to hook it in (i.e., an nlevels mismatch).
+ */
+ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
+ ASSERT(db->db_parent == NULL);
+ db->db_parent = dn->dn_dbuf;
+ db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+ DBUF_VERIFY(db);
+ } else {
+ dmu_buf_impl_t *parent = db->db_parent;
+ int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT(dn->dn_phys->dn_nlevels > 1);
+ if (parent == NULL) {
+ mutex_exit(&db->db_mtx);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ (void) dbuf_hold_impl(dn, db->db_level+1,
+ db->db_blkid >> epbs, FALSE, db, &parent);
+ rw_exit(&dn->dn_struct_rwlock);
+ mutex_enter(&db->db_mtx);
+ db->db_parent = parent;
+ }
+ db->db_blkptr = (blkptr_t *)parent->db.db_data +
+ (db->db_blkid & ((1ULL << epbs) - 1));
+ DBUF_VERIFY(db);
+ }
+}
+
+static void
+dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = db->db_dnode;
+ zio_t *zio;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+
+ ASSERT(db->db_level > 0);
+ DBUF_VERIFY(db);
+
+ if (db->db_buf == NULL) {
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ mutex_enter(&db->db_mtx);
+ }
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ ASSERT(db->db_buf != NULL);
+
+ dbuf_check_blkptr(dn, db);
+
+ db->db_data_pending = dr;
+
+ arc_release(db->db_buf, db);
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
+ zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);
+
+ zio = dr->dr_zio;
+ mutex_enter(&dr->dt.di.dr_mtx);
+ dbuf_sync_list(&dr->dt.di.dr_children, tx);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ mutex_exit(&dr->dt.di.dr_mtx);
+ zio_nowait(zio);
+}
+
+static void
+dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ arc_buf_t **datap = &dr->dt.dl.dr_data;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ uint64_t txg = tx->tx_txg;
+ int checksum, compress;
+ int blksz;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * To be synced, we must be dirtied. But we
+ * might have been freed after the dirty.
+ */
+ if (db->db_state == DB_UNCACHED) {
+ /* This buffer has been freed since it was dirtied */
+ ASSERT(db->db.db_data == NULL);
+ } else if (db->db_state == DB_FILL) {
+ /* This buffer was freed and is now being re-filled */
+ ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+ } else {
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ }
+ DBUF_VERIFY(db);
+
+ /*
+ * If this is a bonus buffer, simply copy the bonus data into the
+ * dnode. It will be written out when the dnode is synced (and it
+ * will be synced, since it must have been dirty for dbuf_sync to
+ * be called).
+ */
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ dbuf_dirty_record_t **drp;
+ /*
+ * Use dn_phys->dn_bonuslen since db.db_size is the length
+ * of the bonus buffer in the open transaction rather than
+ * the syncing transaction.
+ */
+ ASSERT(*datap != NULL);
+ ASSERT3U(db->db_level, ==, 0);
+ ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ if (*datap != db->db.db_data)
+ zio_buf_free(*datap, DN_MAX_BONUSLEN);
+ db->db_data_pending = NULL;
+ drp = &db->db_last_dirty;
+ while (*drp != dr)
+ drp = &(*drp)->dr_next;
+ ASSERT((*drp)->dr_next == NULL);
+ *drp = NULL;
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ mutex_exit(&db->db_mtx);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+ return;
+ }
+
+ /*
+ * If this buffer is in the middle of an immediate write,
+ * wait for the synchronous IO to complete.
+ */
+ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
+ }
+
+ dbuf_check_blkptr(dn, db);
+
+ /*
+ * If this dbuf has already been written out via an immediate write,
+ * just complete the write by copying over the new block pointer and
+ * updating the accounting via the write-completion functions.
+ */
+ if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ zio_t zio_fake;
+
+ zio_fake.io_private = &db;
+ zio_fake.io_error = 0;
+ zio_fake.io_bp = db->db_blkptr;
+ zio_fake.io_bp_orig = *db->db_blkptr;
+ zio_fake.io_txg = txg;
+
+ *db->db_blkptr = dr->dt.dl.dr_overridden_by;
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ db->db_data_pending = dr;
+ dr->dr_zio = &zio_fake;
+ mutex_exit(&db->db_mtx);
+
+ if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ &zio_fake.io_bp_orig, dn->dn_zio, tx);
+
+ dbuf_write_ready(&zio_fake, db->db_buf, db);
+ dbuf_write_done(&zio_fake, db->db_buf, db);
+
+ return;
+ }
+
+ blksz = arc_buf_size(*datap);
+
+ if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * If this buffer is currently "in use" (i.e., there are
+ * active holds and db_data still references it), then make
+ * a copy before we start the write so that any modifications
+ * from the open txg will not leak into this write.
+ *
+ * NOTE: this copy does not need to be made for objects only
+ * modified in the syncing context (e.g. DMU_OT_DNODE blocks).
+ */
+ if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
+ bcopy(db->db.db_data, (*datap)->b_data, blksz);
+ }
+ } else {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ arc_release(db->db_buf, db);
+ }
+
+ ASSERT(*datap != NULL);
+ db->db_data_pending = dr;
+
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * Allow dnode settings to override objset settings,
+ * except for metadata checksums.
+ */
+ if (dmu_ot[dn->dn_type].ot_metadata) {
+ checksum = os->os_md_checksum;
+ compress = zio_compress_select(dn->dn_compress,
+ os->os_md_compress);
+ } else {
+ checksum = zio_checksum_select(dn->dn_checksum,
+ os->os_checksum);
+ compress = zio_compress_select(dn->dn_compress,
+ os->os_compress);
+ }
+
+ dbuf_write(dr, *datap, checksum, compress, tx);
+
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ if (dn->dn_object == DMU_META_DNODE_OBJECT)
+ list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
+ else
+ zio_nowait(dr->dr_zio);
+}
+
+void
+dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr;
+
+ while (dr = list_head(list)) {
+ if (dr->dr_zio != NULL) {
+ /*
+ * If we find an already initialized zio then we
+ * are processing the meta-dnode, and we have finished.
+ * The dbufs for all dnodes are put back on the list
+ * during processing, so that we can zio_wait()
+ * these IOs after initiating all child IOs.
+ */
+ ASSERT3U(dr->dr_dbuf->db.db_object, ==,
+ DMU_META_DNODE_OBJECT);
+ break;
+ }
+ list_remove(list, dr);
+ if (dr->dr_dbuf->db_level > 0)
+ dbuf_sync_indirect(dr, tx);
+ else
+ dbuf_sync_leaf(dr, tx);
+ }
+}
+
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
+ int compress, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ dmu_buf_impl_t *parent = db->db_parent;
+ uint64_t txg = tx->tx_txg;
+ zbookmark_t zb;
+ zio_t *zio;
+ int zio_flags;
+
+ if (parent != dn->dn_dbuf) {
+ ASSERT(parent && parent->db_data_pending);
+ ASSERT(db->db_level == parent->db_level-1);
+ ASSERT(arc_released(parent->db_buf));
+ zio = parent->db_data_pending->dr_zio;
+ } else {
+ ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ zio = dn->dn_zio;
+ }
+
+ ASSERT(db->db_level == 0 || data == db->db_buf);
+ ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT(zio);
+
+ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+
+ zio_flags = ZIO_FLAG_MUSTSUCCEED;
+ if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
+ zio_flags |= ZIO_FLAG_METADATA;
+ if (BP_IS_OLDER(db->db_blkptr, txg))
+ dsl_dataset_block_kill(
+ os->os_dsl_dataset, db->db_blkptr, zio, tx);
+
+ dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
+ dmu_get_replication_level(os, &zb, dn->dn_type), txg,
+ db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ uint64_t fill = 0;
+ int old_size, new_size, i;
+
+ dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
+
+ old_size = bp_get_dasize(os->os_spa, bp_orig);
+ new_size = bp_get_dasize(os->os_spa, zio->io_bp);
+
+ dnode_diduse_space(dn, new_size-old_size);
+
+ if (BP_IS_HOLE(zio->io_bp)) {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ if (bp_orig->blk_birth == tx->tx_txg)
+ dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+ ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
+ return;
+ }
+
+ mutex_enter(&db->db_mtx);
+
+ if (db->db_level == 0) {
+ mutex_enter(&dn->dn_mtx);
+ if (db->db_blkid > dn->dn_phys->dn_maxblkid)
+ dn->dn_phys->dn_maxblkid = db->db_blkid;
+ mutex_exit(&dn->dn_mtx);
+
+ if (dn->dn_type == DMU_OT_DNODE) {
+ dnode_phys_t *dnp = db->db.db_data;
+ for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
+ i--, dnp++) {
+ if (dnp->dn_type != DMU_OT_NONE)
+ fill++;
+ }
+ } else {
+ fill = 1;
+ }
+ } else {
+ blkptr_t *bp = db->db.db_data;
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ ASSERT3U(BP_GET_LSIZE(bp), ==,
+ db->db_level == 1 ? dn->dn_datablksz :
+ (1<<dn->dn_phys->dn_indblkshift));
+ fill += bp->blk_fill;
+ }
+ }
+
+ db->db_blkptr->blk_fill = fill;
+ BP_SET_TYPE(db->db_blkptr, dn->dn_type);
+ BP_SET_LEVEL(db->db_blkptr, db->db_level);
+
+ mutex_exit(&db->db_mtx);
+
+ /* We must do this after we've set the bp's type and level */
+ if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ if (bp_orig->blk_birth == tx->tx_txg)
+ dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+ dsl_dataset_block_born(ds, zio->io_bp, tx);
+ }
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ uint64_t txg = zio->io_txg;
+ dbuf_dirty_record_t **drp, *dr;
+
+ ASSERT3U(zio->io_error, ==, 0);
+
+ mutex_enter(&db->db_mtx);
+
+ drp = &db->db_last_dirty;
+ while (*drp != db->db_data_pending)
+ drp = &(*drp)->dr_next;
+ ASSERT(!list_link_active(&(*drp)->dr_dirty_node));
+ ASSERT((*drp)->dr_txg == txg);
+ ASSERT((*drp)->dr_next == NULL);
+ dr = *drp;
+ *drp = NULL;
+
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+ else if (!BP_IS_HOLE(db->db_blkptr))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ else
+ ASSERT(arc_released(db->db_buf));
+ } else {
+ dnode_t *dn = db->db_dnode;
+
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ if (!BP_IS_HOLE(db->db_blkptr)) {
+ int epbs =
+ dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+ db->db.db_size);
+ ASSERT3U(dn->dn_phys->dn_maxblkid
+ >> (db->db_level * epbs), >=, db->db_blkid);
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ }
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+ cv_broadcast(&db->db_changed);
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ db->db_data_pending = NULL;
+ mutex_exit(&db->db_mtx);
+
+ dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
+
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c
new file mode 100644
index 000000000000..d3be6b4ff22e
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -0,0 +1,1029 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_prop.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+
+const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
+ { byteswap_uint8_array, TRUE, "unallocated" },
+ { zap_byteswap, TRUE, "object directory" },
+ { byteswap_uint64_array, TRUE, "object array" },
+ { byteswap_uint8_array, TRUE, "packed nvlist" },
+ { byteswap_uint64_array, TRUE, "packed nvlist size" },
+ { byteswap_uint64_array, TRUE, "bplist" },
+ { byteswap_uint64_array, TRUE, "bplist header" },
+ { byteswap_uint64_array, TRUE, "SPA space map header" },
+ { byteswap_uint64_array, TRUE, "SPA space map" },
+ { byteswap_uint64_array, TRUE, "ZIL intent log" },
+ { dnode_buf_byteswap, TRUE, "DMU dnode" },
+ { dmu_objset_byteswap, TRUE, "DMU objset" },
+ { byteswap_uint64_array, TRUE, "DSL directory" },
+ { zap_byteswap, TRUE, "DSL directory child map"},
+ { zap_byteswap, TRUE, "DSL dataset snap map" },
+ { zap_byteswap, TRUE, "DSL props" },
+ { byteswap_uint64_array, TRUE, "DSL dataset" },
+ { zfs_znode_byteswap, TRUE, "ZFS znode" },
+ { zfs_acl_byteswap, TRUE, "ZFS ACL" },
+ { byteswap_uint8_array, FALSE, "ZFS plain file" },
+ { zap_byteswap, TRUE, "ZFS directory" },
+ { zap_byteswap, TRUE, "ZFS master node" },
+ { zap_byteswap, TRUE, "ZFS delete queue" },
+ { byteswap_uint8_array, FALSE, "zvol object" },
+ { zap_byteswap, TRUE, "zvol prop" },
+ { byteswap_uint8_array, FALSE, "other uint8[]" },
+ { byteswap_uint64_array, FALSE, "other uint64[]" },
+ { zap_byteswap, TRUE, "other ZAP" },
+ { zap_byteswap, TRUE, "persistent error log" },
+ { byteswap_uint8_array, TRUE, "SPA history" },
+ { byteswap_uint64_array, TRUE, "SPA history offsets" },
+ { zap_byteswap, TRUE, "Pool properties" },
+};
+
+int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ dmu_buf_impl_t *db;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ blkid = dbuf_whichblock(dn, offset);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold(dn, blkid, tag);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db == NULL) {
+ err = EIO;
+ } else {
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ if (err) {
+ dbuf_rele(db, tag);
+ db = NULL;
+ }
+ }
+
+ dnode_rele(dn, FTAG);
+ *dbp = &db->db;
+ return (err);
+}
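+
+/*
+ * Illustrative sketch (added editorially, not part of the original source):
+ * a read-only consumer pairs dmu_buf_hold() with dmu_buf_rele() using the
+ * same tag, e.g.
+ *
+ *	dmu_buf_t *db;
+ *	if (dmu_buf_hold(os, object, offset, FTAG, &db) == 0) {
+ *		(inspect db->db_data, db->db_size, db->db_offset)
+ *		dmu_buf_rele(db, FTAG);
+ *	}
+ */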
+
+int
+dmu_bonus_max(void)
+{
+ return (DN_MAX_BONUSLEN);
+}
+
+/*
+ * returns ENOENT, EIO, or 0.
+ */
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn;
+ int err, count;
+ dmu_buf_impl_t *db;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dn->dn_bonus = dbuf_create_bonus(dn);
+ }
+ db = dn->dn_bonus;
+ rw_exit(&dn->dn_struct_rwlock);
+ mutex_enter(&db->db_mtx);
+ count = refcount_add(&db->db_holds, tag);
+ mutex_exit(&db->db_mtx);
+ if (count == 1)
+ dnode_add_ref(dn, db);
+ dnode_rele(dn, FTAG);
+
+ VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+
+ *dbp = &db->db;
+ return (0);
+}
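+
+/*
+ * Illustrative sketch (added editorially): callers that keep per-object
+ * metadata in the bonus buffer (for example, the ZPL's znode_phys_t) would
+ * typically do
+ *
+ *	dmu_buf_t *db;
+ *	if (dmu_bonus_hold(os, object, FTAG, &db) == 0) {
+ *		(interpret db->db_data as the bonus payload of db->db_size bytes)
+ *		dmu_buf_rele(db, FTAG);
+ *	}
+ */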
+
+/*
+ * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
+ * to take a held dnode rather than <os, object> -- the lookup is wasteful,
+ * and can induce severe lock contention when writing to several files
+ * whose dnodes are in the same block.
+ */
+static int
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dmu_buf_t **dbp;
+ uint64_t blkid, nblks, i;
+ uint32_t flags;
+ int err;
+ zio_t *zio;
+
+ ASSERT(length <= DMU_MAX_ACCESS);
+
+ flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
+ if (length > zfetch_array_rd_sz)
+ flags |= DB_RF_NOPREFETCH;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_datablkshift) {
+ int blkshift = dn->dn_datablkshift;
+ nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
+ P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+ } else {
+ if (offset + length > dn->dn_datablksz) {
+ zfs_panic_recover("zfs: accessing past end of object "
+ "%llx/%llx (size=%u access=%llu+%llu)",
+ (longlong_t)dn->dn_objset->
+ os_dsl_dataset->ds_object,
+ (longlong_t)dn->dn_object, dn->dn_datablksz,
+ (longlong_t)offset, (longlong_t)length);
+ return (EIO);
+ }
+ nblks = 1;
+ }
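+ /*
+ * Editorial worked example (not in the original source): with a 128K
+ * data block size (blkshift == 17), an access at offset 0 of length 200K
+ * gives P2ROUNDUP(204800, 131072) == 262144 and P2ALIGN(0, 131072) == 0,
+ * so nblks == 262144 >> 17 == 2 held dbufs.
+ */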
+ dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+
+ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
+ blkid = dbuf_whichblock(dn, offset);
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
+ if (db == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_rele_array(dbp, nblks, tag);
+ zio_nowait(zio);
+ return (EIO);
+ }
+ /* initiate async i/o */
+ if (read) {
+ rw_exit(&dn->dn_struct_rwlock);
+ (void) dbuf_read(db, zio, flags);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ }
+ dbp[i] = &db->db;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /* wait for async i/o */
+ err = zio_wait(zio);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+
+ /* wait for other io to complete */
+ if (read) {
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED)
+ err = EIO;
+ mutex_exit(&db->db_mtx);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+ }
+ }
+
+ *numbufsp = nblks;
+ *dbpp = dbp;
+ return (0);
+}
+
+static int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ int err;
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp);
+
+ return (err);
+}
+
+void
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
+{
+ int i;
+ dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
+
+ if (numbufs == 0)
+ return;
+
+ for (i = 0; i < numbufs; i++) {
+ if (dbp[i])
+ dbuf_rele(dbp[i], tag);
+ }
+
+ kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
+}
+
+void
+dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ int nblks, i, err;
+
+ if (zfs_prefetch_disable)
+ return;
+
+ if (len == 0) { /* they're interested in the bonus buffer */
+ dn = os->os->os_meta_dnode;
+
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
+ dbuf_prefetch(dn, blkid);
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+
+ /*
+ * XXX - Note, if the dnode for the requested object is not
+ * already cached, we will do a *synchronous* read in the
+ * dnode_hold() call. The same is true for any indirects.
+ */
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err != 0)
+ return;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_datablkshift) {
+ int blkshift = dn->dn_datablkshift;
+ nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
+ P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+ } else {
+ nblks = (offset < dn->dn_datablksz);
+ }
+
+ if (nblks != 0) {
+ blkid = dbuf_whichblock(dn, offset);
+ for (i = 0; i < nblks; i++)
+ dbuf_prefetch(dn, blkid+i);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+
+ dnode_rele(dn, FTAG);
+}
+
+int
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ ASSERT(offset < UINT64_MAX);
+ ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
+ dnode_free_range(dn, offset, size, tx);
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+int
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf)
+{
+ dnode_t *dn;
+ dmu_buf_t **dbp;
+ int numbufs, i, err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ /*
+ * Deal with odd block sizes, where there can't be data past the first
+ * block. If we ever do the tail block optimization, we will need to
+ * handle that here as well.
+ */
+ if (dn->dn_datablkshift == 0) {
+ int newsz = offset > dn->dn_datablksz ? 0 :
+ MIN(size, dn->dn_datablksz - offset);
+ bzero((char *)buf + newsz, size - newsz);
+ size = newsz;
+ }
+
+ while (size > 0) {
+ uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+ int err;
+
+ /*
+ * NB: we could do this block-at-a-time, but it's nice
+ * to be reading in parallel.
+ */
+ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
+ TRUE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ bcopy((char *)db->db_data + bufoff, buf, tocpy);
+
+ offset += tocpy;
+ size -= tocpy;
+ buf = (char *)buf + tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ }
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+void
+dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+
+ if (size == 0)
+ return;
+
+ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ bcopy(buf, (char *)db->db_data + bufoff, tocpy);
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ offset += tocpy;
+ size -= tocpy;
+ buf = (char *)buf + tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
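+
+/*
+ * Illustrative sketch (added editorially): dmu_write() must be called with
+ * an assigned transaction that covers the written range, roughly
+ *
+ *	dmu_tx_t *tx = dmu_tx_create(os);
+ *	dmu_tx_hold_write(tx, object, offset, size);
+ *	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
+ *		dmu_tx_abort(tx);
+ *		return (...);
+ *	}
+ *	dmu_write(os, object, offset, size, buf, tx);
+ *	dmu_tx_commit(tx);
+ */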
+
+#ifdef _KERNEL
+int
+dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i, err;
+
+ /*
+ * NB: we could do this block-at-a-time, but it's nice
+ * to be reading in parallel.
+ */
+ err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
+ &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = uio->uio_loffset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ err = uiomove((char *)db->db_data + bufoff, tocpy,
+ UIO_READ, uio);
+ if (err)
+ break;
+
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (err);
+}
+
+int
+dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+ int err = 0;
+
+ if (size == 0)
+ return (0);
+
+ err = dmu_buf_hold_array(os, object, uio->uio_loffset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = uio->uio_loffset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ /*
+ * XXX uiomove could block forever (e.g. nfs-backed
+ * pages). There needs to be a uiolockdown() function
+ * to lock the pages in memory, so that uiomove won't
+ * block.
+ */
+ err = uiomove((char *)db->db_data + bufoff, tocpy,
+ UIO_WRITE, uio);
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ if (err)
+ break;
+
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+
+#ifndef __FreeBSD__
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ page_t *pp, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy, copied, thiscpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+ caddr_t va;
+
+ ASSERT(size > 0);
+ ASSERT3U(db->db_size, >=, PAGESIZE);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+ ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
+ thiscpy = MIN(PAGESIZE, tocpy - copied);
+ va = ppmapin(pp, PROT_READ, (caddr_t)-1);
+ bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+ ppmapout(va);
+ pp = pp->p_next;
+ bufoff += PAGESIZE;
+ }
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ if (err)
+ break;
+
+ offset += tocpy;
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+#endif /* !__FreeBSD__ */
+#endif /* _KERNEL */
+
+typedef struct {
+ dbuf_dirty_record_t *dr;
+ dmu_sync_cb_t *done;
+ void *arg;
+} dmu_sync_arg_t;
+
+/* ARGSUSED */
+static void
+dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ dmu_sync_arg_t *in = varg;
+ dbuf_dirty_record_t *dr = in->dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dmu_sync_cb_t *done = in->done;
+
+ if (!BP_IS_HOLE(zio->io_bp)) {
+ zio->io_bp->blk_fill = 1;
+ BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
+ BP_SET_LEVEL(zio->io_bp, 0);
+ }
+
+ mutex_enter(&db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
+ dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
+ dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ cv_broadcast(&db->db_changed);
+ mutex_exit(&db->db_mtx);
+
+ if (done)
+ done(&(db->db), in->arg);
+
+ kmem_free(in, sizeof (dmu_sync_arg_t));
+}
+
+/*
+ * Intent log support: sync the block associated with db to disk.
+ * N.B. and XXX: the caller is responsible for making sure that the
+ * data isn't changing while dmu_sync() is writing it.
+ *
+ * Return values:
+ *
+ * EEXIST: this txg has already been synced, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * EALREADY: this block is already in the process of being synced.
+ * The caller should track its progress (somehow).
+ *
+ * EINPROGRESS: the IO has been initiated.
+ * The caller should log this blkptr in the callback.
+ *
+ * 0: completed. Sets *bp to the blkptr just written.
+ * The caller should log this blkptr immediately.
+ */
+int
+dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
+ blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ objset_impl_t *os = db->db_objset;
+ dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
+ tx_state_t *tx = &dp->dp_tx;
+ dbuf_dirty_record_t *dr;
+ dmu_sync_arg_t *in;
+ zbookmark_t zb;
+ zio_t *zio;
+ int zio_flags;
+ int err;
+
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT(txg != 0);
+
+ dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
+ txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
+
+ /*
+ * XXX - would be nice if we could do this without suspending...
+ */
+ txg_suspend(dp);
+
+ /*
+ * If this txg already synced, there's nothing to do.
+ */
+ if (txg <= tx->tx_synced_txg) {
+ txg_resume(dp);
+ /*
+ * If we're running ziltest, we need the blkptr regardless.
+ */
+ if (txg > spa_freeze_txg(dp->dp_spa)) {
+ /* if db_blkptr == NULL, this was an empty write */
+ if (db->db_blkptr)
+ *bp = *db->db_blkptr; /* structure assignment */
+ return (0);
+ }
+ return (EEXIST);
+ }
+
+ mutex_enter(&db->db_mtx);
+
+ if (txg == tx->tx_syncing_txg) {
+ while (db->db_data_pending) {
+ /*
+ * IO is in-progress. Wait for it to finish.
+ * XXX - would be nice to be able to somehow "attach"
+ * this zio to the parent zio passed in.
+ */
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (!db->db_data_pending &&
+ db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
+ /*
+ * IO was compressed away
+ */
+ *bp = *db->db_blkptr; /* structure assignment */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (0);
+ }
+ ASSERT(db->db_data_pending ||
+ (db->db_blkptr && db->db_blkptr->blk_birth == txg));
+ }
+
+ if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
+ /*
+ * IO is already completed.
+ */
+ *bp = *db->db_blkptr; /* structure assignment */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (0);
+ }
+ }
+
+ dr = db->db_last_dirty;
+ while (dr && dr->dr_txg > txg)
+ dr = dr->dr_next;
+ if (dr == NULL || dr->dr_txg < txg) {
+ /*
+ * This dbuf isn't dirty, must have been free_range'd.
+ * There's no need to log writes to freed blocks, so we're done.
+ */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (ENOENT);
+ }
+
+ ASSERT(dr->dr_txg == txg);
+ if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ /*
+ * We have already issued a sync write for this buffer.
+ */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (EALREADY);
+ } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * This buffer has already been synced. It could not
+ * have been dirtied since, or we would have cleared the state.
+ */
+ *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (0);
+ }
+
+ dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+ in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ in->dr = dr;
+ in->done = done;
+ in->arg = arg;
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+
+ zb.zb_objset = os->os_dsl_dataset->ds_object;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+ zio_flags = ZIO_FLAG_MUSTSUCCEED;
+ if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0)
+ zio_flags |= ZIO_FLAG_METADATA;
+ zio = arc_write(pio, os->os_spa,
+ zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
+ zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
+ dmu_get_replication_level(os, &zb, db->db_dnode->dn_type),
+ txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
+ ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb);
+
+ if (pio) {
+ zio_nowait(zio);
+ err = EINPROGRESS;
+ } else {
+ err = zio_wait(zio);
+ ASSERT(err == 0);
+ }
+ return (err);
+}
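+
+/*
+ * Illustrative sketch (added editorially): a ZIL-style caller dispatches on
+ * the dmu_sync() return values documented above, roughly
+ *
+ *	error = dmu_sync(pio, db, &lr->lr_blkptr, txg, my_done_cb, arg);
+ *	switch (error) {
+ *	case 0:			log lr_blkptr immediately
+ *	case EINPROGRESS:	log the blkptr from my_done_cb() when it fires
+ *	case EEXIST:
+ *	case ENOENT:		do not log the write
+ *	case EALREADY:		the block is already being synced; track it
+ *	}
+ *
+ * (lr, my_done_cb and arg are hypothetical names, not part of this file.)
+ */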
+
+int
+dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ err = dnode_set_blksz(dn, size, ibs, tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+void
+dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ /* XXX assumes dnode_hold will not get an i/o error */
+ (void) dnode_hold(os->os, object, FTAG, &dn);
+ ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+ dn->dn_checksum = checksum;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+void
+dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ /* XXX assumes dnode_hold will not get an i/o error */
+ (void) dnode_hold(os->os, object, FTAG, &dn);
+ ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+ dn->dn_compress = compress;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+int
+dmu_get_replication_level(objset_impl_t *os,
+ zbookmark_t *zb, dmu_object_type_t ot)
+{
+ int ncopies = os->os_copies;
+
+ /* If it's the mos, it should have max copies set. */
+ ASSERT(zb->zb_objset != 0 ||
+ ncopies == spa_max_replication(os->os_spa));
+
+ if (dmu_ot[ot].ot_metadata || zb->zb_level != 0)
+ ncopies++;
+ return (MIN(ncopies, spa_max_replication(os->os_spa)));
+}
+
+int
+dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
+{
+ dnode_t *dn;
+ int i, err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ /*
+ * Sync any current changes before
+ * we go trundling through the block pointers.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (list_link_active(&dn->dn_dirty_link[i]))
+ break;
+ }
+ if (i != TXG_SIZE) {
+ dnode_rele(dn, FTAG);
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ }
+
+ err = dnode_next_offset(dn, hole, off, 1, 1, 0);
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+void
+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ mutex_enter(&dn->dn_mtx);
+
+ doi->doi_data_block_size = dn->dn_datablksz;
+ doi->doi_metadata_block_size = dn->dn_indblkshift ?
+ 1ULL << dn->dn_indblkshift : 0;
+ doi->doi_indirection = dn->dn_nlevels;
+ doi->doi_checksum = dn->dn_checksum;
+ doi->doi_compress = dn->dn_compress;
+ doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
+ SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
+ doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
+ doi->doi_type = dn->dn_type;
+ doi->doi_bonus_size = dn->dn_bonuslen;
+ doi->doi_bonus_type = dn->dn_bonustype;
+
+ mutex_exit(&dn->dn_mtx);
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Get information on a DMU object.
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int
+dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
+{
+ dnode_t *dn;
+ int err = dnode_hold(os->os, object, FTAG, &dn);
+
+ if (err)
+ return (err);
+
+ if (doi != NULL)
+ dmu_object_info_from_dnode(dn, doi);
+
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+/*
+ * As above, but faster; can be used when you have a held dbuf in hand.
+ */
+void
+dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+{
+ dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
+}
+
+/*
+ * Faster still when you only care about the size.
+ * This is specifically optimized for zfs_getattr().
+ */
+void
+dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+
+ *blksize = dn->dn_datablksz;
+ /* add 1 for dnode space */
+ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
+ SPA_MINBLOCKSHIFT) + 1;
+}
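+
+/*
+ * Editorial worked example (not in the original source): for an object with
+ * DN_USED_BYTES == 3000, (3000 + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT
+ * is 3256 >> 9 == 6 half-KB blocks, plus 1 for the dnode itself, so
+ * *nblk512 == 7.
+ */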
+
+void
+byteswap_uint64_array(void *vbuf, size_t size)
+{
+ uint64_t *buf = vbuf;
+ size_t count = size >> 3;
+ int i;
+
+ ASSERT((size & 7) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_64(buf[i]);
+}
+
+void
+byteswap_uint32_array(void *vbuf, size_t size)
+{
+ uint32_t *buf = vbuf;
+ size_t count = size >> 2;
+ int i;
+
+ ASSERT((size & 3) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_32(buf[i]);
+}
+
+void
+byteswap_uint16_array(void *vbuf, size_t size)
+{
+ uint16_t *buf = vbuf;
+ size_t count = size >> 1;
+ int i;
+
+ ASSERT((size & 1) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_16(buf[i]);
+}
+
+/* ARGSUSED */
+void
+byteswap_uint8_array(void *vbuf, size_t size)
+{
+}
+
+void
+dmu_init(void)
+{
+ dbuf_init();
+ dnode_init();
+ arc_init();
+}
+
+void
+dmu_fini(void)
+{
+ arc_fini();
+ dnode_fini();
+ dbuf_fini();
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
new file mode 100644
index 000000000000..93168cc8901f
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -0,0 +1,160 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+
+uint64_t
+dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ objset_impl_t *osi = os->os;
+ uint64_t object;
+ uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
+ (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+ dnode_t *dn = NULL;
+ int restarted = B_FALSE;
+
+ mutex_enter(&osi->os_obj_lock);
+ for (;;) {
+ object = osi->os_obj_next;
+ /*
+ * Each time we polish off an L2 bp worth of dnodes
+ * (2^13 objects), move to another L2 bp that's still
+ * reasonably sparse (at most 1/4 full). Look from the
+ * beginning once, but after that keep looking from here.
+ * If we can't find one, just keep going from here.
+ */
+ if (P2PHASE(object, L2_dnode_count) == 0) {
+ uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
+ int error = dnode_next_offset(osi->os_meta_dnode,
+ B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ restarted = B_TRUE;
+ if (error == 0)
+ object = offset >> DNODE_SHIFT;
+ }
+ osi->os_obj_next = ++object;
+
+ /*
+ * XXX We should check for an i/o error here and return
+ * up to our caller. Actually we should pre-read it in
+ * dmu_tx_assign(), but there is currently no mechanism
+ * to do so.
+ */
+ (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE,
+ FTAG, &dn);
+ if (dn)
+ break;
+
+ if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
+ osi->os_obj_next = object - 1;
+ }
+
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ mutex_exit(&osi->os_obj_lock);
+
+ dmu_tx_add_new_object(tx, os, object);
+ return (object);
+}
+
+int
+dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
+ return (EBADF);
+
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
+ if (err)
+ return (err);
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ dmu_tx_add_new_object(tx, os, object);
+ return (0);
+}
+
+int
+dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
+ return (EBADF);
+
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err)
+ return (err);
+ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+int
+dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+ dnode_free(dn, tx);
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+int
+dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
+{
+ uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
+ int error;
+
+ error = dnode_next_offset(os->os->os_meta_dnode,
+ hole, &offset, 0, DNODES_PER_BLOCK, txg);
+
+ *objectp = offset >> DNODE_SHIFT;
+
+ return (error);
+}
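+
+/*
+ * Illustrative sketch (added editorially): walking every allocated object
+ * in an objset can be done with repeated calls, roughly
+ *
+ *	uint64_t obj = 0;
+ *	while (dmu_object_next(os, &obj, B_FALSE, 0) == 0) {
+ *		(obj is the number of the next allocated object)
+ *	}
+ */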
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
new file mode 100644
index 000000000000..07f8c864a460
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -0,0 +1,1034 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/zvol.h>
+#include <sys/dmu_tx.h>
+#include <sys/zio_checksum.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dmu_impl.h>
+
+
+spa_t *
+dmu_objset_spa(objset_t *os)
+{
+ return (os->os->os_spa);
+}
+
+zilog_t *
+dmu_objset_zil(objset_t *os)
+{
+ return (os->os->os_zil);
+}
+
+dsl_pool_t *
+dmu_objset_pool(objset_t *os)
+{
+ dsl_dataset_t *ds;
+
+ if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir)
+ return (ds->ds_dir->dd_pool);
+ else
+ return (spa_get_dsl(os->os->os_spa));
+}
+
+dsl_dataset_t *
+dmu_objset_ds(objset_t *os)
+{
+ return (os->os->os_dsl_dataset);
+}
+
+dmu_objset_type_t
+dmu_objset_type(objset_t *os)
+{
+ return (os->os->os_phys->os_type);
+}
+
+void
+dmu_objset_name(objset_t *os, char *buf)
+{
+ dsl_dataset_name(os->os->os_dsl_dataset, buf);
+}
+
+uint64_t
+dmu_objset_id(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+ return (ds ? ds->ds_object : 0);
+}
+
+static void
+checksum_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
+}
+
+static void
+compression_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval != ZIO_COMPRESS_INHERIT);
+
+ osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+}
+
+static void
+copies_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval > 0);
+ ASSERT(newval <= spa_max_replication(osi->os_spa));
+
+ osi->os_copies = newval;
+}
+
+void
+dmu_objset_byteswap(void *buf, size_t size)
+{
+ objset_phys_t *osp = buf;
+
+ ASSERT(size == sizeof (objset_phys_t));
+ dnode_byteswap(&osp->os_meta_dnode);
+ byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
+ osp->os_type = BSWAP_64(osp->os_type);
+}
+
+int
+dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ objset_impl_t **osip)
+{
+ objset_impl_t *winner, *osi;
+ int i, err, checksum;
+
+ osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
+ osi->os.os = osi;
+ osi->os_dsl_dataset = ds;
+ osi->os_spa = spa;
+ osi->os_rootbp = bp;
+ if (!BP_IS_HOLE(osi->os_rootbp)) {
+ uint32_t aflags = ARC_WAIT;
+ zbookmark_t zb;
+ zb.zb_objset = ds ? ds->ds_object : 0;
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = 0;
+
+ dprintf_bp(osi->os_rootbp, "reading %s", "");
+ err = arc_read(NULL, spa, osi->os_rootbp,
+ dmu_ot[DMU_OT_OBJSET].ot_byteswap,
+ arc_getbuf_func, &osi->os_phys_buf,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
+ if (err) {
+ kmem_free(osi, sizeof (objset_impl_t));
+ return (err);
+ }
+ osi->os_phys = osi->os_phys_buf->b_data;
+ arc_release(osi->os_phys_buf, &osi->os_phys_buf);
+ } else {
+ osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
+ &osi->os_phys_buf, ARC_BUFC_METADATA);
+ osi->os_phys = osi->os_phys_buf->b_data;
+ bzero(osi->os_phys, sizeof (objset_phys_t));
+ }
+
+ /*
+ * Note: the changed_cb will be called once before the register
+ * func returns, thus changing the checksum/compression from the
+ * default (fletcher2/off). Snapshots don't need to know, and
+ * registering would complicate clone promotion.
+ */
+ if (ds && ds->ds_phys->ds_num_children == 0) {
+ err = dsl_prop_register(ds, "checksum",
+ checksum_changed_cb, osi);
+ if (err == 0)
+ err = dsl_prop_register(ds, "compression",
+ compression_changed_cb, osi);
+ if (err == 0)
+ err = dsl_prop_register(ds, "copies",
+ copies_changed_cb, osi);
+ if (err) {
+ VERIFY(arc_buf_remove_ref(osi->os_phys_buf,
+ &osi->os_phys_buf) == 1);
+ kmem_free(osi, sizeof (objset_impl_t));
+ return (err);
+ }
+ } else if (ds == NULL) {
+ /* It's the meta-objset. */
+ osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ osi->os_compress = ZIO_COMPRESS_LZJB;
+ osi->os_copies = spa_max_replication(spa);
+ }
+
+ osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
+
+ /*
+ * Metadata always gets compressed and checksummed.
+ * If the data checksum is multi-bit correctable, and it's not
+ * a ZBT-style checksum, then it's suitable for metadata as well.
+ * Otherwise, the metadata checksum defaults to fletcher4.
+ */
+ checksum = osi->os_checksum;
+
+ if (zio_checksum_table[checksum].ci_correctable &&
+ !zio_checksum_table[checksum].ci_zbt)
+ osi->os_md_checksum = checksum;
+ else
+ osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ osi->os_md_compress = ZIO_COMPRESS_LZJB;
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]));
+ list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]));
+ }
+ list_create(&osi->os_dnodes, sizeof (dnode_t),
+ offsetof(dnode_t, dn_link));
+ list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ osi->os_meta_dnode = dnode_special_open(osi,
+ &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+
+ if (ds != NULL) {
+ winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict);
+ if (winner) {
+ dmu_objset_evict(ds, osi);
+ osi = winner;
+ }
+ }
+
+ *osip = osi;
+ return (0);
+}
+
+/* called from zpl */
+int
+dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp)
+{
+ dsl_dataset_t *ds;
+ int err;
+ objset_t *os;
+ objset_impl_t *osi;
+
+ os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
+ err = dsl_dataset_open(name, mode, os, &ds);
+ if (err) {
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
+
+ osi = dsl_dataset_get_user_ptr(ds);
+ if (osi == NULL) {
+ err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
+ ds, &ds->ds_phys->ds_bp, &osi);
+ if (err) {
+ dsl_dataset_close(ds, mode, os);
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
+ }
+
+ os->os = osi;
+ os->os_mode = mode;
+
+ if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) {
+ dmu_objset_close(os);
+ return (EINVAL);
+ }
+ *osp = os;
+ return (0);
+}
+
+void
+dmu_objset_close(objset_t *os)
+{
+ dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os);
+ kmem_free(os, sizeof (objset_t));
+}
+
+int
+dmu_objset_evict_dbufs(objset_t *os, int try)
+{
+ objset_impl_t *osi = os->os;
+ dnode_t *dn;
+
+ mutex_enter(&osi->os_lock);
+
+ /* process the mdn last, since the other dnodes have holds on it */
+ list_remove(&osi->os_dnodes, osi->os_meta_dnode);
+ list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode);
+
+ /*
+ * Find the first dnode with holds. We have to do this dance
+ * because dnode_add_ref() only works if you already have a
+	 * hold.  If a dnode has no holds, it has no dbufs, so it is
+	 * safe to skip it.
+ */
+ for (dn = list_head(&osi->os_dnodes);
+ dn && refcount_is_zero(&dn->dn_holds);
+ dn = list_next(&osi->os_dnodes, dn))
+ continue;
+ if (dn)
+ dnode_add_ref(dn, FTAG);
+
+ while (dn) {
+ dnode_t *next_dn = dn;
+
+ do {
+ next_dn = list_next(&osi->os_dnodes, next_dn);
+ } while (next_dn && refcount_is_zero(&next_dn->dn_holds));
+ if (next_dn)
+ dnode_add_ref(next_dn, FTAG);
+
+ mutex_exit(&osi->os_lock);
+ if (dnode_evict_dbufs(dn, try)) {
+ dnode_rele(dn, FTAG);
+ if (next_dn)
+ dnode_rele(next_dn, FTAG);
+ return (1);
+ }
+ dnode_rele(dn, FTAG);
+ mutex_enter(&osi->os_lock);
+ dn = next_dn;
+ }
+ mutex_exit(&osi->os_lock);
+ return (0);
+}
+
+void
+dmu_objset_evict(dsl_dataset_t *ds, void *arg)
+{
+ objset_impl_t *osi = arg;
+ objset_t os;
+ int i;
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
+ ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
+ }
+
+ if (ds && ds->ds_phys->ds_num_children == 0) {
+ VERIFY(0 == dsl_prop_unregister(ds, "checksum",
+ checksum_changed_cb, osi));
+ VERIFY(0 == dsl_prop_unregister(ds, "compression",
+ compression_changed_cb, osi));
+ VERIFY(0 == dsl_prop_unregister(ds, "copies",
+ copies_changed_cb, osi));
+ }
+
+ /*
+ * We should need only a single pass over the dnode list, since
+ * nothing can be added to the list at this point.
+ */
+ os.os = osi;
+ (void) dmu_objset_evict_dbufs(&os, 0);
+
+ ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
+ ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
+ ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
+
+ dnode_special_close(osi->os_meta_dnode);
+ zil_free(osi->os_zil);
+
+ VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
+ mutex_destroy(&osi->os_lock);
+ mutex_destroy(&osi->os_obj_lock);
+ kmem_free(osi, sizeof (objset_impl_t));
+}
+
+/* called from dsl for meta-objset */
+objset_impl_t *
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_objset_type_t type, dmu_tx_t *tx)
+{
+ objset_impl_t *osi;
+ dnode_t *mdn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi));
+ mdn = osi->os_meta_dnode;
+
+ dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
+ DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
+
+ /*
+ * We don't want to have to increase the meta-dnode's nlevels
+	 * later, because then we could do it in quiescing context while
+ * we are also accessing it in open context.
+ *
+ * This precaution is not necessary for the MOS (ds == NULL),
+ * because the MOS is only updated in syncing context.
+ * This is most fortunate: the MOS is the only objset that
+ * needs to be synced multiple times as spa_sync() iterates
+ * to convergence, so minimizing its dn_nlevels matters.
+ */
+ if (ds != NULL) {
+ int levels = 1;
+
+ /*
+ * Determine the number of levels necessary for the meta-dnode
+ * to contain DN_MAX_OBJECT dnodes.
+ */
+ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
+ (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
+ DN_MAX_OBJECT * sizeof (dnode_phys_t))
+ levels++;
+
+ mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
+ mdn->dn_nlevels = levels;
+ }
+
+ ASSERT(type != DMU_OST_NONE);
+ ASSERT(type != DMU_OST_ANY);
+ ASSERT(type < DMU_OST_NUMTYPES);
+ osi->os_phys->os_type = type;
+
+ dsl_dataset_dirty(ds, tx);
+
+ return (osi);
+}
+
+struct oscarg {
+ void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
+ void *userarg;
+ dsl_dataset_t *clone_parent;
+ const char *lastname;
+ dmu_objset_type_t type;
+};
+
+/* ARGSUSED */
+static int
+dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct oscarg *oa = arg2;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ int err;
+ uint64_t ddobj;
+
+ err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
+ oa->lastname, sizeof (uint64_t), 1, &ddobj);
+ if (err != ENOENT)
+ return (err ? err : EEXIST);
+
+ if (oa->clone_parent != NULL) {
+ /*
+ * You can't clone across pools.
+ */
+ if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool)
+ return (EXDEV);
+
+ /*
+ * You can only clone snapshots, not the head datasets.
+ */
+ if (oa->clone_parent->ds_phys->ds_num_children == 0)
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static void
+dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct oscarg *oa = arg2;
+ dsl_dataset_t *ds;
+ blkptr_t *bp;
+ uint64_t dsobj;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dsobj = dsl_dataset_create_sync(dd, oa->lastname,
+ oa->clone_parent, tx);
+
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
+ DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds));
+ bp = dsl_dataset_get_blkptr(ds);
+ if (BP_IS_HOLE(bp)) {
+ objset_impl_t *osi;
+
+ /* This is an empty dmu_objset; not a clone. */
+ osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
+ ds, bp, oa->type, tx);
+
+ if (oa->userfunc)
+ oa->userfunc(&osi->os, oa->userarg, tx);
+ }
+ dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+}
+
+int
+dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent,
+ void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg)
+{
+ dsl_dir_t *pdd;
+ const char *tail;
+ int err = 0;
+ struct oscarg oa = { 0 };
+
+ ASSERT(strchr(name, '@') == NULL);
+ err = dsl_dir_open(name, FTAG, &pdd, &tail);
+ if (err)
+ return (err);
+ if (tail == NULL) {
+ dsl_dir_close(pdd, FTAG);
+ return (EEXIST);
+ }
+
+ dprintf("name=%s\n", name);
+
+ oa.userfunc = func;
+ oa.userarg = arg;
+ oa.lastname = tail;
+ oa.type = type;
+ if (clone_parent != NULL) {
+ /*
+ * You can't clone to a different type.
+ */
+ if (clone_parent->os->os_phys->os_type != type) {
+ dsl_dir_close(pdd, FTAG);
+ return (EINVAL);
+ }
+ oa.clone_parent = clone_parent->os->os_dsl_dataset;
+ }
+ err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
+ dmu_objset_create_sync, pdd, &oa, 5);
+ dsl_dir_close(pdd, FTAG);
+ return (err);
+}
+
+int
+dmu_objset_destroy(const char *name)
+{
+ objset_t *os;
+ int error;
+
+ /*
+ * If it looks like we'll be able to destroy it, and there's
+ * an unplayed replay log sitting around, destroy the log.
+ * It would be nicer to do this in dsl_dataset_destroy_sync(),
+ * but the replay log objset is modified in open context.
+ */
+ error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
+ if (error == 0) {
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
+ dmu_objset_close(os);
+ }
+
+ return (dsl_dataset_destroy(name));
+}
+
+int
+dmu_objset_rollback(const char *name)
+{
+ int err;
+ objset_t *os;
+
+ err = dmu_objset_open(name, DMU_OST_ANY,
+ DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
+ if (err == 0) {
+ err = zil_suspend(dmu_objset_zil(os));
+ if (err == 0)
+ zil_resume(dmu_objset_zil(os));
+ if (err == 0) {
+ /* XXX uncache everything? */
+ err = dsl_dataset_rollback(os->os->os_dsl_dataset);
+ }
+ dmu_objset_close(os);
+ }
+ return (err);
+}
+
+struct snaparg {
+ dsl_sync_task_group_t *dstg;
+ char *snapname;
+ char failed[MAXPATHLEN];
+};
+
+static int
+dmu_objset_snapshot_one(char *name, void *arg)
+{
+ struct snaparg *sn = arg;
+ objset_t *os;
+ dmu_objset_stats_t stat;
+ int err;
+
+ (void) strcpy(sn->failed, name);
+
+ err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os);
+ if (err != 0)
+ return (err);
+
+ /*
+ * If the objset is in an inconsistent state, return busy.
+ */
+ dmu_objset_fast_stat(os, &stat);
+ if (stat.dds_inconsistent) {
+ dmu_objset_close(os);
+ return (EBUSY);
+ }
+
+ /*
+ * NB: we need to wait for all in-flight changes to get to disk,
+ * so that we snapshot those changes. zil_suspend does this as
+ * a side effect.
+ */
+ err = zil_suspend(dmu_objset_zil(os));
+ if (err == 0) {
+ dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check,
+ dsl_dataset_snapshot_sync, os, sn->snapname, 3);
+ } else {
+ dmu_objset_close(os);
+ }
+
+ return (err);
+}
+
+int
+dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
+{
+ dsl_sync_task_t *dst;
+ struct snaparg sn = { 0 };
+ char *cp;
+ spa_t *spa;
+ int err;
+
+ (void) strcpy(sn.failed, fsname);
+
+ cp = strchr(fsname, '/');
+ if (cp) {
+ *cp = '\0';
+ err = spa_open(fsname, &spa, FTAG);
+ *cp = '/';
+ } else {
+ err = spa_open(fsname, &spa, FTAG);
+ }
+ if (err)
+ return (err);
+
+ sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+ sn.snapname = snapname;
+
+ if (recursive) {
+ err = dmu_objset_find(fsname,
+ dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
+ } else {
+ err = dmu_objset_snapshot_one(fsname, &sn);
+ }
+
+ if (err)
+ goto out;
+
+ err = dsl_sync_task_group_wait(sn.dstg);
+
+ for (dst = list_head(&sn.dstg->dstg_tasks); dst;
+ dst = list_next(&sn.dstg->dstg_tasks, dst)) {
+ objset_t *os = dst->dst_arg1;
+ if (dst->dst_err)
+ dmu_objset_name(os, sn.failed);
+ zil_resume(dmu_objset_zil(os));
+ dmu_objset_close(os);
+ }
+out:
+ if (err)
+ (void) strcpy(fsname, sn.failed);
+ dsl_sync_task_group_destroy(sn.dstg);
+ spa_close(spa, FTAG);
+ return (err);
+}
+
+static void
+dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ while (dn = list_head(list)) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ ASSERT(dn->dn_dbuf->db_data_pending);
+ /*
+ * Initialize dn_zio outside dnode_sync()
+		 * to accommodate the meta-dnode
+ */
+ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
+ ASSERT(dn->dn_zio);
+
+ ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
+ list_remove(list, dn);
+ dnode_sync(dn, tx);
+ }
+}
+
+/* ARGSUSED */
+static void
+ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ objset_impl_t *os = arg;
+ blkptr_t *bp = os->os_rootbp;
+ dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
+ int i;
+
+ /*
+ * Update rootbp fill count.
+ */
+ bp->blk_fill = 1; /* count the meta-dnode */
+ for (i = 0; i < dnp->dn_nblkptr; i++)
+ bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+}
+
+/* ARGSUSED */
+static void
+killer(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ objset_impl_t *os = arg;
+
+ ASSERT3U(zio->io_error, ==, 0);
+
+ BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
+ BP_SET_LEVEL(zio->io_bp, 0);
+
+ if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
+ BP_IDENTITY(&zio->io_bp_orig))) {
+ if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ &zio->io_bp_orig, NULL, os->os_synctx);
+ dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
+ os->os_synctx);
+ }
+ arc_release(os->os_phys_buf, &os->os_phys_buf);
+}
+
+/* called from dsl */
+void
+dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
+{
+ int txgoff;
+ zbookmark_t zb;
+ zio_t *zio;
+ list_t *list;
+ dbuf_dirty_record_t *dr;
+ int zio_flags;
+
+ dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* XXX the write_done callback should really give us the tx... */
+ os->os_synctx = tx;
+
+ if (os->os_dsl_dataset == NULL) {
+ /*
+ * This is the MOS. If we have upgraded,
+ * spa_max_replication() could change, so reset
+ * os_copies here.
+ */
+ os->os_copies = spa_max_replication(os->os_spa);
+ }
+
+ /*
+ * Create the root block IO
+ */
+ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = 0;
+ zio_flags = ZIO_FLAG_MUSTSUCCEED;
+ if (dmu_ot[DMU_OT_OBJSET].ot_metadata || zb.zb_level != 0)
+ zio_flags |= ZIO_FLAG_METADATA;
+ if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg))
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ os->os_rootbp, pio, tx);
+ zio = arc_write(pio, os->os_spa, os->os_md_checksum,
+ os->os_md_compress,
+ dmu_get_replication_level(os, &zb, DMU_OT_OBJSET),
+ tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os,
+ ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
+
+ /*
+ * Sync meta-dnode - the parent IO for the sync is the root block
+ */
+ os->os_meta_dnode->dn_zio = zio;
+ dnode_sync(os->os_meta_dnode, tx);
+
+ txgoff = tx->tx_txg & TXG_MASK;
+
+ dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx);
+ dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx);
+
+ list = &os->os_meta_dnode->dn_dirty_records[txgoff];
+ while (dr = list_head(list)) {
+ ASSERT(dr->dr_dbuf->db_level == 0);
+ list_remove(list, dr);
+ if (dr->dr_zio)
+ zio_nowait(dr->dr_zio);
+ }
+ /*
+ * Free intent log blocks up to this tx.
+ */
+ zil_sync(os->os_zil, tx);
+ zio_nowait(zio);
+}
+
+void
+dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+ dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp,
+ usedobjsp, availobjsp);
+}
+
+uint64_t
+dmu_objset_fsid_guid(objset_t *os)
+{
+ return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset));
+}
+
+void
+dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
+{
+ stat->dds_type = os->os->os_phys->os_type;
+ if (os->os->os_dsl_dataset)
+ dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat);
+}
+
+void
+dmu_objset_stats(objset_t *os, nvlist_t *nv)
+{
+ ASSERT(os->os->os_dsl_dataset ||
+ os->os->os_phys->os_type == DMU_OST_META);
+
+ if (os->os->os_dsl_dataset != NULL)
+ dsl_dataset_stats(os->os->os_dsl_dataset, nv);
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
+ os->os->os_phys->os_type);
+}
+
+int
+dmu_objset_is_snapshot(objset_t *os)
+{
+ if (os->os->os_dsl_dataset != NULL)
+ return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset));
+ else
+ return (B_FALSE);
+}
+
+int
+dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp)
+{
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+
+ if (ds->ds_phys->ds_snapnames_zapobj == 0)
+ return (ENOENT);
+
+ zap_cursor_init_serialized(&cursor,
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, *offp);
+
+ if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+ zap_cursor_fini(&cursor);
+ return (ENOENT);
+ }
+
+ if (strlen(attr.za_name) + 1 > namelen) {
+ zap_cursor_fini(&cursor);
+ return (ENAMETOOLONG);
+ }
+
+ (void) strcpy(name, attr.za_name);
+ if (idp)
+ *idp = attr.za_first_integer;
+ zap_cursor_advance(&cursor);
+ *offp = zap_cursor_serialize(&cursor);
+ zap_cursor_fini(&cursor);
+
+ return (0);
+}
+
+int
+dmu_dir_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp)
+{
+ dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+
+ /* there is no next dir on a snapshot! */
+ if (os->os->os_dsl_dataset->ds_object !=
+ dd->dd_phys->dd_head_dataset_obj)
+ return (ENOENT);
+
+ zap_cursor_init_serialized(&cursor,
+ dd->dd_pool->dp_meta_objset,
+ dd->dd_phys->dd_child_dir_zapobj, *offp);
+
+ if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+ zap_cursor_fini(&cursor);
+ return (ENOENT);
+ }
+
+ if (strlen(attr.za_name) + 1 > namelen) {
+ zap_cursor_fini(&cursor);
+ return (ENAMETOOLONG);
+ }
+
+ (void) strcpy(name, attr.za_name);
+ if (idp)
+ *idp = attr.za_first_integer;
+ zap_cursor_advance(&cursor);
+ *offp = zap_cursor_serialize(&cursor);
+ zap_cursor_fini(&cursor);
+
+ return (0);
+}
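[Editorial aside, not part of the patch] Both listing routines above return one name per call and hand back a serialized ZAP cursor through *offp, so a caller can resume exactly where it left off on the next call. A minimal, illustrative-only consumer of dmu_snapshot_list_next(); kernel context is assumed and consume_snapshot() is hypothetical.

static int
list_snapshots(objset_t *os)
{
	char name[MAXNAMELEN];
	uint64_t id, cookie = 0;
	int err;

	while ((err = dmu_snapshot_list_next(os, sizeof (name), name,
	    &id, &cookie)) == 0)
		consume_snapshot(name, id);	/* hypothetical callback */

	/* ENOENT from the cursor means "no more entries" */
	return (err == ENOENT ? 0 : err);
}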
+
+/*
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ */
+int
+dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
+{
+ dsl_dir_t *dd;
+ objset_t *os;
+ uint64_t snapobj;
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ char *child;
+ int do_self, err;
+
+ err = dsl_dir_open(name, FTAG, &dd, NULL);
+ if (err)
+ return (err);
+
+ /* NB: the $MOS dir doesn't have a head dataset */
+ do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
+
+ /*
+ * Iterate over all children.
+ */
+ if (flags & DS_FIND_CHILDREN) {
+ for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
+ dd->dd_phys->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT(attr.za_integer_length == sizeof (uint64_t));
+ ASSERT(attr.za_num_integers == 1);
+
+ /*
+			 * Compose the child's full name: the parent's
+			 * name, a separating '/', then the entry name.
+ */
+ child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ /* XXX could probably just use name here */
+ dsl_dir_name(dd, child);
+ (void) strcat(child, "/");
+ (void) strcat(child, attr.za_name);
+ err = dmu_objset_find(child, func, arg, flags);
+ kmem_free(child, MAXPATHLEN);
+ if (err)
+ break;
+ }
+ zap_cursor_fini(&zc);
+
+ if (err) {
+ dsl_dir_close(dd, FTAG);
+ return (err);
+ }
+ }
+
+ /*
+ * Iterate over all snapshots.
+ */
+ if ((flags & DS_FIND_SNAPSHOTS) &&
+ dmu_objset_open(name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) {
+
+ snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
+ dmu_objset_close(os);
+
+ for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT(attr.za_integer_length == sizeof (uint64_t));
+ ASSERT(attr.za_num_integers == 1);
+
+ child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ /* XXX could probably just use name here */
+ dsl_dir_name(dd, child);
+ (void) strcat(child, "@");
+ (void) strcat(child, attr.za_name);
+ err = func(child, arg);
+ kmem_free(child, MAXPATHLEN);
+ if (err)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ dsl_dir_close(dd, FTAG);
+
+ if (err)
+ return (err);
+
+ /*
+ * Apply to self if appropriate.
+ */
+ if (do_self)
+ err = func(name, arg);
+ return (err);
+}
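[Editorial aside, not part of the patch] dmu_objset_find() is the same walker that dmu_objset_snapshot() uses above for recursive snapshots; the callback receives each dataset's full name. A hedged sketch of a trivial caller that merely counts datasets follows; it is illustrative only.

static int
count_cb(char *name, void *arg)
{
	(*(uint64_t *)arg) += 1;
	return (0);			/* non-zero would abort the walk */
}

static uint64_t
count_datasets(char *root)
{
	uint64_t n = 0;

	(void) dmu_objset_find(root, count_cb, &n,
	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
	return (n);
}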
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
new file mode 100644
index 000000000000..46facc3dcbdf
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -0,0 +1,1009 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+
+struct backuparg {
+ dmu_replay_record_t *drr;
+ kthread_t *td;
+ struct file *fp;
+ objset_t *os;
+ zio_cksum_t zc;
+ int err;
+};
+
+static int
+dump_bytes(struct backuparg *ba, void *buf, int len)
+{
+ struct uio auio;
+ struct iovec aiov;
+
+ ASSERT3U(len % 8, ==, 0);
+
+ fletcher_4_incremental_native(buf, len, &ba->zc);
+
+ aiov.iov_base = buf;
+ aiov.iov_len = len;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_offset = (off_t)-1;
+ auio.uio_td = ba->td;
+#ifdef _KERNEL
+ if (ba->fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ ba->err = fo_write(ba->fp, &auio, ba->td->td_ucred, 0, ba->td);
+#else
+ fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
+ ba->err = EOPNOTSUPP;
+#endif
+
+ return (ba->err);
+}
+
+static int
+dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
+ uint64_t length)
+{
+ /* write a FREE record */
+ bzero(ba->drr, sizeof (dmu_replay_record_t));
+ ba->drr->drr_type = DRR_FREE;
+ ba->drr->drr_u.drr_free.drr_object = object;
+ ba->drr->drr_u.drr_free.drr_offset = offset;
+ ba->drr->drr_u.drr_free.drr_length = length;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+ return (EINTR);
+ return (0);
+}
+
+static int
+dump_data(struct backuparg *ba, dmu_object_type_t type,
+ uint64_t object, uint64_t offset, int blksz, void *data)
+{
+ /* write a DATA record */
+ bzero(ba->drr, sizeof (dmu_replay_record_t));
+ ba->drr->drr_type = DRR_WRITE;
+ ba->drr->drr_u.drr_write.drr_object = object;
+ ba->drr->drr_u.drr_write.drr_type = type;
+ ba->drr->drr_u.drr_write.drr_offset = offset;
+ ba->drr->drr_u.drr_write.drr_length = blksz;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+ return (EINTR);
+ if (dump_bytes(ba, data, blksz))
+ return (EINTR);
+ return (0);
+}
+
+static int
+dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
+{
+ /* write a FREEOBJECTS record */
+ bzero(ba->drr, sizeof (dmu_replay_record_t));
+ ba->drr->drr_type = DRR_FREEOBJECTS;
+ ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
+ ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+ return (EINTR);
+ return (0);
+}
+
+static int
+dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
+{
+ if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
+ return (dump_freeobjects(ba, object, 1));
+
+ /* write an OBJECT record */
+ bzero(ba->drr, sizeof (dmu_replay_record_t));
+ ba->drr->drr_type = DRR_OBJECT;
+ ba->drr->drr_u.drr_object.drr_object = object;
+ ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
+ ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
+ ba->drr->drr_u.drr_object.drr_blksz =
+ dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
+ ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
+ ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+ return (EINTR);
+
+ if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
+ return (EINTR);
+
+ /* free anything past the end of the file */
+ if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
+ return (EINTR);
+ if (ba->err)
+ return (EINTR);
+ return (0);
+}
+
+#define BP_SPAN(dnp, level) \
+ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
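[Editorial aside, not part of the patch; illustrative numbers] With 128K data blocks (dn_datablkszsec = 256, and 256 << SPA_MINBLOCKSHIFT = 256 << 9 = 128K) and 16K indirect blocks (dn_indblkshift = 14, i.e. 16K / 128-byte block pointers = 128 pointers per indirect, SPA_BLKPTRSHIFT being 7), BP_SPAN(dnp, 0) = 128K and BP_SPAN(dnp, 1) = 256 << (9 + 14 - 7) = 16M; that is, one level-1 block pointer covers 128 data blocks.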
+
+static int
+backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+ struct backuparg *ba = arg;
+ uint64_t object = bc->bc_bookmark.zb_object;
+ int level = bc->bc_bookmark.zb_level;
+ uint64_t blkid = bc->bc_bookmark.zb_blkid;
+ blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
+ dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
+ void *data = bc->bc_data;
+ int err = 0;
+
+ if (SIGPENDING(curthread))
+ return (EINTR);
+
+ ASSERT(data || bp == NULL);
+
+ if (bp == NULL && object == 0) {
+ uint64_t span = BP_SPAN(bc->bc_dnode, level);
+ uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
+ err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
+ } else if (bp == NULL) {
+ uint64_t span = BP_SPAN(bc->bc_dnode, level);
+ err = dump_free(ba, object, blkid * span, span);
+ } else if (data && level == 0 && type == DMU_OT_DNODE) {
+ dnode_phys_t *blk = data;
+ int i;
+ int blksz = BP_GET_LSIZE(bp);
+
+ for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ uint64_t dnobj =
+ (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+ err = dump_dnode(ba, dnobj, blk+i);
+ if (err)
+ break;
+ }
+ } else if (level == 0 &&
+ type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
+ int blksz = BP_GET_LSIZE(bp);
+ if (data == NULL) {
+ uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf;
+ zbookmark_t zb;
+
+ zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
+ zb.zb_object = object;
+ zb.zb_level = level;
+ zb.zb_blkid = blkid;
+ (void) arc_read(NULL, spa, bp,
+ dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
+ &aflags, &zb);
+
+ if (abuf) {
+ err = dump_data(ba, type, object, blkid * blksz,
+ blksz, abuf->b_data);
+ (void) arc_buf_remove_ref(abuf, &abuf);
+ }
+ } else {
+ err = dump_data(ba, type, object, blkid * blksz,
+ blksz, data);
+ }
+ }
+
+ ASSERT(err == 0 || err == EINTR);
+ return (err);
+}
+
+int
+dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp)
+{
+ dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
+ dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
+ dmu_replay_record_t *drr;
+ struct backuparg ba;
+ int err;
+
+ /* tosnap must be a snapshot */
+ if (ds->ds_phys->ds_next_snap_obj == 0)
+ return (EINVAL);
+
+ /* fromsnap must be an earlier snapshot from the same fs as tosnap */
+ if (fromds && (ds->ds_dir != fromds->ds_dir ||
+ fromds->ds_phys->ds_creation_txg >=
+ ds->ds_phys->ds_creation_txg))
+ return (EXDEV);
+
+ drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
+ drr->drr_type = DRR_BEGIN;
+ drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
+ drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION;
+ drr->drr_u.drr_begin.drr_creation_time =
+ ds->ds_phys->ds_creation_time;
+ drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
+ drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
+ if (fromds)
+ drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
+ dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
+
+ ba.drr = drr;
+ ba.td = curthread;
+ ba.fp = fp;
+ ba.os = tosnap;
+ ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
+
+ if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+ kmem_free(drr, sizeof (dmu_replay_record_t));
+ return (ba.err);
+ }
+
+ err = traverse_dsl_dataset(ds,
+ fromds ? fromds->ds_phys->ds_creation_txg : 0,
+ ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
+ backup_cb, &ba);
+
+ if (err) {
+ if (err == EINTR && ba.err)
+ err = ba.err;
+ kmem_free(drr, sizeof (dmu_replay_record_t));
+ return (err);
+ }
+
+ bzero(drr, sizeof (dmu_replay_record_t));
+ drr->drr_type = DRR_END;
+ drr->drr_u.drr_end.drr_checksum = ba.zc;
+
+ if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+ kmem_free(drr, sizeof (dmu_replay_record_t));
+ return (ba.err);
+ }
+
+ kmem_free(drr, sizeof (dmu_replay_record_t));
+
+ return (0);
+}
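[Editorial aside, not part of the patch] Paraphrasing dmu_sendbackup() above for orientation, and only as a reading of this code, the emitted stream is a sequence of fixed-size dmu_replay_record_t headers:

	DRR_BEGIN	magic, version, creation time, type, to/from guids, toname
	DRR_OBJECT	dnode metadata, followed by its bonus buffer (rounded up to 8 bytes)
	DRR_FREEOBJECTS	ranges of freed dnodes
	DRR_WRITE	header followed by the raw block data
	DRR_FREE	freed byte ranges within an object
	DRR_END		fletcher4 checksum of everything preceding it

The object/free/write records are produced by backup_cb() in traversal order, and the running checksum is accumulated in dump_bytes().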
+
+struct restorearg {
+ int err;
+ int byteswap;
+ kthread_t *td;
+ struct file *fp;
+ char *buf;
+ uint64_t voff;
+ int buflen; /* number of valid bytes in buf */
+ int bufoff; /* next offset to read */
+ int bufsize; /* amount of memory allocated for buf */
+ zio_cksum_t zc;
+};
+
+/* ARGSUSED */
+static int
+replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct drr_begin *drrb = arg2;
+ const char *snapname;
+ int err;
+ uint64_t val;
+
+ /* must already be a snapshot of this fs */
+ if (ds->ds_phys->ds_prev_snap_obj == 0)
+ return (ENODEV);
+
+ /* most recent snapshot must match fromguid */
+ if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid)
+ return (ENODEV);
+ /* must not have any changes since most recent snapshot */
+ if (ds->ds_phys->ds_bp.blk_birth >
+ ds->ds_prev->ds_phys->ds_creation_txg)
+ return (ETXTBSY);
+
+ /* new snapshot name must not exist */
+ snapname = strrchr(drrb->drr_toname, '@');
+ if (snapname == NULL)
+ return (EEXIST);
+
+ snapname++;
+ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
+ if (err == 0)
+ return (EEXIST);
+ if (err != ENOENT)
+ return (err);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+}
+
+/* ARGSUSED */
+static int
+replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct drr_begin *drrb = arg2;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ char *cp;
+ uint64_t val;
+ int err;
+
+ cp = strchr(drrb->drr_toname, '@');
+ *cp = '\0';
+ err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
+ strrchr(drrb->drr_toname, '/') + 1,
+ sizeof (uint64_t), 1, &val);
+ *cp = '@';
+
+ if (err != ENOENT)
+ return (err ? err : EEXIST);
+
+ return (0);
+}
+
+static void
+replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct drr_begin *drrb = arg2;
+ char *cp;
+ dsl_dataset_t *ds;
+ uint64_t dsobj;
+
+ cp = strchr(drrb->drr_toname, '@');
+ *cp = '\0';
+ dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1,
+ NULL, tx);
+ *cp = '@';
+
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
+ DS_MODE_EXCLUSIVE, FTAG, &ds));
+
+ (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
+ ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx);
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+}
+
+static int
+replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ objset_t *os = arg1;
+ struct drr_begin *drrb = arg2;
+ char *snapname;
+
+ /* XXX verify that drr_toname is in dd */
+
+ snapname = strchr(drrb->drr_toname, '@');
+ if (snapname == NULL)
+ return (EINVAL);
+ snapname++;
+
+ return (dsl_dataset_snapshot_check(os, snapname, tx));
+}
+
+static void
+replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ objset_t *os = arg1;
+ struct drr_begin *drrb = arg2;
+ char *snapname;
+ dsl_dataset_t *ds, *hds;
+
+ snapname = strchr(drrb->drr_toname, '@') + 1;
+
+ dsl_dataset_snapshot_sync(os, snapname, tx);
+
+ /* set snapshot's creation time and guid */
+ hds = os->os->os_dsl_dataset;
+ VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool,
+ hds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+ FTAG, &ds));
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
+ ds->ds_phys->ds_guid = drrb->drr_toguid;
+ ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+
+ dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);
+
+ dmu_buf_will_dirty(hds->ds_dbuf, tx);
+ hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+}
+
+static int
+restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid)
+{
+ struct uio auio;
+ struct iovec aiov;
+ int error;
+
+ aiov.iov_base = buf;
+ aiov.iov_len = len;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_offset = off;
+ auio.uio_td = ra->td;
+#ifdef _KERNEL
+ error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td);
+#else
+ fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
+ error = EOPNOTSUPP;
+#endif
+ *resid = auio.uio_resid;
+ return (error);
+}
+
+static void *
+restore_read(struct restorearg *ra, int len)
+{
+ void *rv;
+
+ /* some things will require 8-byte alignment, so everything must */
+ ASSERT3U(len % 8, ==, 0);
+
+ while (ra->buflen - ra->bufoff < len) {
+ int resid;
+ int leftover = ra->buflen - ra->bufoff;
+
+ (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
+
+ ra->err = restore_bytes(ra, (caddr_t)ra->buf + leftover,
+ ra->bufsize - leftover, ra->voff, &resid);
+
+ ra->voff += ra->bufsize - leftover - resid;
+ ra->buflen = ra->bufsize - resid;
+ ra->bufoff = 0;
+ if (resid == ra->bufsize - leftover)
+ ra->err = EINVAL;
+ if (ra->err)
+ return (NULL);
+ /* Could compute checksum here? */
+ }
+
+ ASSERT3U(ra->bufoff % 8, ==, 0);
+ ASSERT3U(ra->buflen - ra->bufoff, >=, len);
+ rv = ra->buf + ra->bufoff;
+ ra->bufoff += len;
+ if (ra->byteswap)
+ fletcher_4_incremental_byteswap(rv, len, &ra->zc);
+ else
+ fletcher_4_incremental_native(rv, len, &ra->zc);
+ return (rv);
+}
+
+static void
+backup_byteswap(dmu_replay_record_t *drr)
+{
+#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
+#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
+ drr->drr_type = BSWAP_32(drr->drr_type);
+ switch (drr->drr_type) {
+ case DRR_BEGIN:
+ DO64(drr_begin.drr_magic);
+ DO64(drr_begin.drr_version);
+ DO64(drr_begin.drr_creation_time);
+ DO32(drr_begin.drr_type);
+ DO64(drr_begin.drr_toguid);
+ DO64(drr_begin.drr_fromguid);
+ break;
+ case DRR_OBJECT:
+ DO64(drr_object.drr_object);
+ /* DO64(drr_object.drr_allocation_txg); */
+ DO32(drr_object.drr_type);
+ DO32(drr_object.drr_bonustype);
+ DO32(drr_object.drr_blksz);
+ DO32(drr_object.drr_bonuslen);
+ break;
+ case DRR_FREEOBJECTS:
+ DO64(drr_freeobjects.drr_firstobj);
+ DO64(drr_freeobjects.drr_numobjs);
+ break;
+ case DRR_WRITE:
+ DO64(drr_write.drr_object);
+ DO32(drr_write.drr_type);
+ DO64(drr_write.drr_offset);
+ DO64(drr_write.drr_length);
+ break;
+ case DRR_FREE:
+ DO64(drr_free.drr_object);
+ DO64(drr_free.drr_offset);
+ DO64(drr_free.drr_length);
+ break;
+ case DRR_END:
+ DO64(drr_end.drr_checksum.zc_word[0]);
+ DO64(drr_end.drr_checksum.zc_word[1]);
+ DO64(drr_end.drr_checksum.zc_word[2]);
+ DO64(drr_end.drr_checksum.zc_word[3]);
+ break;
+ }
+#undef DO64
+#undef DO32
+}
+
+static int
+restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
+{
+ int err;
+ dmu_tx_t *tx;
+
+ err = dmu_object_info(os, drro->drr_object, NULL);
+
+ if (err != 0 && err != ENOENT)
+ return (EINVAL);
+
+ if (drro->drr_type == DMU_OT_NONE ||
+ drro->drr_type >= DMU_OT_NUMTYPES ||
+ drro->drr_bonustype >= DMU_OT_NUMTYPES ||
+ drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
+ drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
+ P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
+ drro->drr_blksz < SPA_MINBLOCKSIZE ||
+ drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+ drro->drr_bonuslen > DN_MAX_BONUSLEN) {
+ return (EINVAL);
+ }
+
+ tx = dmu_tx_create(os);
+
+ if (err == ENOENT) {
+ /* currently free, want to be allocated */
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ err = dmu_object_claim(os, drro->drr_object,
+ drro->drr_type, drro->drr_blksz,
+ drro->drr_bonustype, drro->drr_bonuslen, tx);
+ } else {
+ /* currently allocated, want to be allocated */
+ dmu_tx_hold_bonus(tx, drro->drr_object);
+ /*
+ * We may change blocksize, so need to
+ * hold_write
+ */
+ dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ err = dmu_object_reclaim(os, drro->drr_object,
+ drro->drr_type, drro->drr_blksz,
+ drro->drr_bonustype, drro->drr_bonuslen, tx);
+ }
+ if (err) {
+ dmu_tx_commit(tx);
+ return (EINVAL);
+ }
+
+ dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
+ dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
+
+ if (drro->drr_bonuslen) {
+ dmu_buf_t *db;
+ void *data;
+ VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
+ data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
+ if (data == NULL) {
+ dmu_tx_commit(tx);
+ return (ra->err);
+ }
+ bcopy(data, db->db_data, db->db_size);
+ if (ra->byteswap) {
+ dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
+ drro->drr_bonuslen);
+ }
+ dmu_buf_rele(db, FTAG);
+ }
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+restore_freeobjects(struct restorearg *ra, objset_t *os,
+ struct drr_freeobjects *drrfo)
+{
+ uint64_t obj;
+
+ if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
+ return (EINVAL);
+
+ for (obj = drrfo->drr_firstobj;
+ obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
+ (void) dmu_object_next(os, &obj, FALSE, 0)) {
+ dmu_tx_t *tx;
+ int err;
+
+ if (dmu_object_info(os, obj, NULL) != 0)
+ continue;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, obj);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ err = dmu_object_free(os, obj, tx);
+ dmu_tx_commit(tx);
+ if (err && err != ENOENT)
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+restore_write(struct restorearg *ra, objset_t *os,
+ struct drr_write *drrw)
+{
+ dmu_tx_t *tx;
+ void *data;
+ int err;
+
+ if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
+ drrw->drr_type >= DMU_OT_NUMTYPES)
+ return (EINVAL);
+
+ data = restore_read(ra, drrw->drr_length);
+ if (data == NULL)
+ return (ra->err);
+
+ if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
+ return (EINVAL);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, drrw->drr_object,
+ drrw->drr_offset, drrw->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ if (ra->byteswap)
+ dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
+ dmu_write(os, drrw->drr_object,
+ drrw->drr_offset, drrw->drr_length, data, tx);
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+restore_free(struct restorearg *ra, objset_t *os,
+ struct drr_free *drrf)
+{
+ dmu_tx_t *tx;
+ int err;
+
+ if (drrf->drr_length != -1ULL &&
+ drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
+ return (EINVAL);
+
+ if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
+ return (EINVAL);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_free(tx, drrf->drr_object,
+ drrf->drr_offset, drrf->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ err = dmu_free_range(os, drrf->drr_object,
+ drrf->drr_offset, drrf->drr_length, tx);
+ dmu_tx_commit(tx);
+ return (err);
+}
+
+int
+dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
+ boolean_t force, struct file *fp, uint64_t voffset)
+{
+ kthread_t *td = curthread;
+ struct restorearg ra;
+ dmu_replay_record_t *drr;
+ char *cp;
+ objset_t *os = NULL;
+ zio_cksum_t pzc;
+
+ bzero(&ra, sizeof (ra));
+ ra.td = td;
+ ra.fp = fp;
+ ra.voff = voffset;
+ ra.bufsize = 1<<20;
+ ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
+
+ if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
+ ra.byteswap = FALSE;
+ } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+ ra.byteswap = TRUE;
+ } else {
+ ra.err = EINVAL;
+ goto out;
+ }
+
+ /*
+ * NB: this assumes that struct drr_begin will be the largest in
+ * dmu_replay_record_t's drr_u, and thus we don't need to pad it
+ * with zeros to make it the same length as we wrote out.
+ */
+ ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN;
+ ((dmu_replay_record_t *)ra.buf)->drr_pad = 0;
+ ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb;
+ if (ra.byteswap) {
+ fletcher_4_incremental_byteswap(ra.buf,
+ sizeof (dmu_replay_record_t), &ra.zc);
+ } else {
+ fletcher_4_incremental_native(ra.buf,
+ sizeof (dmu_replay_record_t), &ra.zc);
+ }
+ (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */
+
+ if (ra.byteswap) {
+ drrb->drr_magic = BSWAP_64(drrb->drr_magic);
+ drrb->drr_version = BSWAP_64(drrb->drr_version);
+ drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
+ drrb->drr_type = BSWAP_32(drrb->drr_type);
+ drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
+ drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
+ }
+
+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+
+ if (drrb->drr_version != DMU_BACKUP_VERSION ||
+ drrb->drr_type >= DMU_OST_NUMTYPES ||
+ strchr(drrb->drr_toname, '@') == NULL) {
+ ra.err = EINVAL;
+ goto out;
+ }
+
+ /*
+ * Process the begin in syncing context.
+ */
+ if (drrb->drr_fromguid) {
+ /* incremental backup */
+ dsl_dataset_t *ds = NULL;
+
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds);
+ *cp = '@';
+ if (ra.err)
+ goto out;
+
+ /*
+ * Only do the rollback if the most recent snapshot
+ * matches the incremental source
+ */
+ if (force) {
+ if (ds->ds_prev == NULL ||
+ ds->ds_prev->ds_phys->ds_guid !=
+ drrb->drr_fromguid) {
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ kmem_free(ra.buf, ra.bufsize);
+ return (ENODEV);
+ }
+ (void) dsl_dataset_rollback(ds);
+ }
+ ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ replay_incremental_check, replay_incremental_sync,
+ ds, drrb, 1);
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ } else {
+ /* full backup */
+ dsl_dir_t *dd = NULL;
+ const char *tail;
+
+ /* can't restore full backup into topmost fs, for now */
+ if (strrchr(drrb->drr_toname, '/') == NULL) {
+ ra.err = EINVAL;
+ goto out;
+ }
+
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail);
+ *cp = '@';
+ if (ra.err)
+ goto out;
+ if (tail == NULL) {
+ ra.err = EEXIST;
+ goto out;
+ }
+
+ ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check,
+ replay_full_sync, dd, drrb, 5);
+ dsl_dir_close(dd, FTAG);
+ }
+ if (ra.err)
+ goto out;
+
+ /*
+ * Open the objset we are modifying.
+ */
+
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
+ DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os);
+ *cp = '@';
+ ASSERT3U(ra.err, ==, 0);
+
+ /*
+ * Read records and process them.
+ */
+ pzc = ra.zc;
+ while (ra.err == 0 &&
+ NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
+ if (SIGPENDING(td)) {
+ ra.err = EINTR;
+ goto out;
+ }
+
+ if (ra.byteswap)
+ backup_byteswap(drr);
+
+ switch (drr->drr_type) {
+ case DRR_OBJECT:
+ {
+ /*
+ * We need to make a copy of the record header,
+ * because restore_{object,write} may need to
+ * restore_read(), which will invalidate drr.
+ */
+ struct drr_object drro = drr->drr_u.drr_object;
+ ra.err = restore_object(&ra, os, &drro);
+ break;
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects drrfo =
+ drr->drr_u.drr_freeobjects;
+ ra.err = restore_freeobjects(&ra, os, &drrfo);
+ break;
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write drrw = drr->drr_u.drr_write;
+ ra.err = restore_write(&ra, os, &drrw);
+ break;
+ }
+ case DRR_FREE:
+ {
+ struct drr_free drrf = drr->drr_u.drr_free;
+ ra.err = restore_free(&ra, os, &drrf);
+ break;
+ }
+ case DRR_END:
+ {
+ struct drr_end drre = drr->drr_u.drr_end;
+ /*
+ * We compare against the *previous* checksum
+ * value, because the stored checksum is of
+ * everything before the DRR_END record.
+ */
+ if (drre.drr_checksum.zc_word[0] != 0 &&
+ !ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pzc)) {
+ ra.err = ECKSUM;
+ goto out;
+ }
+
+ ra.err = dsl_sync_task_do(dmu_objset_ds(os)->
+ ds_dir->dd_pool, replay_end_check, replay_end_sync,
+ os, drrb, 3);
+ goto out;
+ }
+ default:
+ ra.err = EINVAL;
+ goto out;
+ }
+ pzc = ra.zc;
+ }
+
+out:
+ if (os)
+ dmu_objset_close(os);
+
+ /*
+ * Make sure we don't rollback/destroy unless we actually
+ * processed the begin properly. 'os' will only be set if this
+ * is the case.
+ */
+ if (ra.err && os && tosnap && strchr(tosnap, '@')) {
+ /*
+ * rollback or destroy what we created, so we don't
+ * leave it in the restoring state.
+ */
+ dsl_dataset_t *ds;
+ int err;
+
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ err = dsl_dataset_open(tosnap,
+ DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT,
+ FTAG, &ds);
+ if (err == 0) {
+ txg_wait_synced(ds->ds_dir->dd_pool, 0);
+ if (drrb->drr_fromguid) {
+ /* incremental: rollback to most recent snap */
+ (void) dsl_dataset_rollback(ds);
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ } else {
+ /* full: destroy whole fs */
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ (void) dsl_dataset_destroy(tosnap);
+ }
+ }
+ *cp = '@';
+ }
+
+ kmem_free(ra.buf, ra.bufsize);
+ if (sizep)
+ *sizep = ra.voff;
+ return (ra.err);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
new file mode 100644
index 000000000000..3d2bc3e47678
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -0,0 +1,888 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_impl.h>
+
+#define BP_SPAN_SHIFT(level, width) ((level) * (width))
+
+#define BP_EQUAL(b1, b2) \
+ (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
+ (b1)->blk_birth == (b2)->blk_birth)
+
+/*
+ * Compare two bookmarks.
+ *
+ * For ADVANCE_PRE, the visitation order is:
+ *
+ * objset 0, 1, 2, ..., ZB_MAXOBJSET.
+ * object 0, 1, 2, ..., ZB_MAXOBJECT.
+ * blkoff 0, 1, 2, ...
+ * level ZB_MAXLEVEL, ..., 2, 1, 0.
+ *
+ * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
+ * ordering vector is:
+ *
+ * < objset, object, blkoff, -level >
+ *
+ * For ADVANCE_POST, the starting offsets aren't sequential but ending
+ * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
+ * The visitation order is:
+ *
+ * objset 1, 2, ..., ZB_MAXOBJSET, 0.
+ * object 1, 2, ..., ZB_MAXOBJECT, 0.
+ * blkoff 1, 2, ...
+ * level 0, 1, 2, ..., ZB_MAXLEVEL.
+ *
+ * and thus a valid ordering vector is:
+ *
+ * < objset - 1, object - 1, blkoff, level >
+ *
+ * Both orderings can be expressed as:
+ *
+ * < objset + bias, object + bias, blkoff, level ^ bias >
+ *
+ * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
+ * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
+ *
+ * Special case: an objset's osphys is represented as level -1 of object 0.
+ * It is always either the very first or very last block we visit in an objset.
+ * Therefore, if either bookmark's level is -1, level alone determines order.
+ */
+static int
+compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
+ int advance)
+{
+ int bias = (advance & ADVANCE_PRE) ? 0 : -1;
+ uint64_t sblkoff, eblkoff;
+ int slevel, elevel, wshift;
+
+ if (szb->zb_objset + bias < ezb->zb_objset + bias)
+ return (-1);
+
+ if (szb->zb_objset + bias > ezb->zb_objset + bias)
+ return (1);
+
+ slevel = szb->zb_level;
+ elevel = ezb->zb_level;
+
+ if ((slevel | elevel) < 0)
+ return ((slevel ^ bias) - (elevel ^ bias));
+
+ if (szb->zb_object + bias < ezb->zb_object + bias)
+ return (-1);
+
+ if (szb->zb_object + bias > ezb->zb_object + bias)
+ return (1);
+
+ if (dnp == NULL)
+ return (0);
+
+ wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
+ eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
+
+ if (sblkoff < eblkoff)
+ return (-1);
+
+ if (sblkoff > eblkoff)
+ return (1);
+
+ return ((elevel ^ bias) - (slevel ^ bias));
+}
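[Editorial aside, not part of the patch; illustrative numbers] As a worked instance of the ADVANCE_PRE ordering vector: with wshift = 7, i.e. 128 block pointers per indirect block, the level-1 block with blkid 0 has blkoff 0 and -level = -1, so it sorts before all of its level-0 children (blkoff 0 through 127, -level = 0); the next level-1 block, blkid 1, has blkoff 1 << 7 = 128 and therefore sorts after those children but before its own, which is exactly a pre-order traversal.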
+
+#define SET_BOOKMARK(zb, objset, object, level, blkid) \
+{ \
+ (zb)->zb_objset = objset; \
+ (zb)->zb_object = object; \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+#define SET_BOOKMARK_LB(zb, level, blkid) \
+{ \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+static int
+advance_objset(zseg_t *zseg, uint64_t objset, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+
+ if (advance & ADVANCE_PRE) {
+ if (objset >= ZB_MAXOBJSET)
+ return (ERANGE);
+ SET_BOOKMARK(zb, objset, 0, -1, 0);
+ } else {
+ if (objset >= ZB_MAXOBJSET)
+ objset = 0;
+ SET_BOOKMARK(zb, objset, 1, 0, 0);
+ }
+
+ if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+advance_object(zseg_t *zseg, uint64_t object, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+
+ if (advance & ADVANCE_PRE) {
+ if (object >= ZB_MAXOBJECT) {
+ SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
+ } else {
+ SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
+ }
+ } else {
+ if (zb->zb_object == 0) {
+ SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
+ } else {
+ if (object >= ZB_MAXOBJECT)
+ object = 0;
+ SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
+ }
+ }
+
+ if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+advance_from_osphys(zseg_t *zseg, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+
+ ASSERT(zb->zb_object == 0);
+ ASSERT(zb->zb_level == -1);
+ ASSERT(zb->zb_blkid == 0);
+
+ if (advance & ADVANCE_PRE) {
+ SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
+ } else {
+ if (zb->zb_objset == 0)
+ return (ERANGE);
+ SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
+ }
+
+ if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int maxlevel = dnp->dn_nlevels - 1;
+ int level = zb->zb_level;
+ uint64_t blkid = zb->zb_blkid;
+
+ if (advance & ADVANCE_PRE) {
+ if (level > 0 && rc == 0) {
+ level--;
+ blkid <<= wshift;
+ } else {
+ blkid++;
+
+ if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
+ dnp->dn_maxblkid)
+ return (ERANGE);
+
+ while (level < maxlevel) {
+ if (P2PHASE(blkid, 1ULL << wshift))
+ break;
+ blkid >>= wshift;
+ level++;
+ }
+ }
+ } else {
+ if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
+ blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
+ level = 0;
+ } else {
+ blkid >>= wshift;
+ level++;
+ }
+
+ while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
+ dnp->dn_maxblkid) {
+ if (level == maxlevel)
+ return (ERANGE);
+ blkid >>= wshift;
+ level++;
+ }
+ }
+ SET_BOOKMARK_LB(zb, level, blkid);
+
+ if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
+{
+ /*
+ * Before we issue the callback, prune against maxtxg.
+ *
+ * We prune against mintxg before we get here because it's a big win.
+ * If a given block was born in txg 37, then we know that the entire
+ * subtree below that block must have been born in txg 37 or earlier.
+ * We can therefore lop off huge branches of the tree as we go.
+ *
+ * There's no corresponding optimization for maxtxg because knowing
+ * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
+ * children. In fact, the copy-on-write design of ZFS ensures that
+ * top-level blocks will pretty much always be new.
+ *
+ * Therefore, in the name of simplicity we don't prune against
+ * maxtxg until the last possible moment -- that being right now.
+ */
+ if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
+ return (0);
+
+ /*
+ * Debugging: verify that the order we visit things agrees with the
+ * order defined by compare_bookmark(). We don't check this for
+ * log blocks because there's no defined ordering for them; they're
+ * always visited (or not) as part of visiting the objset_phys_t.
+ */
+ if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
+ zbookmark_t *zb = &bc->bc_bookmark;
+ zbookmark_t *szb = &zseg->seg_start;
+ zbookmark_t *ezb = &zseg->seg_end;
+ zbookmark_t *lzb = &th->th_lastcb;
+ dnode_phys_t *dnp = bc->bc_dnode;
+
+ ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
+ ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
+ ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
+ lzb->zb_level == ZB_NO_LEVEL);
+ *lzb = *zb;
+ }
+
+ th->th_callbacks++;
+ return (th->th_func(bc, th->th_spa, th->th_arg));
+}
+
+static int
+traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
+ dnode_phys_t *dnp)
+{
+ zbookmark_t *zb = &bc->bc_bookmark;
+ int error;
+
+ th->th_hits++;
+
+ bc->bc_dnode = dnp;
+ bc->bc_errno = 0;
+
+ if (BP_EQUAL(&bc->bc_blkptr, bp))
+ return (0);
+
+ bc->bc_blkptr = *bp;
+
+ if (bc->bc_data == NULL)
+ return (0);
+
+ if (BP_IS_HOLE(bp)) {
+ ASSERT(th->th_advance & ADVANCE_HOLES);
+ return (0);
+ }
+
+ if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
+ error = EIO;
+ } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
+ error = 0;
+ th->th_arc_hits++;
+ } else {
+ error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
+ BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
+
+ if (BP_SHOULD_BYTESWAP(bp) && error == 0)
+ (zb->zb_level > 0 ? byteswap_uint64_array :
+ dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
+ BP_GET_LSIZE(bp));
+ th->th_reads++;
+ }
+
+ if (error) {
+ bc->bc_errno = error;
+ error = traverse_callback(th, NULL, bc);
+ ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
+ bc->bc_blkptr.blk_birth = -1ULL;
+ }
+
+ dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
+ bc - &th->th_cache[0][0], error,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
+
+ return (error);
+}
+
+static int
+find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ traverse_blk_cache_t *bc;
+ blkptr_t *bp = dnp->dn_blkptr;
+ int i, first, level;
+ int nbp = dnp->dn_nblkptr;
+ int minlevel = zb->zb_level;
+ int maxlevel = dnp->dn_nlevels - 1;
+ int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
+ uint64_t blkid = zb->zb_blkid >> bp_shift;
+ int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
+ int rc;
+
+ if (minlevel > maxlevel || blkid >= nbp)
+ return (ERANGE);
+
+ for (level = maxlevel; level >= minlevel; level--) {
+ first = P2PHASE(blkid, 1ULL << wshift);
+
+ for (i = first; i < nbp; i++)
+ if (bp[i].blk_birth > zseg->seg_mintxg ||
+ BP_IS_HOLE(&bp[i]) && do_holes)
+ break;
+
+ if (i != first) {
+ i--;
+ SET_BOOKMARK_LB(zb, level, blkid + (i - first));
+ return (ENOTBLK);
+ }
+
+ bc = &th->th_cache[depth][level];
+
+ SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
+ level, blkid);
+
+ if (rc = traverse_read(th, bc, bp + i, dnp)) {
+ if (rc != EAGAIN) {
+ SET_BOOKMARK_LB(zb, level, blkid);
+ }
+ return (rc);
+ }
+
+ if (BP_IS_HOLE(&bp[i])) {
+ SET_BOOKMARK_LB(zb, level, blkid);
+ th->th_lastcb.zb_level = ZB_NO_LEVEL;
+ return (0);
+ }
+
+ nbp = 1 << wshift;
+ bp = bc->bc_data;
+ bp_shift -= wshift;
+ blkid = zb->zb_blkid >> bp_shift;
+ }
+
+ return (0);
+}
+
+static int
+get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
+ uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
+{
+ zseg_t zseg;
+ zbookmark_t *zb = &zseg.seg_start;
+ uint64_t object = *objectp;
+ int i, rc;
+
+ SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
+ SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
+
+ zseg.seg_mintxg = txg;
+ zseg.seg_maxtxg = -1ULL;
+
+ for (;;) {
+ rc = find_block(th, &zseg, mdn, depth);
+
+ if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
+ break;
+
+ if (rc == 0 && zb->zb_level == 0) {
+ dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
+ for (i = 0; i < DNODES_PER_BLOCK; i++) {
+ object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
+ if (object >= *objectp &&
+ dnp[i].dn_type != DMU_OT_NONE &&
+ (type == -1 || dnp[i].dn_type == type)) {
+ *objectp = object;
+ *dnpp = &dnp[i];
+ return (0);
+ }
+ }
+ }
+
+ rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
+
+ if (rc == ERANGE)
+ break;
+ }
+
+ if (rc == ERANGE)
+ *objectp = ZB_MAXOBJECT;
+
+ return (rc);
+}
+
+/* ARGSUSED */
+static void
+traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ traverse_handle_t *th = arg;
+ traverse_blk_cache_t *bc = &th->th_zil_cache;
+ zbookmark_t *zb = &bc->bc_bookmark;
+ zseg_t *zseg = list_head(&th->th_seglist);
+
+ if (bp->blk_birth <= zseg->seg_mintxg)
+ return;
+
+ if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
+ zb->zb_object = 0;
+ zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+ bc->bc_blkptr = *bp;
+ (void) traverse_callback(th, zseg, bc);
+ }
+}
+
+/* ARGSUSED */
+static void
+traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+ traverse_handle_t *th = arg;
+ traverse_blk_cache_t *bc = &th->th_zil_cache;
+ zbookmark_t *zb = &bc->bc_bookmark;
+ zseg_t *zseg = list_head(&th->th_seglist);
+
+ if (lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ if (bp->blk_birth <= zseg->seg_mintxg)
+ return;
+
+ if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
+ zb->zb_object = lr->lr_foid;
+ zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+ bc->bc_blkptr = *bp;
+ (void) traverse_callback(th, zseg, bc);
+ }
+ }
+}
+
+static void
+traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
+{
+ spa_t *spa = th->th_spa;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ objset_phys_t *osphys = bc->bc_data;
+ zil_header_t *zh = &osphys->os_zil_header;
+ uint64_t claim_txg = zh->zh_claim_txg;
+ zilog_t *zilog;
+
+ ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
+ ASSERT(bc->bc_bookmark.zb_level == -1);
+
+ /*
+ * We only want to visit blocks that have been claimed but not yet
+ * replayed (or, in read-only mode, blocks that *would* be claimed).
+ */
+ if (claim_txg == 0 && (spa_mode & FWRITE))
+ return;
+
+ th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
+
+ zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+ (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
+ claim_txg);
+
+ zil_free(zilog);
+}
+
+static int
+traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ traverse_blk_cache_t *bc;
+ dnode_phys_t *dn, *dn_tmp;
+ int worklimit = 100;
+ int rc;
+
+ dprintf("<%llu, %llu, %d, %llx>\n",
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
+
+ bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
+ dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
+
+ SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
+
+ rc = traverse_read(th, bc, mosbp, dn);
+
+ if (rc) /* If we get ERESTART, we've got nowhere left to go */
+ return (rc == ERESTART ? EINTR : rc);
+
+ ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
+
+ if (zb->zb_objset != 0) {
+ uint64_t objset = zb->zb_objset;
+ dsl_dataset_phys_t *dsp;
+
+ rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
+ DMU_OT_DSL_DATASET, ZB_MOS_CACHE);
+
+ if (objset != zb->zb_objset)
+ rc = advance_objset(zseg, objset, th->th_advance);
+
+ if (rc != 0)
+ return (rc);
+
+ dsp = DN_BONUS(dn_tmp);
+
+ bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
+ dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
+
+ SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
+
+ /*
+ * If we're traversing an open snapshot, we know that it
+ * can't be deleted (because it's open) and it can't change
+ * (because it's a snapshot). Therefore, once we've gotten
+ * from the uberblock down to the snapshot's objset_phys_t,
+ * we no longer need to synchronize with spa_sync(); we're
+ * traversing a completely static block tree from here on.
+ */
+ if (th->th_advance & ADVANCE_NOLOCK) {
+ ASSERT(th->th_locked);
+ rw_exit(spa_traverse_rwlock(th->th_spa));
+ th->th_locked = 0;
+ }
+
+ rc = traverse_read(th, bc, &dsp->ds_bp, dn);
+
+ if (rc != 0) {
+ if (rc == ERESTART)
+ rc = advance_objset(zseg, zb->zb_objset + 1,
+ th->th_advance);
+ return (rc);
+ }
+
+ if (th->th_advance & ADVANCE_PRUNE)
+ zseg->seg_mintxg =
+ MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
+ }
+
+ if (zb->zb_level == -1) {
+ ASSERT(zb->zb_object == 0);
+ ASSERT(zb->zb_blkid == 0);
+ ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
+
+ if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
+ rc = traverse_callback(th, zseg, bc);
+ if (rc) {
+ ASSERT(rc == EINTR);
+ return (rc);
+ }
+ if ((th->th_advance & ADVANCE_ZIL) &&
+ zb->zb_objset != 0)
+ traverse_zil(th, bc);
+ }
+
+ return (advance_from_osphys(zseg, th->th_advance));
+ }
+
+ if (zb->zb_object != 0) {
+ uint64_t object = zb->zb_object;
+
+ rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
+ zseg->seg_mintxg, -1, ZB_MDN_CACHE);
+
+ if (object != zb->zb_object)
+ rc = advance_object(zseg, object, th->th_advance);
+
+ if (rc != 0)
+ return (rc);
+
+ dn = dn_tmp;
+ }
+
+ if (zb->zb_level == ZB_MAXLEVEL)
+ zb->zb_level = dn->dn_nlevels - 1;
+
+ for (;;) {
+ rc = find_block(th, zseg, dn, ZB_DN_CACHE);
+
+ if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
+ break;
+
+ if (rc == 0) {
+ bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
+ ASSERT(bc->bc_dnode == dn);
+ ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
+ rc = traverse_callback(th, zseg, bc);
+ if (rc) {
+ ASSERT(rc == EINTR);
+ return (rc);
+ }
+ if (BP_IS_HOLE(&bc->bc_blkptr)) {
+ ASSERT(th->th_advance & ADVANCE_HOLES);
+ rc = ENOTBLK;
+ }
+ }
+
+ rc = advance_block(zseg, dn, rc, th->th_advance);
+
+ if (rc == ERANGE)
+ break;
+
+ /*
+ * Give spa_sync() a chance to run.
+ */
+ if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
+ th->th_syncs++;
+ return (EAGAIN);
+ }
+
+ if (--worklimit == 0)
+ return (EAGAIN);
+ }
+
+ if (rc == ERANGE)
+ rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
+
+ return (rc);
+}
+
+/*
+ * It is the caller's responsibility to ensure that the dsl_dataset_t
+ * doesn't go away during traversal.
+ */
+int
+traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
+ blkptr_cb_t func, void *arg)
+{
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+ traverse_handle_t *th;
+ int err;
+
+ th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
+
+ traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
+
+ while ((err = traverse_more(th)) == EAGAIN)
+ continue;
+
+ traverse_fini(th);
+ return (err);
+}
+
+int
+traverse_more(traverse_handle_t *th)
+{
+ zseg_t *zseg = list_head(&th->th_seglist);
+ uint64_t save_txg; /* XXX won't be necessary with real itinerary */
+ krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
+ blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
+ int rc;
+
+ if (zseg == NULL)
+ return (0);
+
+ th->th_restarts++;
+
+ save_txg = zseg->seg_mintxg;
+
+ rw_enter(rw, RW_READER);
+ th->th_locked = 1;
+
+ rc = traverse_segment(th, zseg, mosbp);
+ ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
+
+ if (th->th_locked)
+ rw_exit(rw);
+ th->th_locked = 0;
+
+ zseg->seg_mintxg = save_txg;
+
+ if (rc == ERANGE) {
+ list_remove(&th->th_seglist, zseg);
+ kmem_free(zseg, sizeof (*zseg));
+ return (EAGAIN);
+ }
+
+ return (rc);
+}
+
+/*
+ * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
+ * are not included. The blocks covered by this segment will all have
+ * mintxg < birth < maxtxg.
+ */
+static void
+traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
+ uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
+{
+ zseg_t *zseg;
+
+ zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
+
+ zseg->seg_mintxg = mintxg;
+ zseg->seg_maxtxg = maxtxg;
+
+ zseg->seg_start.zb_objset = sobjset;
+ zseg->seg_start.zb_object = sobject;
+ zseg->seg_start.zb_level = slevel;
+ zseg->seg_start.zb_blkid = sblkid;
+
+ zseg->seg_end.zb_objset = eobjset;
+ zseg->seg_end.zb_object = eobject;
+ zseg->seg_end.zb_level = elevel;
+ zseg->seg_end.zb_blkid = eblkid;
+
+ list_insert_tail(&th->th_seglist, zseg);
+}
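As a concrete reading of the open-interval note above: a segment added with mintxg = 5 and maxtxg = 10 only generates callbacks for blocks whose blk_birth lies between 6 and 9 inclusive. Births <= 5 are pruned early by the mintxg checks in find_block() and the ZIL walkers, while births >= 10 are pruned at the last moment by the maxtxg check at the top of traverse_callback().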
+
+void
+traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t objset, uint64_t object)
+{
+ if (th->th_advance & ADVANCE_PRE)
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, object, ZB_MAXLEVEL, 0,
+ objset, object, 0, ZB_MAXBLKID);
+ else
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, object, 0, 0,
+ objset, object, 0, ZB_MAXBLKID);
+}
+
+void
+traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t objset)
+{
+ if (th->th_advance & ADVANCE_PRE)
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, 0, -1, 0,
+ objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
+ else
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, 1, 0, 0,
+ objset, 0, -1, 0);
+}
+
+void
+traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
+{
+ if (th->th_advance & ADVANCE_PRE)
+ traverse_add_segment(th, mintxg, maxtxg,
+ 0, 0, -1, 0,
+ ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
+ else
+ traverse_add_segment(th, mintxg, maxtxg,
+ 1, 1, 0, 0,
+ 0, 0, -1, 0);
+}
+
+traverse_handle_t *
+traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
+ int zio_flags)
+{
+ traverse_handle_t *th;
+ int d, l;
+
+ th = kmem_zalloc(sizeof (*th), KM_SLEEP);
+
+ th->th_spa = spa;
+ th->th_func = func;
+ th->th_arg = arg;
+ th->th_advance = advance;
+ th->th_lastcb.zb_level = ZB_NO_LEVEL;
+ th->th_noread.zb_level = ZB_NO_LEVEL;
+ th->th_zio_flags = zio_flags;
+
+ list_create(&th->th_seglist, sizeof (zseg_t),
+ offsetof(zseg_t, seg_node));
+
+ for (d = 0; d < ZB_DEPTH; d++) {
+ for (l = 0; l < ZB_MAXLEVEL; l++) {
+ if ((advance & ADVANCE_DATA) ||
+ l != 0 || d != ZB_DN_CACHE)
+ th->th_cache[d][l].bc_data =
+ zio_buf_alloc(SPA_MAXBLOCKSIZE);
+ }
+ }
+
+ return (th);
+}
+
+void
+traverse_fini(traverse_handle_t *th)
+{
+ int d, l;
+ zseg_t *zseg;
+
+ for (d = 0; d < ZB_DEPTH; d++)
+ for (l = 0; l < ZB_MAXLEVEL; l++)
+ if (th->th_cache[d][l].bc_data != NULL)
+ zio_buf_free(th->th_cache[d][l].bc_data,
+ SPA_MAXBLOCKSIZE);
+
+ while ((zseg = list_head(&th->th_seglist)) != NULL) {
+ list_remove(&th->th_seglist, zseg);
+ kmem_free(zseg, sizeof (*zseg));
+ }
+
+ list_destroy(&th->th_seglist);
+
+ dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
+ th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
+ th->th_syncs, th->th_restarts);
+
+ kmem_free(th, sizeof (*th));
+}
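To make the handle lifecycle concrete, here is a minimal sketch of a pool-wide pre-order walk. It is hypothetical (the function names and the counting logic are invented for illustration) and assumes the calling convention visible in traverse_callback() above: the registered function receives the block cache entry, the spa, and the opaque argument, and returns 0 to continue or EINTR to abort. Without ADVANCE_DATA, level-0 user data is not read; only its block pointers are visited.

	/* ARGSUSED */
	static int
	count_blocks_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
	{
		uint64_t *count = arg;

		if (bc->bc_errno == 0)		/* skip unreadable blocks */
			(*count)++;
		return (0);			/* returning EINTR would abort the walk */
	}

	static uint64_t
	count_pool_blocks(spa_t *spa)
	{
		traverse_handle_t *th;
		uint64_t count = 0;
		int err;

		th = traverse_init(spa, count_blocks_cb, &count, ADVANCE_PRE,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(th, 0, -1ULL);	/* open interval: every birth txg */

		while ((err = traverse_more(th)) == EAGAIN)
			continue;

		traverse_fini(th);
		return (count);
	}

The shape mirrors traverse_dsl_dataset() above; only the segment setup (traverse_add_pool() instead of traverse_add_objset()) differs.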
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
new file mode 100644
index 000000000000..13fd8d4d9dce
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -0,0 +1,992 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
+#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
+#include <sys/dsl_pool.h>
+#include <sys/zap_impl.h> /* for fzap_default_block_shift */
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+ uint64_t arg1, uint64_t arg2);
+
+
+dmu_tx_t *
+dmu_tx_create_dd(dsl_dir_t *dd)
+{
+ dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+ tx->tx_dir = dd;
+ if (dd)
+ tx->tx_pool = dd->dd_pool;
+ list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
+ offsetof(dmu_tx_hold_t, txh_node));
+#ifdef ZFS_DEBUG
+ refcount_create(&tx->tx_space_written);
+ refcount_create(&tx->tx_space_freed);
+#endif
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create(objset_t *os)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir);
+ tx->tx_objset = os;
+ tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(NULL);
+
+ ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
+ tx->tx_pool = dp;
+ tx->tx_txg = txg;
+ tx->tx_anyobj = TRUE;
+
+ return (tx);
+}
+
+int
+dmu_tx_is_syncing(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
+int
+dmu_tx_private_ok(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
+static dmu_tx_hold_t *
+dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
+ enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
+{
+ dmu_tx_hold_t *txh;
+ dnode_t *dn = NULL;
+ int err;
+
+ if (object != DMU_NEW_OBJECT) {
+ err = dnode_hold(os->os, object, tx, &dn);
+ if (err) {
+ tx->tx_err = err;
+ return (NULL);
+ }
+
+ if (err == 0 && tx->tx_txg != 0) {
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
+ * problem, but there's no way for it to happen (for
+ * now, at least).
+ */
+ ASSERT(dn->dn_assigned_txg == 0);
+ dn->dn_assigned_txg = tx->tx_txg;
+ (void) refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ }
+
+ txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
+ txh->txh_tx = tx;
+ txh->txh_dnode = dn;
+#ifdef ZFS_DEBUG
+ txh->txh_type = type;
+ txh->txh_arg1 = arg1;
+ txh->txh_arg2 = arg2;
+#endif
+ list_insert_tail(&tx->tx_holds, txh);
+
+ return (txh);
+}
+
+void
+dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
+{
+ /*
+ * If we're syncing, they can manipulate any object anyhow, and
+ * the hold on the dnode_t can cause problems.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ (void) dmu_tx_hold_object_impl(tx, os,
+ object, THT_NEWOBJECT, 0, 0);
+ }
+}
+
+static int
+dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
+{
+ int err;
+ dmu_buf_impl_t *db;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold_level(dn, level, blkid, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db == NULL)
+ return (EIO);
+ err = dbuf_read(db, zio, DB_RF_CANFAIL);
+ dbuf_rele(db, FTAG);
+ return (err);
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+ dnode_t *dn = txh->txh_dnode;
+ uint64_t start, end, i;
+ int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
+ int err = 0;
+
+ if (len == 0)
+ return;
+
+ min_bs = SPA_MINBLOCKSHIFT;
+ max_bs = SPA_MAXBLOCKSHIFT;
+ min_ibs = DN_MIN_INDBLKSHIFT;
+ max_ibs = DN_MAX_INDBLKSHIFT;
+
+
+ /*
+ * For i/o error checking, read the first and last level-0
+ * blocks (if they are not aligned), and all the level-1 blocks.
+ */
+
+ if (dn) {
+ if (dn->dn_maxblkid == 0) {
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err)
+ goto out;
+ } else {
+ zio_t *zio = zio_root(dn->dn_objset->os_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ /* first level-0 block */
+ start = off >> dn->dn_datablkshift;
+ if (P2PHASE(off, dn->dn_datablksz) ||
+ len < dn->dn_datablksz) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, start);
+ if (err)
+ goto out;
+ }
+
+ /* last level-0 block */
+ end = (off+len-1) >> dn->dn_datablkshift;
+ if (end != start &&
+ P2PHASE(off+len, dn->dn_datablksz)) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, end);
+ if (err)
+ goto out;
+ }
+
+ /* level-1 blocks */
+ if (dn->dn_nlevels > 1) {
+ start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (i = start+1; i < end; i++) {
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err)
+ goto out;
+ }
+ }
+
+ err = zio_wait(zio);
+ if (err)
+ goto out;
+ }
+ }
+
+ /*
+ * If there's more than one block, the blocksize can't change,
+ * so we can make a more precise estimate. Alternatively,
+ * if the dnode's ibs is larger than max_ibs, always use that.
+ * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
+ * the code will still work correctly on existing pools.
+ */
+ if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
+ min_ibs = max_ibs = dn->dn_indblkshift;
+ if (dn->dn_datablkshift != 0)
+ min_bs = max_bs = dn->dn_datablkshift;
+ }
+
+ /*
+ * 'end' is the last thing we will access, not one past.
+ * This way we won't overflow when accessing the last byte.
+ */
+ start = P2ALIGN(off, 1ULL << max_bs);
+ end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
+ txh->txh_space_towrite += end - start + 1;
+
+ start >>= min_bs;
+ end >>= min_bs;
+
+ epbs = min_ibs - SPA_BLKPTRSHIFT;
+
+ /*
+ * The object contains at most 2^(64 - min_bs) blocks,
+ * and each indirect level maps 2^epbs.
+ */
+ for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
+ start >>= epbs;
+ end >>= epbs;
+ /*
+ * If we increase the number of levels of indirection,
+ * we'll need new blkid=0 indirect blocks. If start == 0,
+ * we're already accounting for those blocks; and if end == 0,
+ * we can't increase the number of levels beyond that.
+ */
+ if (start != 0 && end != 0)
+ txh->txh_space_towrite += 1ULL << max_ibs;
+ txh->txh_space_towrite += (end - start + 1) << max_ibs;
+ }
+
+ ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS);
+
+out:
+ if (err)
+ txh->txh_tx->tx_err = err;
+}
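To see the worst-case arithmetic at work, take a hypothetical existing dnode with 128 KB data blocks (dn_datablkshift = 17) and 16 KB indirect blocks (dn_indblkshift = 14), so min_bs = max_bs = 17, min_ibs = max_ibs = 14, and epbs = 14 - 7 = 7 (128 block pointers per indirect block). A 1 MB write at offset 0 charges end - start + 1 = 1 MB of level-0 data, and each pass of the loop (bits = 47, 40, ..., 5, i.e. seven passes) charges one more 16 KB indirect block, roughly 112 KB of indirect overhead. The estimate is deliberately pessimistic: it assumes every indirect level the object could ever grow to, not just the levels it has now.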
+
+static void
+dmu_tx_count_dnode(dmu_tx_hold_t *txh)
+{
+ dnode_t *dn = txh->txh_dnode;
+ dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
+ uint64_t space = mdn->dn_datablksz +
+ ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
+
+ if (dn && dn->dn_dbuf->db_blkptr &&
+ dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ dn->dn_dbuf->db_blkptr->blk_birth)) {
+ txh->txh_space_tooverwrite += space;
+ } else {
+ txh->txh_space_towrite += space;
+ }
+}
+
+void
+dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(len < DMU_MAX_ACCESS);
+ ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_WRITE, off, len);
+ if (txh == NULL)
+ return;
+
+ dmu_tx_count_write(txh, off, len);
+ dmu_tx_count_dnode(txh);
+}
+
+static void
+dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+ uint64_t blkid, nblks;
+ uint64_t space = 0;
+ dnode_t *dn = txh->txh_dnode;
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
+ int dirty;
+
+ /*
+ * We don't need to use any locking to check for dirtiness
+ * because it's OK if we get stale data -- the dnode may become
+ * dirty immediately after our check anyway. This is just a
+ * means to avoid the expensive count when we aren't sure we
+ * need it. We need to be able to deal with a dirty dnode.
+ */
+ dirty = list_link_active(&dn->dn_dirty_link[0]) |
+ list_link_active(&dn->dn_dirty_link[1]) |
+ list_link_active(&dn->dn_dirty_link[2]) |
+ list_link_active(&dn->dn_dirty_link[3]);
+ if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0)
+ return;
+
+ /*
+ * the struct_rwlock protects us against dn_phys->dn_nlevels
+ * changing, in case (against all odds) we manage to dirty &
+ * sync out the changes after we check for being dirty.
+ * also, dbuf_hold_impl() wants us to have the struct_rwlock.
+ *
+ * It's fine to use dn_datablkshift rather than the dn_phys
+ * equivalent because if it is changing, maxblkid==0 and we will
+ * bail.
+ */
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_phys->dn_maxblkid == 0) {
+ if (off == 0 && len >= dn->dn_datablksz) {
+ blkid = 0;
+ nblks = 1;
+ } else {
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+ } else {
+ blkid = off >> dn->dn_datablkshift;
+ nblks = (off + len) >> dn->dn_datablkshift;
+
+ if (blkid >= dn->dn_phys->dn_maxblkid) {
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+ if (blkid + nblks > dn->dn_phys->dn_maxblkid)
+ nblks = dn->dn_phys->dn_maxblkid - blkid;
+
+ /* don't bother after 128K blocks */
+ nblks = MIN(nblks, 128*1024);
+ }
+
+ if (dn->dn_phys->dn_nlevels == 1) {
+ int i;
+ for (i = 0; i < nblks; i++) {
+ blkptr_t *bp = dn->dn_phys->dn_blkptr;
+ ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
+ bp += blkid + i;
+ if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
+ dprintf_bp(bp, "can free old%s", "");
+ space += bp_get_dasize(spa, bp);
+ }
+ }
+ nblks = 0;
+ }
+
+ while (nblks) {
+ dmu_buf_impl_t *dbuf;
+ int err, epbs, blkoff, tochk;
+
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ blkoff = P2PHASE(blkid, 1<<epbs);
+ tochk = MIN((1<<epbs) - blkoff, nblks);
+
+ err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
+ if (err == 0) {
+ int i;
+ blkptr_t *bp;
+
+ err = dbuf_read(dbuf, NULL,
+ DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ dbuf_rele(dbuf, FTAG);
+ break;
+ }
+
+ bp = dbuf->db.db_data;
+ bp += blkoff;
+
+ for (i = 0; i < tochk; i++) {
+ if (dsl_dataset_block_freeable(ds,
+ bp[i].blk_birth)) {
+ dprintf_bp(&bp[i],
+ "can free old%s", "");
+ space += bp_get_dasize(spa, &bp[i]);
+ }
+ }
+ dbuf_rele(dbuf, FTAG);
+ }
+ if (err && err != ENOENT) {
+ txh->txh_tx->tx_err = err;
+ break;
+ }
+
+ blkid += tochk;
+ nblks -= tochk;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ txh->txh_space_tofree += space;
+}
+
+void
+dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
+{
+ dmu_tx_hold_t *txh;
+ dnode_t *dn;
+ uint64_t start, end, i;
+ int err, shift;
+ zio_t *zio;
+
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_FREE, off, len);
+ if (txh == NULL)
+ return;
+ dn = txh->txh_dnode;
+
+ /* first block */
+ if (off != 0)
+ dmu_tx_count_write(txh, off, 1);
+ /* last block */
+ if (len != DMU_OBJECT_END)
+ dmu_tx_count_write(txh, off+len, 1);
+
+ if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
+ return;
+ if (len == DMU_OBJECT_END)
+ len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
+
+ /*
+ * For i/o error checking, read the first and last level-0
+ * blocks, and all the level-1 blocks. The above count_write's
+ * will take care of the level-0 blocks.
+ */
+ if (dn->dn_nlevels > 1) {
+ shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ start = off >> shift;
+ end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
+
+ zio = zio_root(tx->tx_pool->dp_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (i = start; i <= end; i++) {
+ uint64_t ibyte = i << shift;
+ err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0);
+ i = ibyte >> shift;
+ if (err == ESRCH)
+ break;
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+ err = zio_wait(zio);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+
+ dmu_tx_count_dnode(txh);
+ dmu_tx_count_free(txh, off, len);
+}
+
+void
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
+{
+ dmu_tx_hold_t *txh;
+ dnode_t *dn;
+ uint64_t nblocks;
+ int epbs, err;
+
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_ZAP, add, (uintptr_t)name);
+ if (txh == NULL)
+ return;
+ dn = txh->txh_dnode;
+
+ dmu_tx_count_dnode(txh);
+
+ if (dn == NULL) {
+ /*
+ * We will be able to fit a new object's entries into one leaf
+ * block. So there will be at most 2 blocks total,
+ * including the header block.
+ */
+ dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
+ return;
+ }
+
+ ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
+
+ if (dn->dn_maxblkid == 0 && !add) {
+ /*
+ * If there is only one block (i.e. this is a micro-zap)
+ * and we are not adding anything, the accounting is simple.
+ */
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+
+ /*
+ * Use max block size here, since we don't know how much
+ * the size will change between now and the dbuf dirty call.
+ */
+ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ dn->dn_phys->dn_blkptr[0].blk_birth))
+ txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+ else
+ txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ return;
+ }
+
+ if (dn->dn_maxblkid > 0 && name) {
+ /*
+ * access the name in this fat-zap so that we'll check
+ * for i/o errors to the leaf blocks, etc.
+ */
+ err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
+ 8, 0, NULL);
+ if (err == EIO) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+
+ /*
+ * 3 blocks overwritten: target leaf, ptrtbl block, header block
+ * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
+ */
+ dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz,
+ (3 + (add ? 3 : 0)) << dn->dn_datablkshift);
+
+ /*
+ * If the modified blocks are scattered to the four winds,
+ * we'll have to modify an indirect twig for each.
+ */
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
+ txh->txh_space_towrite += 3 << dn->dn_indblkshift;
+}
+
+void
+dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_BONUS, 0, 0);
+ if (txh)
+ dmu_tx_count_dnode(txh);
+}
+
+void
+dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
+{
+ dmu_tx_hold_t *txh;
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ DMU_NEW_OBJECT, THT_SPACE, space, 0);
+
+ txh->txh_space_towrite += space;
+}
+
+int
+dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+ int holds = 0;
+
+ /*
+ * By asserting that the tx is assigned, we're counting the
+ * number of dn_tx_holds, which is the same as the number of
+ * dn_holds. Otherwise, we'd be counting dn_holds, but
+ * dn_tx_holds could be 0.
+ */
+ ASSERT(tx->tx_txg != 0);
+
+ /* if (tx->tx_anyobj == TRUE) */
+ /* return (0); */
+
+ for (txh = list_head(&tx->tx_holds); txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
+ holds++;
+ }
+
+ return (holds);
+}
+
+#ifdef ZFS_DEBUG
+void
+dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
+{
+ dmu_tx_hold_t *txh;
+ int match_object = FALSE, match_offset = FALSE;
+ dnode_t *dn = db->db_dnode;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
+ ASSERT3U(dn->dn_object, ==, db->db.db_object);
+
+ if (tx->tx_anyobj)
+ return;
+
+ /* XXX No checking on the meta dnode for now */
+ if (db->db.db_object == DMU_META_DNODE_OBJECT)
+ return;
+
+ for (txh = list_head(&tx->tx_holds); txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
+ if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
+ match_object = TRUE;
+ if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
+ int datablkshift = dn->dn_datablkshift ?
+ dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int shift = datablkshift + epbs * db->db_level;
+ uint64_t beginblk = shift >= 64 ? 0 :
+ (txh->txh_arg1 >> shift);
+ uint64_t endblk = shift >= 64 ? 0 :
+ ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
+ uint64_t blkid = db->db_blkid;
+
+ /* XXX txh_arg2 better not be zero... */
+
+ dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
+ txh->txh_type, beginblk, endblk);
+
+ switch (txh->txh_type) {
+ case THT_WRITE:
+ if (blkid >= beginblk && blkid <= endblk)
+ match_offset = TRUE;
+ /*
+ * We will let this hold work for the bonus
+ * buffer so that we don't need to hold it
+ * when creating a new object.
+ */
+ if (blkid == DB_BONUS_BLKID)
+ match_offset = TRUE;
+ /*
+ * They might have to increase nlevels,
+ * thus dirtying the new TLIBs. Or they
+ * might have to change the block size,
+ * thus dirtying the new lvl=0 blk=0.
+ */
+ if (blkid == 0)
+ match_offset = TRUE;
+ break;
+ case THT_FREE:
+ if (blkid == beginblk &&
+ (txh->txh_arg1 != 0 ||
+ dn->dn_maxblkid == 0))
+ match_offset = TRUE;
+ if (blkid == endblk &&
+ txh->txh_arg2 != DMU_OBJECT_END)
+ match_offset = TRUE;
+ break;
+ case THT_BONUS:
+ if (blkid == DB_BONUS_BLKID)
+ match_offset = TRUE;
+ break;
+ case THT_ZAP:
+ match_offset = TRUE;
+ break;
+ case THT_NEWOBJECT:
+ match_object = TRUE;
+ break;
+ default:
+ ASSERT(!"bad txh_type");
+ }
+ }
+ if (match_object && match_offset)
+ return;
+ }
+ panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
+ (u_longlong_t)db->db.db_object, db->db_level,
+ (u_longlong_t)db->db_blkid);
+}
+#endif
+
+static int
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ dmu_tx_hold_t *txh;
+ uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite;
+
+ ASSERT3U(tx->tx_txg, ==, 0);
+ if (tx->tx_err)
+ return (tx->tx_err);
+
+ tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
+ tx->tx_needassign_txh = NULL;
+
+ /*
+ * NB: No error returns are allowed after txg_hold_open, but
+ * before processing the dnode holds, due to the
+ * dmu_tx_unassign() logic.
+ */
+
+ towrite = tofree = tooverwrite = 0;
+ for (txh = list_head(&tx->tx_holds); txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+ if (dn != NULL) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_assigned_txg == tx->tx_txg - 1) {
+ mutex_exit(&dn->dn_mtx);
+ tx->tx_needassign_txh = txh;
+ return (ERESTART);
+ }
+ if (dn->dn_assigned_txg == 0)
+ dn->dn_assigned_txg = tx->tx_txg;
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+ (void) refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ towrite += txh->txh_space_towrite;
+ tofree += txh->txh_space_tofree;
+ tooverwrite += txh->txh_space_tooverwrite;
+ }
+
+ /*
+ * NB: This check must be after we've held the dnodes, so that
+ * the dmu_tx_unassign() logic will work properly
+ */
+ if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
+ return (ERESTART);
+
+ /*
+ * If a snapshot has been taken since we made our estimates,
+ * assume that we won't be able to free or overwrite anything.
+ */
+ if (tx->tx_objset &&
+ dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
+ tx->tx_lastsnap_txg) {
+ towrite += tooverwrite;
+ tooverwrite = tofree = 0;
+ }
+
+ /*
+ * Convert logical size to worst-case allocated size.
+ */
+ fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
+ lsize = towrite + tooverwrite;
+ asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+
+#ifdef ZFS_DEBUG
+ tx->tx_space_towrite = asize;
+ tx->tx_space_tofree = tofree;
+ tx->tx_space_tooverwrite = tooverwrite;
+#endif
+
+ if (tx->tx_dir && asize != 0) {
+ int err = dsl_dir_tempreserve_space(tx->tx_dir,
+ lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
+ if (err)
+ return (err);
+ }
+
+ return (0);
+}
+
+static void
+dmu_tx_unassign(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *txh;
+
+ if (tx->tx_txg == 0)
+ return;
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ if (dn == NULL)
+ continue;
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ txg_rele_to_sync(&tx->tx_txgh);
+
+ tx->tx_lasttried_txg = tx->tx_txg;
+ tx->tx_txg = 0;
+}
+
+/*
+ * Assign tx to a transaction group. txg_how can be one of:
+ *
+ * (1) TXG_WAIT. If the current open txg is full, waits until there's
+ * a new one. This should be used when you're not holding locks.
+ * It will only fail if we're truly out of space (or over quota).
+ *
+ * (2) TXG_NOWAIT. If we can't assign into the current open txg without
+ * blocking, returns immediately with ERESTART. This should be used
+ * whenever you're holding locks. On an ERESTART error, the caller
+ * should drop locks, do a dmu_tx_wait(tx), and try again.
+ *
+ * (3) A specific txg. Use this if you need to ensure that multiple
+ * transactions all sync in the same txg. Like TXG_NOWAIT, it
+ * returns ERESTART if it can't assign you into the requested txg.
+ */
+int
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(txg_how != 0);
+ ASSERT(!dsl_pool_sync_context(tx->tx_pool));
+
+ while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
+ dmu_tx_unassign(tx);
+
+ if (err != ERESTART || txg_how != TXG_WAIT)
+ return (err);
+
+ dmu_tx_wait(tx);
+ }
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ return (0);
+}
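A minimal sketch of the TXG_NOWAIT protocol described above, written as a hypothetical consumer (the function name is invented; the actual modification of the held range is elided):

	static int
	update_range(objset_t *os, uint64_t object, uint64_t off, int len)
	{
		dmu_tx_t *tx;
		int err;

	top:
		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, object, off, len);
		err = dmu_tx_assign(tx, TXG_NOWAIT);
		if (err != 0) {
			if (err == ERESTART) {
				/*
				 * The open txg was full or a dnode hold
				 * conflicted; wait for the condition to
				 * clear and retry with a fresh tx.
				 */
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);	/* hard error, e.g. ENOSPC */
			return (err);
		}

		/* ... dirty the held range here, under tx ... */

		dmu_tx_commit(tx);
		return (0);
	}

With TXG_WAIT the retry loop collapses to a single dmu_tx_assign() call, at the cost of not being allowed to hold locks across it.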
+
+void
+dmu_tx_wait(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(tx->tx_lasttried_txg != 0);
+
+ if (tx->tx_needassign_txh) {
+ dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
+
+ mutex_enter(&dn->dn_mtx);
+ while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
+ cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
+ mutex_exit(&dn->dn_mtx);
+ tx->tx_needassign_txh = NULL;
+ } else {
+ txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
+ }
+}
+
+void
+dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
+{
+#ifdef ZFS_DEBUG
+ if (tx->tx_dir == NULL || delta == 0)
+ return;
+
+ if (delta > 0) {
+ ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
+ tx->tx_space_towrite);
+ (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
+ } else {
+ (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
+ }
+#endif
+}
+
+void
+dmu_tx_commit(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg != 0);
+
+ while (txh = list_head(&tx->tx_holds)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ list_remove(&tx->tx_holds, txh);
+ kmem_free(txh, sizeof (dmu_tx_hold_t));
+ if (dn == NULL)
+ continue;
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ dnode_rele(dn, tx);
+ }
+
+ if (tx->tx_tempreserve_cookie)
+ dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
+
+ if (tx->tx_anyobj == FALSE)
+ txg_rele_to_sync(&tx->tx_txgh);
+#ifdef ZFS_DEBUG
+ dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
+ tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
+ tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
+ refcount_destroy_many(&tx->tx_space_written,
+ refcount_count(&tx->tx_space_written));
+ refcount_destroy_many(&tx->tx_space_freed,
+ refcount_count(&tx->tx_space_freed));
+#endif
+ kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+void
+dmu_tx_abort(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+
+ while (txh = list_head(&tx->tx_holds)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ list_remove(&tx->tx_holds, txh);
+ kmem_free(txh, sizeof (dmu_tx_hold_t));
+ if (dn != NULL)
+ dnode_rele(dn, tx);
+ }
+#ifdef ZFS_DEBUG
+ refcount_destroy_many(&tx->tx_space_written,
+ refcount_count(&tx->tx_space_written));
+ refcount_destroy_many(&tx->tx_space_freed,
+ refcount_count(&tx->tx_space_freed));
+#endif
+ kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+uint64_t
+dmu_tx_get_txg(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg != 0);
+ return (tx->tx_txg);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
new file mode 100644
index 000000000000..78d625cd41bd
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -0,0 +1,655 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+
+/*
+ * I'm against tune-ables, but these should probably exist as tweakable globals
+ * until we can get this working the way we want it to.
+ */
+
+int zfs_prefetch_disable = 0;
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN,
+ &zfs_prefetch_disable, 0, "Disable prefetch");
+
+/* max # of streams per zfetch */
+uint32_t zfetch_max_streams = 8;
+/* min time before stream reclaim */
+uint32_t zfetch_min_sec_reap = 2;
+/* max number of blocks to fetch at a time */
+uint32_t zfetch_block_cap = 256;
+/* number of bytes in an array_read at which we stop prefetching (1MB) */
+uint64_t zfetch_array_rd_sz = 1024 * 1024;
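Because the sysctl above is declared CTLFLAG_RDTUN, the knob is a boot-time tunable rather than a run-time switch; the usual way to disable prefetch on FreeBSD would be a loader.conf entry (standard tunable mechanics, not specific to this commit):

	vfs.zfs.prefetch_disable="1"

set in /boot/loader.conf (or via kenv) before the zfs module is loaded.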
+
+/* forward decls for static routines */
+static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
+static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
+static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
+static int dmu_zfetch_find(zfetch_t *, zstream_t *, int);
+static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
+static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
+static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
+static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
+
+/*
+ * Given a zfetch structure and a zstream structure, determine whether the
+ * blocks to be read are part of a co-linear pair of existing prefetch
+ * streams. If a set is found, coalesce the streams, removing one, and
+ * configure the prefetch so it looks for a strided access pattern.
+ *
+ * In other words: if we find two sequential access streams that are
+ * the same length and distance N apart, and this read is N from the
+ * last stream, then we are probably in a strided access pattern. So
+ * combine the two sequential streams into a single strided stream.
+ *
+ * If no co-linear streams are found, return NULL.
+ */
+static int
+dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
+{
+ zstream_t *z_walk;
+ zstream_t *z_comp;
+
+ if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
+ return (0);
+
+ if (zh == NULL) {
+ rw_exit(&zf->zf_rwlock);
+ return (0);
+ }
+
+ for (z_walk = list_head(&zf->zf_stream); z_walk;
+ z_walk = list_next(&zf->zf_stream, z_walk)) {
+ for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
+ z_comp = list_next(&zf->zf_stream, z_comp)) {
+ int64_t diff;
+
+ if (z_walk->zst_len != z_walk->zst_stride ||
+ z_comp->zst_len != z_comp->zst_stride) {
+ continue;
+ }
+
+ diff = z_comp->zst_offset - z_walk->zst_offset;
+ if (z_comp->zst_offset + diff == zh->zst_offset) {
+ z_walk->zst_offset = zh->zst_offset;
+ z_walk->zst_direction = diff < 0 ? -1 : 1;
+ z_walk->zst_stride =
+ diff * z_walk->zst_direction;
+ z_walk->zst_ph_offset =
+ zh->zst_offset + z_walk->zst_stride;
+ dmu_zfetch_stream_remove(zf, z_comp);
+ mutex_destroy(&z_comp->zst_lock);
+ kmem_free(z_comp, sizeof (zstream_t));
+
+ dmu_zfetch_dofetch(zf, z_walk);
+
+ rw_exit(&zf->zf_rwlock);
+ return (1);
+ }
+
+ diff = z_walk->zst_offset - z_comp->zst_offset;
+ if (z_walk->zst_offset + diff == zh->zst_offset) {
+ z_walk->zst_offset = zh->zst_offset;
+ z_walk->zst_direction = diff < 0 ? -1 : 1;
+ z_walk->zst_stride =
+ diff * z_walk->zst_direction;
+ z_walk->zst_ph_offset =
+ zh->zst_offset + z_walk->zst_stride;
+ dmu_zfetch_stream_remove(zf, z_comp);
+ mutex_destroy(&z_comp->zst_lock);
+ kmem_free(z_comp, sizeof (zstream_t));
+
+ dmu_zfetch_dofetch(zf, z_walk);
+
+ rw_exit(&zf->zf_rwlock);
+ return (1);
+ }
+ }
+ }
+
+ rw_exit(&zf->zf_rwlock);
+ return (0);
+}
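A concrete collapse, under the len == stride precondition the loop checks: suppose z_walk covers blocks 0-1 (offset 0, len 2, stride 2), z_comp covers blocks 100-101 (offset 100, len 2, stride 2), and the incoming read zh starts at block 200. Then diff = 100 and z_comp->zst_offset + diff == zh->zst_offset, so the two streams merge: z_walk becomes offset 200 with stride 100, direction 1 (forward), and zst_ph_offset 300; z_comp is freed; and dmu_zfetch_dofetch() begins prefetching at block 300 and onward at stride-100 intervals.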
+
+/*
+ * Given a zstream_t, determine the bounds of the prefetch. Then call the
+ * routine that actually prefetches the individual blocks.
+ */
+static void
+dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
+{
+ uint64_t prefetch_tail;
+ uint64_t prefetch_limit;
+ uint64_t prefetch_ofst;
+ uint64_t prefetch_len;
+ uint64_t blocks_fetched;
+
+ zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
+ zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
+
+ prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
+ (int64_t)(zs->zst_offset + zs->zst_stride));
+ /*
+ * XXX: use a faster division method?
+ */
+ prefetch_limit = zs->zst_offset + zs->zst_len +
+ (zs->zst_cap * zs->zst_stride) / zs->zst_len;
+
+ while (prefetch_tail < prefetch_limit) {
+ prefetch_ofst = zs->zst_offset + zs->zst_direction *
+ (prefetch_tail - zs->zst_offset);
+
+ prefetch_len = zs->zst_len;
+
+ /*
+ * Don't prefetch beyond the end of the file, if working
+ * backwards.
+ */
+ if ((zs->zst_direction == ZFETCH_BACKWARD) &&
+ (prefetch_ofst > prefetch_tail)) {
+ prefetch_len += prefetch_ofst;
+ prefetch_ofst = 0;
+ }
+
+ /* don't prefetch more than we're supposed to */
+ if (prefetch_len > zs->zst_len)
+ break;
+
+ blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
+ prefetch_ofst, zs->zst_len);
+
+ prefetch_tail += zs->zst_stride;
+ /* stop if we've run out of stuff to prefetch */
+ if (blocks_fetched < zs->zst_len)
+ break;
+ }
+ zs->zst_ph_offset = prefetch_tail;
+ zs->zst_last = lbolt;
+}
+
+/*
+ * This takes a pointer to a zfetch structure and a dnode. It performs the
+ * necessary setup for the zfetch structure, grokking data from the
+ * associated dnode.
+ */
+void
+dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
+{
+ if (zf == NULL) {
+ return;
+ }
+
+ zf->zf_dnode = dno;
+ zf->zf_stream_cnt = 0;
+ zf->zf_alloc_fail = 0;
+
+ list_create(&zf->zf_stream, sizeof (zstream_t),
+ offsetof(zstream_t, zst_node));
+
+ rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
+}
+
+/*
+ * This function computes the actual size, in blocks, that can be prefetched,
+ * and fetches it.
+ */
+static uint64_t
+dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+ uint64_t fetchsz;
+ uint64_t i;
+
+ fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
+
+ for (i = 0; i < fetchsz; i++) {
+ dbuf_prefetch(dn, blkid + i);
+ }
+
+ return (fetchsz);
+}
+
+/*
+ * this function returns the number of blocks that would be prefetched, based
+ * upon the supplied dnode, blockid, and nblks. This is used so that we can
+ * update streams in place, and then prefetch with their old value after the
+ * fact. This way, we can delay the prefetch, but subsequent accesses to the
+ * stream won't result in the same data being prefetched multiple times.
+ */
+static uint64_t
+dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+ uint64_t fetchsz;
+
+ if (blkid > dn->dn_maxblkid) {
+ return (0);
+ }
+
+ /* compute fetch size */
+ if (blkid + nblks + 1 > dn->dn_maxblkid) {
+ fetchsz = (dn->dn_maxblkid - blkid) + 1;
+ ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
+ } else {
+ fetchsz = nblks;
+ }
+
+
+ return (fetchsz);
+}
+
+/*
+ * Given a zfetch structure and a search zstream, see if there is an associated
+ * zstream for this block read. If so, start a prefetch for the stream it
+ * located and return true; otherwise return false.
+ */
+static int
+dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
+{
+ zstream_t *zs;
+ int64_t diff;
+ int reset = !prefetched;
+ int rc = 0;
+
+ if (zh == NULL)
+ return (0);
+
+ /*
+ * XXX: This locking strategy is a bit coarse; however, its impact has
+ * yet to be tested. If this turns out to be an issue, it can be
+ * modified in a number of different ways.
+ */
+
+ rw_enter(&zf->zf_rwlock, RW_READER);
+top:
+
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+
+ /*
+ * XXX - should this be an assert?
+ */
+ if (zs->zst_len == 0) {
+ /* bogus stream */
+ continue;
+ }
+
+ /*
+ * We hit this case when we are in a strided prefetch stream:
+ * we will read "len" blocks before "striding".
+ */
+ if (zh->zst_offset >= zs->zst_offset &&
+ zh->zst_offset < zs->zst_offset + zs->zst_len) {
+ /* already fetched */
+ rc = 1;
+ goto out;
+ }
+
+ /*
+ * This is the forward sequential read case: we increment
+ * len by one each time we hit here, so we will enter this
+ * case on every read.
+ */
+ if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
+
+ reset = !prefetched && zs->zst_len > 1;
+
+ mutex_enter(&zs->zst_lock);
+
+ if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+ zs->zst_len += zh->zst_len;
+ diff = zs->zst_len - zfetch_block_cap;
+ if (diff > 0) {
+ zs->zst_offset += diff;
+ zs->zst_len = zs->zst_len > diff ?
+ zs->zst_len - diff : 0;
+ }
+ zs->zst_direction = ZFETCH_FORWARD;
+
+ break;
+
+ /*
+ * Same as above, but reading backwards through the file.
+ */
+ } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
+ /* backwards sequential access */
+
+ reset = !prefetched && zs->zst_len > 1;
+
+ mutex_enter(&zs->zst_lock);
+
+ if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset = zs->zst_offset > zh->zst_len ?
+ zs->zst_offset - zh->zst_len : 0;
+ zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
+ zs->zst_ph_offset - zh->zst_len : 0;
+ zs->zst_len += zh->zst_len;
+
+ diff = zs->zst_len - zfetch_block_cap;
+ if (diff > 0) {
+ zs->zst_ph_offset = zs->zst_ph_offset > diff ?
+ zs->zst_ph_offset - diff : 0;
+ zs->zst_len = zs->zst_len > diff ?
+ zs->zst_len - diff : zs->zst_len;
+ }
+ zs->zst_direction = ZFETCH_BACKWARD;
+
+ break;
+
+ } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
+ zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+ /* strided forward access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
+ zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset += zs->zst_stride;
+ zs->zst_direction = ZFETCH_FORWARD;
+
+ break;
+
+ } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
+ zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+ /* strided reverse access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
+ zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset = zs->zst_offset > zs->zst_stride ?
+ zs->zst_offset - zs->zst_stride : 0;
+ zs->zst_ph_offset = (zs->zst_ph_offset >
+ (2 * zs->zst_stride)) ?
+ (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
+ zs->zst_direction = ZFETCH_BACKWARD;
+
+ break;
+ }
+ }
+
+ if (zs) {
+ if (reset) {
+ zstream_t *remove = zs;
+
+ rc = 0;
+ mutex_exit(&zs->zst_lock);
+ rw_exit(&zf->zf_rwlock);
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+ /*
+ * Relocate the stream, in case someone removed
+ * it while we were acquiring the WRITER lock.
+ */
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+ if (zs == remove) {
+ dmu_zfetch_stream_remove(zf, zs);
+ mutex_destroy(&zs->zst_lock);
+ kmem_free(zs, sizeof (zstream_t));
+ break;
+ }
+ }
+ } else {
+ rc = 1;
+ dmu_zfetch_dofetch(zf, zs);
+ mutex_exit(&zs->zst_lock);
+ }
+ }
+out:
+ rw_exit(&zf->zf_rwlock);
+ return (rc);
+}
+
+/*
+ * Clean up state associated with a zfetch structure. This frees allocated
+ * structure members, empties the zf_stream list, and generally makes things
+ * nice. This doesn't free the zfetch_t itself, that's left to the caller.
+ */
+void
+dmu_zfetch_rele(zfetch_t *zf)
+{
+ zstream_t *zs;
+ zstream_t *zs_next;
+
+ ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
+
+ for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
+ zs_next = list_next(&zf->zf_stream, zs);
+
+ list_remove(&zf->zf_stream, zs);
+ mutex_destroy(&zs->zst_lock);
+ kmem_free(zs, sizeof (zstream_t));
+ }
+ list_destroy(&zf->zf_stream);
+ rw_destroy(&zf->zf_rwlock);
+
+ zf->zf_dnode = NULL;
+}
+
+/*
+ * Given a zfetch and zstream structure, insert the zstream structure into the
+ * stream list contained within the zfetch structure. Perform the appropriate
+ * book-keeping. It is possible that another thread has inserted a stream which
+ * matches one that we are about to insert, so we must be sure to check for this
+ * case. If one is found, return failure, and let the caller clean up the
+ * duplicates.
+ */
+static int
+dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
+{
+ zstream_t *zs_walk;
+ zstream_t *zs_next;
+
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+ for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
+ zs_next = list_next(&zf->zf_stream, zs_walk);
+
+ if (dmu_zfetch_streams_equal(zs_walk, zs)) {
+ return (0);
+ }
+ }
+
+ list_insert_head(&zf->zf_stream, zs);
+ zf->zf_stream_cnt++;
+
+ return (1);
+}
+
+
+/*
+ * Walk the list of zstreams in the given zfetch, find an old one (by time), and
+ * reclaim it for use by the caller.
+ */
+static zstream_t *
+dmu_zfetch_stream_reclaim(zfetch_t *zf)
+{
+ zstream_t *zs;
+
+ if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
+ return (0);
+
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+
+ if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
+ break;
+ }
+
+ if (zs) {
+ dmu_zfetch_stream_remove(zf, zs);
+ mutex_destroy(&zs->zst_lock);
+ bzero(zs, sizeof (zstream_t));
+ } else {
+ zf->zf_alloc_fail++;
+ }
+ rw_exit(&zf->zf_rwlock);
+
+ return (zs);
+}
+
+/*
+ * Given a zfetch and zstream structure, remove the zstream structure from its
+ * container in the zfetch structure. Perform the appropriate book-keeping.
+ */
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
+{
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+ list_remove(&zf->zf_stream, zs);
+ zf->zf_stream_cnt--;
+}
+
+static int
+dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
+{
+ if (zs1->zst_offset != zs2->zst_offset)
+ return (0);
+
+ if (zs1->zst_len != zs2->zst_len)
+ return (0);
+
+ if (zs1->zst_stride != zs2->zst_stride)
+ return (0);
+
+ if (zs1->zst_ph_offset != zs2->zst_ph_offset)
+ return (0);
+
+ if (zs1->zst_cap != zs2->zst_cap)
+ return (0);
+
+ if (zs1->zst_direction != zs2->zst_direction)
+ return (0);
+
+ return (1);
+}
+
+/*
+ * This is the prefetch entry point. It calls all of the other dmu_zfetch
+ * routines to create, delete, find, or operate upon prefetch streams.
+ */
+void
+dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
+{
+ zstream_t zst;
+ zstream_t *newstream;
+ int fetched;
+ int inserted;
+ unsigned int blkshft;
+ uint64_t blksz;
+
+ if (zfs_prefetch_disable)
+ return;
+
+	/* files that aren't a power-of-2 blocksize are only one block -- nothing to do */
+ if (!zf->zf_dnode->dn_datablkshift)
+ return;
+
+ /* convert offset and size, into blockid and nblocks */
+ blkshft = zf->zf_dnode->dn_datablkshift;
+ blksz = (1 << blkshft);
+
+ bzero(&zst, sizeof (zstream_t));
+ zst.zst_offset = offset >> blkshft;
+ zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
+ P2ALIGN(offset, blksz)) >> blkshft;
+
+ fetched = dmu_zfetch_find(zf, &zst, prefetched);
+ if (!fetched) {
+ fetched = dmu_zfetch_colinear(zf, &zst);
+ }
+
+ if (!fetched) {
+ newstream = dmu_zfetch_stream_reclaim(zf);
+
+ /*
+ * we still couldn't find a stream, drop the lock, and allocate
+ * one if possible. Otherwise, give up and go home.
+ */
+ if (newstream == NULL) {
+ uint64_t maxblocks;
+ uint32_t max_streams;
+ uint32_t cur_streams;
+
+ cur_streams = zf->zf_stream_cnt;
+ maxblocks = zf->zf_dnode->dn_maxblkid;
+
+ max_streams = MIN(zfetch_max_streams,
+ (maxblocks / zfetch_block_cap));
+ if (max_streams == 0) {
+ max_streams++;
+ }
+
+ if (cur_streams >= max_streams) {
+ return;
+ }
+
+ newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
+ }
+
+ newstream->zst_offset = zst.zst_offset;
+ newstream->zst_len = zst.zst_len;
+ newstream->zst_stride = zst.zst_len;
+ newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
+ newstream->zst_cap = zst.zst_len;
+ newstream->zst_direction = ZFETCH_FORWARD;
+ newstream->zst_last = lbolt;
+
+ mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+ inserted = dmu_zfetch_stream_insert(zf, newstream);
+ rw_exit(&zf->zf_rwlock);
+
+ if (!inserted) {
+ mutex_destroy(&newstream->zst_lock);
+ kmem_free(newstream, sizeof (zstream_t));
+ }
+ }
+}
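+
+/*
+ * Worked example for the offset/size conversion in dmu_zfetch() above
+ * (illustrative only, assuming the common 128K data block size, i.e.
+ * dn_datablkshift == 17): a read of 128K starting at byte offset 192K
+ * touches bytes [192K, 320K), so zst_offset = 192K >> 17 = 1 and
+ * zst_len = (P2ROUNDUP(320K, 128K) - P2ALIGN(192K, 128K)) >> 17 =
+ * (384K - 128K) >> 17 = 2 blocks.
+ */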
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c
new file mode 100644
index 000000000000..65bb51870247
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -0,0 +1,1370 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+
+static int free_range_compar(const void *node1, const void *node2);
+
+static kmem_cache_t *dnode_cache;
+
+static dnode_phys_t dnode_phys_zero;
+
+int zfs_default_bs = SPA_MINBLOCKSHIFT;
+int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+
+/* ARGSUSED */
+static int
+dnode_cons(void *arg, void *unused, int kmflag)
+{
+ int i;
+ dnode_t *dn = arg;
+ bzero(dn, sizeof (dnode_t));
+
+ cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
+ rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ refcount_create(&dn->dn_holds);
+ refcount_create(&dn->dn_tx_holds);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ avl_create(&dn->dn_ranges[i], free_range_compar,
+ sizeof (free_range_t),
+ offsetof(struct free_range, fr_node));
+ list_create(&dn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ }
+
+ list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dnode_dest(void *arg, void *unused)
+{
+ int i;
+ dnode_t *dn = arg;
+
+ cv_destroy(&dn->dn_notxholds);
+ rw_destroy(&dn->dn_struct_rwlock);
+ mutex_destroy(&dn->dn_mtx);
+ mutex_destroy(&dn->dn_dbufs_mtx);
+ refcount_destroy(&dn->dn_holds);
+ refcount_destroy(&dn->dn_tx_holds);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ avl_destroy(&dn->dn_ranges[i]);
+ list_destroy(&dn->dn_dirty_records[i]);
+ }
+
+ list_destroy(&dn->dn_dbufs);
+}
+
+void
+dnode_init(void)
+{
+ dnode_cache = kmem_cache_create("dnode_t",
+ sizeof (dnode_t),
+ 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+}
+
+void
+dnode_fini(void)
+{
+ kmem_cache_destroy(dnode_cache);
+}
+
+
+#ifdef ZFS_DEBUG
+void
+dnode_verify(dnode_t *dn)
+{
+ int drop_struct_lock = FALSE;
+
+ ASSERT(dn->dn_phys);
+ ASSERT(dn->dn_objset);
+
+ ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+
+ if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
+ return;
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
+ int i;
+ ASSERT3U(dn->dn_indblkshift, >=, 0);
+ ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
+ if (dn->dn_datablkshift) {
+ ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
+ ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
+ }
+ ASSERT3U(dn->dn_nlevels, <=, 30);
+ ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES);
+ ASSERT3U(dn->dn_nblkptr, >=, 1);
+ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+ ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(dn->dn_datablksz, ==,
+ dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
+ ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
+ dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
+ }
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE)
+ ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL);
+ if (dn->dn_dbuf != NULL) {
+ ASSERT3P(dn->dn_phys, ==,
+ (dnode_phys_t *)dn->dn_dbuf->db.db_data +
+ (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
+ }
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+}
+#endif
+
+void
+dnode_byteswap(dnode_phys_t *dnp)
+{
+ uint64_t *buf64 = (void*)&dnp->dn_blkptr;
+ int i;
+
+ if (dnp->dn_type == DMU_OT_NONE) {
+ bzero(dnp, sizeof (dnode_phys_t));
+ return;
+ }
+
+ dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
+ dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
+ dnp->dn_used = BSWAP_64(dnp->dn_used);
+
+ /*
+ * dn_nblkptr is only one byte, so it's OK to read it in either
+	 * byte order. We can't read dn_bonuslen.
+ */
+ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
+ ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
+ for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
+ buf64[i] = BSWAP_64(buf64[i]);
+
+ /*
+ * OK to check dn_bonuslen for zero, because it won't matter if
+ * we have the wrong byte order. This is necessary because the
+ * dnode dnode is smaller than a regular dnode.
+ */
+ if (dnp->dn_bonuslen != 0) {
+ /*
+ * Note that the bonus length calculated here may be
+ * longer than the actual bonus buffer. This is because
+ * we always put the bonus buffer after the last block
+ * pointer (instead of packing it against the end of the
+ * dnode buffer).
+ */
+ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
+ size_t len = DN_MAX_BONUSLEN - off;
+ ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
+ dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
+ }
+}
+
+void
+dnode_buf_byteswap(void *vbuf, size_t size)
+{
+ dnode_phys_t *buf = vbuf;
+ int i;
+
+ ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
+ ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
+
+ size >>= DNODE_SHIFT;
+ for (i = 0; i < size; i++) {
+ dnode_byteswap(buf);
+ buf++;
+ }
+}
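+
+/*
+ * Illustrative example: with 512-byte on-disk dnodes (DNODE_SHIFT == 9), a
+ * 16K dnode buffer passed to dnode_buf_byteswap() holds 16384 >> 9 = 32
+ * dnode_phys_t entries, each of which is byteswapped in place above.
+ */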
+
+static int
+free_range_compar(const void *node1, const void *node2)
+{
+ const free_range_t *rp1 = node1;
+ const free_range_t *rp2 = node2;
+
+ if (rp1->fr_blkid < rp2->fr_blkid)
+ return (-1);
+ else if (rp1->fr_blkid > rp2->fr_blkid)
+ return (1);
+ else return (0);
+}
+
+static void
+dnode_setdblksz(dnode_t *dn, int size)
+{
+ ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
+ ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
+ 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
+ dn->dn_datablksz = size;
+ dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
+ dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+}
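+
+/*
+ * Illustrative example: for a power-of-2 block size of 128K,
+ * dnode_setdblksz() sets dn_datablkszsec = 131072 >> SPA_MINBLOCKSHIFT = 256
+ * sectors and dn_datablkshift = highbit(131071) = 17; for a non-power-of-2
+ * size the shift is left at 0, which callers use to recognize single-block
+ * objects.
+ */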
+
+static dnode_t *
+dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+ uint64_t object)
+{
+ dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+ (void) dnode_cons(dn, NULL, 0); /* XXX */
+
+ dn->dn_objset = os;
+ dn->dn_object = object;
+ dn->dn_dbuf = db;
+ dn->dn_phys = dnp;
+
+ if (dnp->dn_datablkszsec)
+ dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ dn->dn_indblkshift = dnp->dn_indblkshift;
+ dn->dn_nlevels = dnp->dn_nlevels;
+ dn->dn_type = dnp->dn_type;
+ dn->dn_nblkptr = dnp->dn_nblkptr;
+ dn->dn_checksum = dnp->dn_checksum;
+ dn->dn_compress = dnp->dn_compress;
+ dn->dn_bonustype = dnp->dn_bonustype;
+ dn->dn_bonuslen = dnp->dn_bonuslen;
+ dn->dn_maxblkid = dnp->dn_maxblkid;
+
+ dmu_zfetch_init(&dn->dn_zfetch, dn);
+
+ ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+ mutex_enter(&os->os_lock);
+ list_insert_head(&os->os_dnodes, dn);
+ mutex_exit(&os->os_lock);
+
+ return (dn);
+}
+
+static void
+dnode_destroy(dnode_t *dn)
+{
+ objset_impl_t *os = dn->dn_objset;
+
+#ifdef ZFS_DEBUG
+ int i;
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+ ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
+ ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
+ }
+ ASSERT(NULL == list_head(&dn->dn_dbufs));
+#endif
+
+ mutex_enter(&os->os_lock);
+ list_remove(&os->os_dnodes, dn);
+ mutex_exit(&os->os_lock);
+
+ if (dn->dn_dirtyctx_firstset) {
+ kmem_free(dn->dn_dirtyctx_firstset, 1);
+ dn->dn_dirtyctx_firstset = NULL;
+ }
+ dmu_zfetch_rele(&dn->dn_zfetch);
+ if (dn->dn_bonus) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_evict(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ }
+ kmem_cache_free(dnode_cache, dn);
+}
+
+void
+dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ int i;
+
+ if (blocksize == 0)
+ blocksize = 1 << zfs_default_bs;
+ else if (blocksize > SPA_MAXBLOCKSIZE)
+ blocksize = SPA_MAXBLOCKSIZE;
+ else
+ blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
+
+ if (ibs == 0)
+ ibs = zfs_default_ibs;
+
+ ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
+
+ dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
+ dn->dn_object, tx->tx_txg, blocksize, ibs);
+
+ ASSERT(dn->dn_type == DMU_OT_NONE);
+ ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
+ ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
+ ASSERT(ot != DMU_OT_NONE);
+ ASSERT3U(ot, <, DMU_OT_NUMTYPES);
+ ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype != DMU_OT_NONE && bonuslen != 0));
+ ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+ ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT(dn->dn_type == DMU_OT_NONE);
+ ASSERT3U(dn->dn_maxblkid, ==, 0);
+ ASSERT3U(dn->dn_allocated_txg, ==, 0);
+ ASSERT3U(dn->dn_assigned_txg, ==, 0);
+ ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+ ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
+ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
+ ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+ ASSERT3U(dn->dn_next_blksz[i], ==, 0);
+ ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+ ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
+ ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
+ }
+
+ dn->dn_type = ot;
+ dnode_setdblksz(dn, blocksize);
+ dn->dn_indblkshift = ibs;
+ dn->dn_nlevels = 1;
+ dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ dn->dn_bonustype = bonustype;
+ dn->dn_bonuslen = bonuslen;
+ dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+ dn->dn_compress = ZIO_COMPRESS_INHERIT;
+ dn->dn_dirtyctx = 0;
+
+ dn->dn_free_txg = 0;
+ if (dn->dn_dirtyctx_firstset) {
+ kmem_free(dn->dn_dirtyctx_firstset, 1);
+ dn->dn_dirtyctx_firstset = NULL;
+ }
+
+ dn->dn_allocated_txg = tx->tx_txg;
+
+ dnode_setdirty(dn, tx);
+ dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
+ dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
+}
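+
+/*
+ * Worked example for the dn_nblkptr computation in dnode_allocate() above
+ * (illustrative, assuming DN_MAX_BONUSLEN == 320 and 128-byte block
+ * pointers, i.e. SPA_BLKPTRSHIFT == 7): a bonuslen of 0 yields
+ * 1 + (320 >> 7) = 3 block pointers, while the maximum bonuslen of 320
+ * leaves room for only the single mandatory block pointer.
+ */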
+
+void
+dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ int i;
+ dmu_buf_impl_t *db = NULL;
+
+ ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
+ ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+ ASSERT(tx->tx_txg != 0);
+ ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype != DMU_OT_NONE && bonuslen != 0));
+ ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+ ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+
+ for (i = 0; i < TXG_SIZE; i++)
+ ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+
+ /* clean up any unreferenced dbufs */
+ (void) dnode_evict_dbufs(dn, 0);
+ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+ /*
+ * XXX I should really have a generation number to tell if we
+ * need to do this...
+ */
+ if (blocksize != dn->dn_datablksz ||
+ dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) {
+ /* free all old data */
+ dnode_free_range(dn, 0, -1ULL, tx);
+ }
+
+ /* change blocksize */
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (blocksize != dn->dn_datablksz &&
+ (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
+ list_head(&dn->dn_dbufs) != NULL)) {
+ db = dbuf_hold(dn, 0, FTAG);
+ dbuf_new_size(db, blocksize, tx);
+ }
+ dnode_setdblksz(dn, blocksize);
+ dnode_setdirty(dn, tx);
+ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db) {
+ dbuf_rele(db, FTAG);
+ db = NULL;
+ }
+
+ /* change type */
+ dn->dn_type = ot;
+
+ if (dn->dn_bonuslen != bonuslen) {
+ /* change bonus size */
+ if (bonuslen == 0)
+ bonuslen = 1; /* XXX */
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dn->dn_bonus = dbuf_create_bonus(dn);
+ db = dn->dn_bonus;
+ rw_exit(&dn->dn_struct_rwlock);
+ if (refcount_add(&db->db_holds, FTAG) == 1)
+ dnode_add_ref(dn, db);
+ VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+ mutex_enter(&db->db_mtx);
+ ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+ ASSERT(db->db.db_data != NULL);
+ db->db.db_size = bonuslen;
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_dirty(db, tx);
+ }
+
+ /* change bonus size and type */
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_bonustype = bonustype;
+ dn->dn_bonuslen = bonuslen;
+ dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+ dn->dn_compress = ZIO_COMPRESS_INHERIT;
+ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+
+ /*
+ * NB: we have to do the dbuf_rele after we've changed the
+ * dn_bonuslen, for the sake of dbuf_verify().
+ */
+ if (db)
+ dbuf_rele(db, FTAG);
+
+ dn->dn_allocated_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+}
+
+void
+dnode_special_close(dnode_t *dn)
+{
+ /*
+ * Wait for final references to the dnode to clear. This can
+	 * only happen if the arc is asynchronously evicting state that
+ * has a hold on this dnode while we are trying to evict this
+ * dnode.
+ */
+ while (refcount_count(&dn->dn_holds) > 0)
+ delay(1);
+ dnode_destroy(dn);
+}
+
+dnode_t *
+dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
+{
+ dnode_t *dn = dnode_create(os, dnp, NULL, object);
+ DNODE_VERIFY(dn);
+ return (dn);
+}
+
+static void
+dnode_buf_pageout(dmu_buf_t *db, void *arg)
+{
+ dnode_t **children_dnodes = arg;
+ int i;
+ int epb = db->db_size >> DNODE_SHIFT;
+
+ for (i = 0; i < epb; i++) {
+ dnode_t *dn = children_dnodes[i];
+ int n;
+
+ if (dn == NULL)
+ continue;
+#ifdef ZFS_DEBUG
+ /*
+ * If there are holds on this dnode, then there should
+ * be holds on the dnode's containing dbuf as well; thus
+		 * it wouldn't be eligible for eviction and this function
+ * would not have been called.
+ */
+ ASSERT(refcount_is_zero(&dn->dn_holds));
+ ASSERT(list_head(&dn->dn_dbufs) == NULL);
+ ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+
+ for (n = 0; n < TXG_SIZE; n++)
+ ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
+#endif
+ children_dnodes[i] = NULL;
+ dnode_destroy(dn);
+ }
+ kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+}
+
+/*
+ * errors:
+ * EINVAL - invalid object number.
+ * EIO - i/o error.
+ * succeeds even for free dnodes.
+ */
+int
+dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
+ void *tag, dnode_t **dnp)
+{
+ int epb, idx, err;
+ int drop_struct_lock = FALSE;
+ int type;
+ uint64_t blk;
+ dnode_t *mdn, *dn;
+ dmu_buf_impl_t *db;
+ dnode_t **children_dnodes;
+
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return (EINVAL);
+
+ mdn = os->os_meta_dnode;
+
+ DNODE_VERIFY(mdn);
+
+ if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
+ rw_enter(&mdn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+
+ db = dbuf_hold(mdn, blk, FTAG);
+ if (drop_struct_lock)
+ rw_exit(&mdn->dn_struct_rwlock);
+ if (db == NULL)
+ return (EIO);
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ if (err) {
+ dbuf_rele(db, FTAG);
+ return (err);
+ }
+
+ ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
+ epb = db->db.db_size >> DNODE_SHIFT;
+
+ idx = object & (epb-1);
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ if (children_dnodes == NULL) {
+ dnode_t **winner;
+ children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
+ KM_SLEEP);
+ if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
+ dnode_buf_pageout)) {
+ kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+ children_dnodes = winner;
+ }
+ }
+
+ if ((dn = children_dnodes[idx]) == NULL) {
+ dnode_t *winner;
+ dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx,
+ db, object);
+ winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
+ if (winner != NULL) {
+ dnode_destroy(dn);
+ dn = winner;
+ }
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ type = dn->dn_type;
+ if (dn->dn_free_txg ||
+ ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
+ ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) {
+ mutex_exit(&dn->dn_mtx);
+ dbuf_rele(db, FTAG);
+ return (type == DMU_OT_NONE ? ENOENT : EEXIST);
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ if (refcount_add(&dn->dn_holds, tag) == 1)
+ dbuf_add_ref(db, dn);
+
+ DNODE_VERIFY(dn);
+ ASSERT3P(dn->dn_dbuf, ==, db);
+ ASSERT3U(dn->dn_object, ==, object);
+ dbuf_rele(db, FTAG);
+
+ *dnp = dn;
+ return (0);
+}
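+
+/*
+ * Illustrative example: if the meta-dnode uses 16K blocks, epb is 32, so
+ * dnode_hold_impl() looks up object 100 in meta-dnode block
+ * dbuf_whichblock(mdn, 100 * 512) = 3 at idx = 100 & 31 = 4.
+ */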
+
+/*
+ * Return a held dnode in *dnp if the object is allocated, or an error if not.
+ */
+int
+dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
+{
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
+}
+
+void
+dnode_add_ref(dnode_t *dn, void *tag)
+{
+ ASSERT(refcount_count(&dn->dn_holds) > 0);
+ (void) refcount_add(&dn->dn_holds, tag);
+}
+
+void
+dnode_rele(dnode_t *dn, void *tag)
+{
+ uint64_t refs;
+
+ refs = refcount_remove(&dn->dn_holds, tag);
+ /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
+ if (refs == 0 && dn->dn_dbuf)
+ dbuf_rele(dn->dn_dbuf, dn);
+}
+
+void
+dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
+{
+ objset_impl_t *os = dn->dn_objset;
+ uint64_t txg = tx->tx_txg;
+
+ if (dn->dn_object == DMU_META_DNODE_OBJECT)
+ return;
+
+ DNODE_VERIFY(dn);
+
+#ifdef ZFS_DEBUG
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
+ /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+ mutex_exit(&dn->dn_mtx);
+#endif
+
+ mutex_enter(&os->os_lock);
+
+ /*
+ * If we are already marked dirty, we're done.
+ */
+ if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
+ mutex_exit(&os->os_lock);
+ return;
+ }
+
+ ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+ ASSERT(dn->dn_datablksz != 0);
+ ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
+
+ dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
+ dn->dn_object, txg);
+
+ if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
+ list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
+ } else {
+ list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
+ }
+
+ mutex_exit(&os->os_lock);
+
+ /*
+ * The dnode maintains a hold on its containing dbuf as
+ * long as there are holds on it. Each instantiated child
+	 * dbuf maintains a hold on the dnode. When the last child
+ * drops its hold, the dnode will drop its hold on the
+ * containing dbuf. We add a "dirty hold" here so that the
+ * dnode will hang around after we finish processing its
+ * children.
+ */
+ dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
+
+ (void) dbuf_dirty(dn->dn_dbuf, tx);
+
+ dsl_dataset_dirty(os->os_dsl_dataset, tx);
+}
+
+void
+dnode_free(dnode_t *dn, dmu_tx_t *tx)
+{
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
+
+ /* we should be the only holder... hopefully */
+ /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
+ mutex_exit(&dn->dn_mtx);
+ return;
+ }
+ dn->dn_free_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+
+ /*
+ * If the dnode is already dirty, it needs to be moved from
+ * the dirty list to the free list.
+ */
+ mutex_enter(&dn->dn_objset->os_lock);
+ if (list_link_active(&dn->dn_dirty_link[txgoff])) {
+ list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
+ list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
+ mutex_exit(&dn->dn_objset->os_lock);
+ } else {
+ mutex_exit(&dn->dn_objset->os_lock);
+ dnode_setdirty(dn, tx);
+ }
+}
+
+/*
+ * Try to change the block size for the indicated dnode. This can only
+ * succeed if there are no blocks allocated or dirty beyond the first block.
+ */
+int
+dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db, *db_next;
+ int have_db0 = FALSE;
+
+ if (size == 0)
+ size = SPA_MINBLOCKSIZE;
+ if (size > SPA_MAXBLOCKSIZE)
+ size = SPA_MAXBLOCKSIZE;
+ else
+ size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
+
+ if (ibs == dn->dn_indblkshift)
+ ibs = 0;
+
+ if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+ return (0);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* Check for any allocated blocks beyond the first */
+ if (dn->dn_phys->dn_maxblkid != 0)
+ goto fail;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+ db_next = list_next(&dn->dn_dbufs, db);
+
+ if (db->db_blkid == 0) {
+ have_db0 = TRUE;
+ } else if (db->db_blkid != DB_BONUS_BLKID) {
+ mutex_exit(&dn->dn_dbufs_mtx);
+ goto fail;
+ }
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ if (ibs && dn->dn_nlevels != 1)
+ goto fail;
+
+ db = NULL;
+ if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) {
+ /* obtain the old block */
+ db = dbuf_hold(dn, 0, FTAG);
+ dbuf_new_size(db, size, tx);
+ }
+
+ dnode_setdblksz(dn, size);
+ dnode_setdirty(dn, tx);
+ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
+ if (ibs) {
+ dn->dn_indblkshift = ibs;
+ dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+ }
+
+ if (db)
+ dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
+ return (0);
+
+fail:
+ rw_exit(&dn->dn_struct_rwlock);
+ return (ENOTSUP);
+}
+
+void
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+{
+ uint64_t txgoff = tx->tx_txg & TXG_MASK;
+ int drop_struct_lock = FALSE;
+ int epbs, new_nlevels;
+ uint64_t sz;
+
+ ASSERT(blkid != DB_BONUS_BLKID);
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ drop_struct_lock = TRUE;
+ }
+
+ if (blkid <= dn->dn_maxblkid)
+ goto out;
+
+ dn->dn_maxblkid = blkid;
+
+ /*
+ * Compute the number of levels necessary to support the new maxblkid.
+ */
+ new_nlevels = 1;
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (sz = dn->dn_nblkptr;
+ sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
+ new_nlevels++;
+
+ if (new_nlevels > dn->dn_nlevels) {
+ int old_nlevels = dn->dn_nlevels;
+ dmu_buf_impl_t *db;
+ list_t *list;
+ dbuf_dirty_record_t *new, *dr, *dr_next;
+
+ dn->dn_nlevels = new_nlevels;
+
+ ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
+ dn->dn_next_nlevels[txgoff] = new_nlevels;
+
+ /* dirty the left indirects */
+ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+ new = dbuf_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+
+ /* transfer the dirty records to the new indirect */
+ mutex_enter(&dn->dn_mtx);
+ mutex_enter(&new->dt.di.dr_mtx);
+ list = &dn->dn_dirty_records[txgoff];
+ for (dr = list_head(list); dr; dr = dr_next) {
+ dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
+ if (dr->dr_dbuf->db_level != new_nlevels-1 &&
+ dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
+ ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
+ list_remove(&dn->dn_dirty_records[txgoff], dr);
+ list_insert_tail(&new->dt.di.dr_children, dr);
+ dr->dr_parent = new;
+ }
+ }
+ mutex_exit(&new->dt.di.dr_mtx);
+ mutex_exit(&dn->dn_mtx);
+ }
+
+out:
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+}
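+
+/*
+ * Worked example for the nlevels computation in dnode_new_blkid() above
+ * (illustrative): with the default 16K indirect blocks
+ * (epbs = 14 - SPA_BLKPTRSHIFT = 7, i.e. 128 block pointers per indirect)
+ * and dn_nblkptr = 3, growing to blkid = 1000 walks sz through 3 and
+ * 3 << 7 = 384 before 384 << 7 = 49152 exceeds the blkid, so new_nlevels
+ * becomes 3.
+ */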
+
+void
+dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+ avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
+ avl_index_t where;
+ free_range_t *rp;
+ free_range_t rp_tofind;
+ uint64_t endblk = blkid + nblks;
+
+ ASSERT(MUTEX_HELD(&dn->dn_mtx));
+ ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
+
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+ blkid, nblks, tx->tx_txg);
+ rp_tofind.fr_blkid = blkid;
+ rp = avl_find(tree, &rp_tofind, &where);
+ if (rp == NULL)
+ rp = avl_nearest(tree, where, AVL_BEFORE);
+ if (rp == NULL)
+ rp = avl_nearest(tree, where, AVL_AFTER);
+
+ while (rp && (rp->fr_blkid <= blkid + nblks)) {
+ uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
+ free_range_t *nrp = AVL_NEXT(tree, rp);
+
+ if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
+ /* clear this entire range */
+ avl_remove(tree, rp);
+ kmem_free(rp, sizeof (free_range_t));
+ } else if (blkid <= rp->fr_blkid &&
+ endblk > rp->fr_blkid && endblk < fr_endblk) {
+ /* clear the beginning of this range */
+ rp->fr_blkid = endblk;
+ rp->fr_nblks = fr_endblk - endblk;
+ } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
+ endblk >= fr_endblk) {
+ /* clear the end of this range */
+ rp->fr_nblks = blkid - rp->fr_blkid;
+ } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
+ /* clear a chunk out of this range */
+ free_range_t *new_rp =
+ kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+
+ new_rp->fr_blkid = endblk;
+ new_rp->fr_nblks = fr_endblk - endblk;
+ avl_insert_here(tree, new_rp, rp, AVL_AFTER);
+ rp->fr_nblks = blkid - rp->fr_blkid;
+ }
+ /* there may be no overlap */
+ rp = nrp;
+ }
+}
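+
+/*
+ * Worked example for dnode_clear_range() above (illustrative): clearing
+ * blkid = 12, nblks = 4 against an existing free range covering blocks
+ * 10-19 (fr_blkid = 10, fr_nblks = 10) falls into the "clear a chunk out
+ * of this range" case: the original entry shrinks to fr_nblks = 12 - 10 = 2
+ * and a new entry with fr_blkid = 16, fr_nblks = 20 - 16 = 4 is inserted
+ * after it, leaving blocks 10-11 and 16-19 still marked free.
+ */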
+
+void
+dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ uint64_t blkoff, blkid, nblks;
+ int blksz, head;
+ int trunc = FALSE;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ blksz = dn->dn_datablksz;
+
+ /* If the range is past the end of the file, this is a no-op */
+ if (off >= blksz * (dn->dn_maxblkid+1))
+ goto out;
+ if (len == -1ULL) {
+ len = UINT64_MAX - off;
+ trunc = TRUE;
+ }
+
+ /*
+ * First, block align the region to free:
+ */
+ if (ISP2(blksz)) {
+ head = P2NPHASE(off, blksz);
+ blkoff = P2PHASE(off, blksz);
+ } else {
+ ASSERT(dn->dn_maxblkid == 0);
+ if (off == 0 && len >= blksz) {
+ /* Freeing the whole block; don't do any head. */
+ head = 0;
+ } else {
+ /* Freeing part of the block. */
+ head = blksz - off;
+ ASSERT3U(head, >, 0);
+ }
+ blkoff = off;
+ }
+ /* zero out any partial block data at the start of the range */
+ if (head) {
+ ASSERT3U(blkoff + head, ==, blksz);
+ if (len < head)
+ head = len;
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
+ FTAG, &db) == 0) {
+ caddr_t data;
+
+ /* don't dirty if it isn't on disk and isn't dirty */
+ if (db->db_last_dirty ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dbuf_will_dirty(db, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ data = db->db.db_data;
+ bzero(data + blkoff, head);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ off += head;
+ len -= head;
+ }
+
+ /* If the range was less than one block, we're done */
+ if (len == 0 || off >= blksz * (dn->dn_maxblkid+1))
+ goto out;
+
+ if (!ISP2(blksz)) {
+ /*
+ * They are freeing the whole block of a
+ * non-power-of-two blocksize file. Skip all the messy
+ * math.
+ */
+ ASSERT3U(off, ==, 0);
+ ASSERT3U(len, >=, blksz);
+ blkid = 0;
+ nblks = 1;
+ } else {
+ int tail;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int blkshift = dn->dn_datablkshift;
+
+ /* If the remaining range is past end of file, we're done */
+ if (off > dn->dn_maxblkid << blkshift)
+ goto out;
+
+ if (off + len == UINT64_MAX)
+ tail = 0;
+ else
+ tail = P2PHASE(len, blksz);
+
+ ASSERT3U(P2PHASE(off, blksz), ==, 0);
+ /* zero out any partial block data at the end of the range */
+ if (tail) {
+ if (len < tail)
+ tail = len;
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
+ TRUE, FTAG, &db) == 0) {
+ /* don't dirty if not on disk and not dirty */
+ if (db->db_last_dirty ||
+ (db->db_blkptr &&
+ !BP_IS_HOLE(db->db_blkptr))) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dbuf_will_dirty(db, tx);
+ rw_enter(&dn->dn_struct_rwlock,
+ RW_WRITER);
+ bzero(db->db.db_data, tail);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ len -= tail;
+ }
+ /* If the range did not include a full block, we are done */
+ if (len == 0)
+ goto out;
+
+ /* dirty the left indirects */
+ if (dn->dn_nlevels > 1 && off != 0) {
+ db = dbuf_hold_level(dn, 1,
+ (off - head) >> (blkshift + epbs), FTAG);
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ }
+
+ /* dirty the right indirects */
+ if (dn->dn_nlevels > 1 && !trunc) {
+ db = dbuf_hold_level(dn, 1,
+ (off + len + tail - 1) >> (blkshift + epbs), FTAG);
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ }
+
+ /*
+		 * Finally, add this range to the dnode range list; we
+		 * will finish up this free operation in the syncing phase.
+ */
+ ASSERT(IS_P2ALIGNED(off, 1<<blkshift));
+ ASSERT(off + len == UINT64_MAX ||
+ IS_P2ALIGNED(len, 1<<blkshift));
+ blkid = off >> blkshift;
+ nblks = len >> blkshift;
+
+ if (trunc)
+ dn->dn_maxblkid = (blkid ? blkid - 1 : 0);
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ dnode_clear_range(dn, blkid, nblks, tx);
+ {
+ free_range_t *rp, *found;
+ avl_index_t where;
+ avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
+
+ /* Add new range to dn_ranges */
+ rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+ rp->fr_blkid = blkid;
+ rp->fr_nblks = nblks;
+ found = avl_find(tree, rp, &where);
+ ASSERT(found == NULL);
+ avl_insert(tree, rp, where);
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+ blkid, nblks, tx->tx_txg);
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ dbuf_free_range(dn, blkid, nblks, tx);
+ dnode_setdirty(dn, tx);
+out:
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
+uint64_t
+dnode_block_freed(dnode_t *dn, uint64_t blkid)
+{
+ free_range_t range_tofind;
+ void *dp = spa_get_dsl(dn->dn_objset->os_spa);
+ int i;
+
+ if (blkid == DB_BONUS_BLKID)
+ return (FALSE);
+
+ /*
+ * If we're in the process of opening the pool, dp will not be
+ * set yet, but there shouldn't be anything dirty.
+ */
+ if (dp == NULL)
+ return (FALSE);
+
+ if (dn->dn_free_txg)
+ return (TRUE);
+
+ /*
+ * If dn_datablkshift is not set, then there's only a single
+ * block, in which case there will never be a free range so it
+ * won't matter.
+ */
+ range_tofind.fr_blkid = blkid;
+ mutex_enter(&dn->dn_mtx);
+ for (i = 0; i < TXG_SIZE; i++) {
+ free_range_t *range_found;
+ avl_index_t idx;
+
+ range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
+ if (range_found) {
+ ASSERT(range_found->fr_nblks > 0);
+ break;
+ }
+ range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
+ if (range_found &&
+ range_found->fr_blkid + range_found->fr_nblks > blkid)
+ break;
+ }
+ mutex_exit(&dn->dn_mtx);
+ return (i < TXG_SIZE);
+}
+
+/* call from syncing context when we actually write/free space for this dnode */
+void
+dnode_diduse_space(dnode_t *dn, int64_t delta)
+{
+ uint64_t space;
+ dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
+ dn, dn->dn_phys,
+ (u_longlong_t)dn->dn_phys->dn_used,
+ (longlong_t)delta);
+
+ mutex_enter(&dn->dn_mtx);
+ space = DN_USED_BYTES(dn->dn_phys);
+ if (delta > 0) {
+ ASSERT3U(space + delta, >=, space); /* no overflow */
+ } else {
+ ASSERT3U(space, >=, -delta); /* no underflow */
+ }
+ space += delta;
+ if (spa_version(dn->dn_objset->os_spa) < ZFS_VERSION_DNODE_BYTES) {
+ ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
+ ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0);
+ dn->dn_phys->dn_used = space >> DEV_BSHIFT;
+ } else {
+ dn->dn_phys->dn_used = space;
+ dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
+ }
+ mutex_exit(&dn->dn_mtx);
+}
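+
+/*
+ * Illustrative example, assuming 512-byte sectors (DEV_BSHIFT == 9): on a
+ * pool older than ZFS_VERSION_DNODE_BYTES, a dnode charged a total of 8192
+ * bytes records dn_used = 8192 >> 9 = 16 sectors; newer pools set
+ * DNODE_FLAG_USED_BYTES and store the byte count directly.
+ */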
+
+/*
+ * Call when we think we're going to write/free space in open context.
+ * Be conservative (ie. OK to write less than this or free more than
+ * this, but don't write more or free less).
+ */
+void
+dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
+{
+ objset_impl_t *os = dn->dn_objset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+
+ if (space > 0)
+ space = spa_get_asize(os->os_spa, space);
+
+ if (ds)
+ dsl_dir_willuse_space(ds->ds_dir, space, tx);
+
+ dmu_tx_willuse_space(tx, space);
+}
+
+static int
+dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
+ int lvl, uint64_t blkfill, uint64_t txg)
+{
+ dmu_buf_impl_t *db = NULL;
+ void *data = NULL;
+ uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ uint64_t epb = 1ULL << epbs;
+ uint64_t minfill, maxfill;
+ int i, error, span;
+
+ dprintf("probing object %llu offset %llx level %d of %u\n",
+ dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
+
+ if (lvl == dn->dn_phys->dn_nlevels) {
+ error = 0;
+ epb = dn->dn_phys->dn_nblkptr;
+ data = dn->dn_phys->dn_blkptr;
+ } else {
+ uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
+ error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+ if (error) {
+ if (error == ENOENT)
+ return (hole ? 0 : ESRCH);
+ return (error);
+ }
+ error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
+ if (error) {
+ dbuf_rele(db, FTAG);
+ return (error);
+ }
+ data = db->db.db_data;
+ }
+
+ if (db && txg &&
+ (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
+ error = ESRCH;
+ } else if (lvl == 0) {
+ dnode_phys_t *dnp = data;
+ span = DNODE_SHIFT;
+ ASSERT(dn->dn_type == DMU_OT_DNODE);
+
+ for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) {
+ boolean_t newcontents = B_TRUE;
+ if (txg) {
+ int j;
+ newcontents = B_FALSE;
+ for (j = 0; j < dnp[i].dn_nblkptr; j++) {
+ if (dnp[i].dn_blkptr[j].blk_birth > txg)
+ newcontents = B_TRUE;
+ }
+ }
+ if (!dnp[i].dn_type == hole && newcontents)
+ break;
+ *offset += 1ULL << span;
+ }
+ if (i == blkfill)
+ error = ESRCH;
+ } else {
+ blkptr_t *bp = data;
+ span = (lvl - 1) * epbs + dn->dn_datablkshift;
+ minfill = 0;
+ maxfill = blkfill << ((lvl - 1) * epbs);
+
+ if (hole)
+ maxfill--;
+ else
+ minfill++;
+
+ for (i = (*offset >> span) & ((1ULL << epbs) - 1);
+ i < epb; i++) {
+ if (bp[i].blk_fill >= minfill &&
+ bp[i].blk_fill <= maxfill &&
+ bp[i].blk_birth > txg)
+ break;
+ *offset += 1ULL << span;
+ }
+ if (i >= epb)
+ error = ESRCH;
+ }
+
+ if (db)
+ dbuf_rele(db, FTAG);
+
+ return (error);
+}
+
+/*
+ * Find the next hole, data, or sparse region at or after *offset.
+ * The value 'blkfill' tells us how many items we expect to find
+ * in an L0 data block; this value is 1 for normal objects,
+ * DNODES_PER_BLOCK for the meta dnode, and some fraction of
+ * DNODES_PER_BLOCK when searching for sparse regions thereof.
+ *
+ * Examples:
+ *
+ * dnode_next_offset(dn, hole, offset, 1, 1, 0);
+ * Finds the next hole/data in a file.
+ * Used in dmu_offset_next().
+ *
+ * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg);
+ * Finds the next free/allocated dnode in an objset's meta-dnode.
+ * Only finds objects that have new contents since txg (ie.
+ * bonus buffer changes and content removal are ignored).
+ * Used in dmu_object_next().
+ *
+ * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ * Finds the next L2 meta-dnode bp that's at most 1/4 full.
+ * Used in dmu_object_alloc().
+ */
+int
+dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset,
+ int minlvl, uint64_t blkfill, uint64_t txg)
+{
+ int lvl, maxlvl;
+ int error = 0;
+ uint64_t initial_offset = *offset;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (dn->dn_phys->dn_nlevels == 0) {
+ rw_exit(&dn->dn_struct_rwlock);
+ return (ESRCH);
+ }
+
+ if (dn->dn_datablkshift == 0) {
+ if (*offset < dn->dn_datablksz) {
+ if (hole)
+ *offset = dn->dn_datablksz;
+ } else {
+ error = ESRCH;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ return (error);
+ }
+
+ maxlvl = dn->dn_phys->dn_nlevels;
+
+ for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+ error = dnode_next_offset_level(dn,
+ hole, offset, lvl, blkfill, txg);
+ if (error != ESRCH)
+ break;
+ }
+
+ while (--lvl >= minlvl && error == 0) {
+ error = dnode_next_offset_level(dn,
+ hole, offset, lvl, blkfill, txg);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (error == 0 && initial_offset > *offset)
+ error = ESRCH;
+
+ return (error);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
new file mode 100644
index 000000000000..08f60e8242d7
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -0,0 +1,621 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+
+static void
+dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ int nblkptr = dn->dn_phys->dn_nblkptr;
+ int old_toplvl = dn->dn_phys->dn_nlevels - 1;
+ int new_level = dn->dn_next_nlevels[txgoff];
+ int i;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* this dnode can't be paged out because it's dirty */
+ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+ ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
+
+ db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
+
+ dn->dn_phys->dn_nlevels = new_level;
+ dprintf("os=%p obj=%llu, increase to %d\n",
+ dn->dn_objset, dn->dn_object,
+ dn->dn_phys->dn_nlevels);
+
+ /* check for existing blkptrs in the dnode */
+ for (i = 0; i < nblkptr; i++)
+ if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
+ break;
+ if (i != nblkptr) {
+ /* transfer dnode's block pointers to new indirect block */
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+ ASSERT(db->db.db_data);
+ ASSERT(arc_released(db->db_buf));
+ ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
+ bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+ sizeof (blkptr_t) * nblkptr);
+ arc_buf_freeze(db->db_buf);
+ }
+
+ /* set dbuf's parent pointers to new indirect buf */
+ for (i = 0; i < nblkptr; i++) {
+ dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
+
+ if (child == NULL)
+ continue;
+ ASSERT3P(child->db_dnode, ==, dn);
+ if (child->db_parent && child->db_parent != dn->dn_dbuf) {
+ ASSERT(child->db_parent->db_level == db->db_level);
+ ASSERT(child->db_blkptr !=
+ &dn->dn_phys->dn_blkptr[child->db_blkid]);
+ mutex_exit(&child->db_mtx);
+ continue;
+ }
+ ASSERT(child->db_parent == NULL ||
+ child->db_parent == dn->dn_dbuf);
+
+ child->db_parent = db;
+ dbuf_add_ref(db, child);
+ if (db->db.db_data)
+ child->db_blkptr = (blkptr_t *)db->db.db_data + i;
+ else
+ child->db_blkptr = NULL;
+ dprintf_dbuf_bp(child, child->db_blkptr,
+ "changed db_blkptr to new indirect %s", "");
+
+ mutex_exit(&child->db_mtx);
+ }
+
+ bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
+
+ dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+static void
+free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
+{
+ objset_impl_t *os = dn->dn_objset;
+ uint64_t bytesfreed = 0;
+ int i;
+
+ dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num);
+
+ for (i = 0; i < num; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+
+ bytesfreed += bp_get_dasize(os->os_spa, bp);
+ ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
+ dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx);
+ bzero(bp, sizeof (blkptr_t));
+ }
+ dnode_diduse_space(dn, -bytesfreed);
+}
+
+#ifdef ZFS_DEBUG
+static void
+free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
+{
+ int off, num;
+ int i, err, epbs;
+ uint64_t txg = tx->tx_txg;
+
+ epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ off = start - (db->db_blkid * 1<<epbs);
+ num = end - start + 1;
+
+ ASSERT3U(off, >=, 0);
+ ASSERT3U(num, >=, 0);
+ ASSERT3U(db->db_level, >, 0);
+ ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+ ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
+ ASSERT(db->db_blkptr != NULL);
+
+ for (i = off; i < off+num; i++) {
+ uint64_t *buf;
+ dmu_buf_impl_t *child;
+ dbuf_dirty_record_t *dr;
+ int j;
+
+ ASSERT(db->db_level == 1);
+
+ rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
+ (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+ if (err == ENOENT)
+ continue;
+ ASSERT(err == 0);
+ ASSERT(child->db_level == 0);
+ dr = child->db_last_dirty;
+ while (dr && dr->dr_txg > txg)
+ dr = dr->dr_next;
+ ASSERT(dr == NULL || dr->dr_txg == txg);
+
+ /* data_old better be zeroed */
+ if (dr) {
+ buf = dr->dt.dl.dr_data->b_data;
+ for (j = 0; j < child->db.db_size >> 3; j++) {
+ if (buf[j] != 0) {
+ panic("freed data not zero: "
+ "child=%p i=%d off=%d num=%d\n",
+ child, i, off, num);
+ }
+ }
+ }
+
+ /*
+ * db_data better be zeroed unless it's dirty in a
+ * future txg.
+ */
+ mutex_enter(&child->db_mtx);
+ buf = child->db.db_data;
+ if (buf != NULL && child->db_state != DB_FILL &&
+ child->db_last_dirty == NULL) {
+ for (j = 0; j < child->db.db_size >> 3; j++) {
+ if (buf[j] != 0) {
+ panic("freed data not zero: "
+ "child=%p i=%d off=%d num=%d\n",
+ child, i, off, num);
+ }
+ }
+ }
+ mutex_exit(&child->db_mtx);
+
+ dbuf_rele(child, FTAG);
+ }
+}
+#endif
+
+static int
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn = db->db_dnode;
+ blkptr_t *bp;
+ dmu_buf_impl_t *subdb;
+ uint64_t start, end, dbstart, dbend, i;
+ int epbs, shift, err;
+ int all = TRUE;
+
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ arc_release(db->db_buf, db);
+ bp = (blkptr_t *)db->db.db_data;
+
+ epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ shift = (db->db_level - 1) * epbs;
+ dbstart = db->db_blkid << epbs;
+ start = blkid >> shift;
+ if (dbstart < start) {
+ bp += start - dbstart;
+ all = FALSE;
+ } else {
+ start = dbstart;
+ }
+ dbend = ((db->db_blkid + 1) << epbs) - 1;
+ end = (blkid + nblks - 1) >> shift;
+ if (dbend <= end)
+ end = dbend;
+ else if (all)
+ all = trunc;
+ ASSERT3U(start, <=, end);
+
+ if (db->db_level == 1) {
+ FREE_VERIFY(db, start, end, tx);
+ free_blocks(dn, bp, end-start+1, tx);
+ arc_buf_freeze(db->db_buf);
+ ASSERT(all || db->db_last_dirty);
+ return (all);
+ }
+
+ for (i = start; i <= end; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
+ ASSERT3U(err, ==, 0);
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (free_children(subdb, blkid, nblks, trunc, tx)) {
+ ASSERT3P(subdb->db_blkptr, ==, bp);
+ free_blocks(dn, bp, 1, tx);
+ } else {
+ all = FALSE;
+ }
+ dbuf_rele(subdb, FTAG);
+ }
+ arc_buf_freeze(db->db_buf);
+#ifdef ZFS_DEBUG
+ bp -= (end-start)+1;
+ for (i = start; i <= end; i++, bp++) {
+ if (i == start && blkid != 0)
+ continue;
+ else if (i == end && !trunc)
+ continue;
+ ASSERT3U(bp->blk_birth, ==, 0);
+ }
+#endif
+ ASSERT(all || db->db_last_dirty);
+ return (all);
+}
+
+/*
+ * free_range: Traverse the indicated range of the provided file
+ * and "free" all the blocks contained there.
+ */
+static void
+dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+ blkptr_t *bp = dn->dn_phys->dn_blkptr;
+ dmu_buf_impl_t *db;
+ int trunc, start, end, shift, i, err;
+ int dnlevel = dn->dn_phys->dn_nlevels;
+
+ if (blkid > dn->dn_phys->dn_maxblkid)
+ return;
+
+ ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
+ trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
+ if (trunc)
+ nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+
+ /* There are no indirect blocks in the object */
+ if (dnlevel == 1) {
+ if (blkid >= dn->dn_phys->dn_nblkptr) {
+ /* this range was never made persistent */
+ return;
+ }
+ ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
+ free_blocks(dn, bp + blkid, nblks, tx);
+ if (trunc) {
+ uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+ (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
+ ASSERT(off < dn->dn_phys->dn_maxblkid ||
+ dn->dn_phys->dn_maxblkid == 0 ||
+ dnode_next_offset(dn, FALSE, &off,
+ 1, 1, 0) != 0);
+ }
+ return;
+ }
+
+ shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+ start = blkid >> shift;
+ ASSERT(start < dn->dn_phys->dn_nblkptr);
+ end = (blkid + nblks - 1) >> shift;
+ bp += start;
+ for (i = start; i <= end; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
+ ASSERT3U(err, ==, 0);
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (free_children(db, blkid, nblks, trunc, tx)) {
+ ASSERT3P(db->db_blkptr, ==, bp);
+ free_blocks(dn, bp, 1, tx);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ if (trunc) {
+ uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+ (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
+ ASSERT(off < dn->dn_phys->dn_maxblkid ||
+ dn->dn_phys->dn_maxblkid == 0 ||
+ dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0);
+ }
+}
+
+/*
+ * Try to kick all the dnode's dbufs out of the cache...
+ */
+int
+dnode_evict_dbufs(dnode_t *dn, int try)
+{
+ int progress;
+ int pass = 0;
+
+ do {
+ dmu_buf_impl_t *db, marker;
+ int evicting = FALSE;
+
+ progress = FALSE;
+ mutex_enter(&dn->dn_dbufs_mtx);
+ list_insert_tail(&dn->dn_dbufs, &marker);
+ db = list_head(&dn->dn_dbufs);
+ for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
+ list_remove(&dn->dn_dbufs, db);
+ list_insert_tail(&dn->dn_dbufs, db);
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_EVICTING) {
+ progress = TRUE;
+ evicting = TRUE;
+ mutex_exit(&db->db_mtx);
+ } else if (refcount_is_zero(&db->db_holds)) {
+ progress = TRUE;
+ ASSERT(!arc_released(db->db_buf));
+ dbuf_clear(db); /* exits db_mtx for us */
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+
+ }
+ list_remove(&dn->dn_dbufs, &marker);
+ /*
+ * NB: we need to drop dn_dbufs_mtx between passes so
+ * that any DB_EVICTING dbufs can make progress.
+ * Ideally, we would have some cv we could wait on, but
+ * since we don't, just wait a bit to give the other
+ * thread a chance to run.
+ */
+ mutex_exit(&dn->dn_dbufs_mtx);
+ if (evicting)
+ delay(1);
+ pass++;
+ ASSERT(pass < 100); /* sanity check */
+ } while (progress);
+
+ /*
+ * This function works fine even if it can't evict everything.
+	 * If we were only asked to try to evict everything, then
+	 * return an error if we can't. Otherwise panic, as the caller
+ * expects total eviction.
+ */
+ if (list_head(&dn->dn_dbufs) != NULL) {
+ if (try) {
+ return (1);
+ } else {
+ panic("dangling dbufs (dn=%p, dbuf=%p)\n",
+ dn, list_head(&dn->dn_dbufs));
+ }
+ }
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_evict(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ return (0);
+}
+
+static void
+dnode_undirty_dbufs(list_t *list)
+{
+ dbuf_dirty_record_t *dr;
+
+ while (dr = list_head(list)) {
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ uint64_t txg = dr->dr_txg;
+
+ mutex_enter(&db->db_mtx);
+ /* XXX - use dbuf_undirty()? */
+ list_remove(list, dr);
+ ASSERT(db->db_last_dirty == dr);
+ db->db_last_dirty = NULL;
+ db->db_dirtycnt -= 1;
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ dr->dt.dl.dr_data == db->db_buf);
+ dbuf_unoverride(dr);
+ mutex_exit(&db->db_mtx);
+ } else {
+ mutex_exit(&db->db_mtx);
+ dnode_undirty_dbufs(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+ }
+}
+
+static void
+dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+{
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
+ (void) dnode_evict_dbufs(dn, 0);
+ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+ /*
+ * XXX - It would be nice to assert this, but we may still
+ * have residual holds from async evictions from the arc...
+ *
+ * zfs_obj_to_path() also depends on this being
+ * commented out.
+ *
+ * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
+ */
+
+ /* Undirty next bits */
+ dn->dn_next_nlevels[txgoff] = 0;
+ dn->dn_next_indblkshift[txgoff] = 0;
+ dn->dn_next_blksz[txgoff] = 0;
+
+ /* free up all the blocks in the file. */
+ dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
+ ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
+
+ /* ASSERT(blkptrs are zero); */
+ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ ASSERT(dn->dn_free_txg > 0);
+ if (dn->dn_allocated_txg != dn->dn_free_txg)
+ dbuf_will_dirty(dn->dn_dbuf, tx);
+ bzero(dn->dn_phys, sizeof (dnode_phys_t));
+
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_type = DMU_OT_NONE;
+ dn->dn_maxblkid = 0;
+ dn->dn_allocated_txg = 0;
+ mutex_exit(&dn->dn_mtx);
+
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ /*
+ * Now that we've released our hold, the dnode may
+	 * be evicted, so we mustn't access it.
+ */
+}
+
+/*
+ * Write out the dnode's dirty buffers.
+ *
+ * NOTE: The dnode is kept in memory by being dirty. Once the
+ * dirty bit is cleared, it may be evicted. Beware of this!
+ */
+void
+dnode_sync(dnode_t *dn, dmu_tx_t *tx)
+{
+ free_range_t *rp;
+ dnode_phys_t *dnp = dn->dn_phys;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ list_t *list = &dn->dn_dirty_records[txgoff];
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+ DNODE_VERIFY(dn);
+
+ ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_allocated_txg == tx->tx_txg) {
+ /* The dnode is newly allocated or reallocated */
+ if (dnp->dn_type == DMU_OT_NONE) {
+ /* this is a first alloc, not a realloc */
+ /* XXX shouldn't the phys already be zeroed? */
+ bzero(dnp, DNODE_CORE_SIZE);
+ dnp->dn_nlevels = 1;
+ }
+
+ if (dn->dn_nblkptr > dnp->dn_nblkptr) {
+ /* zero the new blkptrs we are gaining */
+ bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
+ sizeof (blkptr_t) *
+ (dn->dn_nblkptr - dnp->dn_nblkptr));
+ }
+ dnp->dn_type = dn->dn_type;
+ dnp->dn_bonustype = dn->dn_bonustype;
+ dnp->dn_bonuslen = dn->dn_bonuslen;
+ dnp->dn_nblkptr = dn->dn_nblkptr;
+ }
+
+ ASSERT(dnp->dn_nlevels > 1 ||
+ BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
+ dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+
+ if (dn->dn_next_blksz[txgoff]) {
+ ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
+ SPA_MINBLOCKSIZE) == 0);
+ ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ list_head(list) != NULL ||
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
+ dnp->dn_datablkszsec);
+ dnp->dn_datablkszsec =
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
+ dn->dn_next_blksz[txgoff] = 0;
+ }
+
+ if (dn->dn_next_indblkshift[txgoff]) {
+ ASSERT(dnp->dn_nlevels == 1);
+ dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
+ dn->dn_next_indblkshift[txgoff] = 0;
+ }
+
+ /*
+ * Just take the live (open-context) values for checksum and compress.
+ * Strictly speaking it's a future leak, but nothing bad happens if we
+ * start using the new checksum or compress algorithm a little early.
+ */
+ dnp->dn_checksum = dn->dn_checksum;
+ dnp->dn_compress = dn->dn_compress;
+
+ mutex_exit(&dn->dn_mtx);
+
+ /* process all the "freed" ranges in the file */
+ if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) {
+ for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL;
+ rp = AVL_PREV(&dn->dn_ranges[txgoff], rp))
+ dnode_sync_free_range(dn,
+ rp->fr_blkid, rp->fr_nblks, tx);
+ }
+ mutex_enter(&dn->dn_mtx);
+ for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
+ free_range_t *last = rp;
+ rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
+ avl_remove(&dn->dn_ranges[txgoff], last);
+ kmem_free(last, sizeof (free_range_t));
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
+ dnode_sync_free(dn, tx);
+ return;
+ }
+
+ if (dn->dn_next_nlevels[txgoff]) {
+ dnode_increase_indirection(dn, tx);
+ dn->dn_next_nlevels[txgoff] = 0;
+ }
+
+ dbuf_sync_list(list, tx);
+
+ if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+ ASSERT3P(list_head(list), ==, NULL);
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ }
+
+ /*
+ * Although we have dropped our reference to the dnode, it
+	 * can't be evicted until it's written, and we haven't yet
+ * initiated the IO for the dnode's dbuf.
+ */
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
new file mode 100644
index 000000000000..a9707a0542e1
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -0,0 +1,1889 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/unique.h>
+#include <sys/zfs_context.h>
+
+static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
+static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
+static dsl_checkfunc_t dsl_dataset_rollback_check;
+static dsl_syncfunc_t dsl_dataset_rollback_sync;
+static dsl_checkfunc_t dsl_dataset_destroy_check;
+static dsl_syncfunc_t dsl_dataset_destroy_sync;
+
+#define DS_REF_MAX (1ULL << 62)
+
+#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
+
+/*
+ * We use weighted reference counts to express the various forms of exclusion
+ * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open
+ * is DS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE.
+ * This makes the exclusion logic simple: the total refcnt for all opens cannot
+ * exceed DS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their
+ * weight (DS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume
+ * just over half of the refcnt space, so there can't be more than one, but it
+ * can peacefully coexist with any number of STANDARD opens.
+ */
+static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
+ 0, /* DS_MODE_NONE - invalid */
+ 1, /* DS_MODE_STANDARD - unlimited number */
+ (DS_REF_MAX >> 1) + 1, /* DS_MODE_PRIMARY - only one of these */
+ DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */
+};
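+/*
+ * A worked example of the weights above (illustrative arithmetic): with
+ * DS_REF_MAX = 1ULL << 62, one PRIMARY open contributes
+ * (DS_REF_MAX >> 1) + 1, so a second PRIMARY would bring the total to
+ * DS_REF_MAX + 2 > DS_REF_MAX and is refused, while STANDARD opens
+ * (1 point each) keep fitting alongside it until roughly DS_REF_MAX >> 1
+ * of them are held.  A single EXCLUSIVE open already consumes the entire
+ * DS_REF_MAX budget, so no further open of any mode can be added.
+ */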
+
+
+void
+dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+ int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+
+ dprintf_bp(bp, "born, ds=%p\n", ds);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* It could have been compressed away to nothing */
+ if (BP_IS_HOLE(bp))
+ return;
+ ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
+ ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
+ if (ds == NULL) {
+ /*
+ * Account for the meta-objset space in its placeholder
+ * dsl_dir.
+ */
+ ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
+ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+ used, compressed, uncompressed, tx);
+ dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+ return;
+ }
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ mutex_enter(&ds->ds_lock);
+ ds->ds_phys->ds_used_bytes += used;
+ ds->ds_phys->ds_compressed_bytes += compressed;
+ ds->ds_phys->ds_uncompressed_bytes += uncompressed;
+ ds->ds_phys->ds_unique_bytes += used;
+ mutex_exit(&ds->ds_lock);
+ dsl_dir_diduse_space(ds->ds_dir,
+ used, compressed, uncompressed, tx);
+}
+
+void
+dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+ dmu_tx_t *tx)
+{
+ int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* No block pointer => nothing to free */
+ if (BP_IS_HOLE(bp))
+ return;
+
+ ASSERT(used > 0);
+ if (ds == NULL) {
+ int err;
+ /*
+ * Account for the meta-objset space in its placeholder
+ * dataset.
+ */
+ err = arc_free(pio, tx->tx_pool->dp_spa,
+ tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
+ ASSERT(err == 0);
+
+ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+ -used, -compressed, -uncompressed, tx);
+ dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+ return;
+ }
+ ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+ if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+ int err;
+
+ dprintf_bp(bp, "freeing: %s", "");
+ err = arc_free(pio, tx->tx_pool->dp_spa,
+ tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
+ ASSERT(err == 0);
+
+ mutex_enter(&ds->ds_lock);
+ /* XXX unique_bytes is not accurate for head datasets */
+ /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
+ ds->ds_phys->ds_unique_bytes -= used;
+ mutex_exit(&ds->ds_lock);
+ dsl_dir_diduse_space(ds->ds_dir,
+ -used, -compressed, -uncompressed, tx);
+ } else {
+ dprintf_bp(bp, "putting on dead list: %s", "");
+ VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+ /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+ if (ds->ds_phys->ds_prev_snap_obj != 0) {
+ ASSERT3U(ds->ds_prev->ds_object, ==,
+ ds->ds_phys->ds_prev_snap_obj);
+ ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+ ds->ds_object && bp->blk_birth >
+ ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ mutex_enter(&ds->ds_prev->ds_lock);
+ ds->ds_prev->ds_phys->ds_unique_bytes +=
+ used;
+ mutex_exit(&ds->ds_prev->ds_lock);
+ }
+ }
+ }
+ mutex_enter(&ds->ds_lock);
+ ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
+ ds->ds_phys->ds_used_bytes -= used;
+ ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
+ ds->ds_phys->ds_compressed_bytes -= compressed;
+ ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
+ ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
+ mutex_exit(&ds->ds_lock);
+}
+
+uint64_t
+dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
+{
+ uint64_t trysnap = 0;
+
+ if (ds == NULL)
+ return (0);
+ /*
+ * The snapshot creation could fail, but that would cause an
+ * incorrect FALSE return, which would only result in an
+ * overestimation of the amount of space that an operation would
+ * consume, which is OK.
+ *
+ * There's also a small window where we could miss a pending
+ * snapshot, because we could set the sync task in the quiescing
+ * phase. So this should only be used as a guess.
+ */
+ if (ds->ds_trysnap_txg >
+ spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
+ trysnap = ds->ds_trysnap_txg;
+ return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
+}
+
+int
+dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
+{
+ return (blk_birth > dsl_dataset_prev_snap_txg(ds));
+}
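+/*
+ * Illustrative example of the rule above: if the most recent snapshot
+ * was taken in txg 100, a block born in txg 90 is still referenced by
+ * that snapshot, so dsl_dataset_block_kill() queues it on the deadlist
+ * rather than freeing it; a block born in txg 120 is referenced only by
+ * the live dataset and can be freed right away.
+ */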
+
+/* ARGSUSED */
+static void
+dsl_dataset_evict(dmu_buf_t *db, void *dsv)
+{
+ dsl_dataset_t *ds = dsv;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ /* open_refcount == DS_REF_MAX when deleting */
+ ASSERT(ds->ds_open_refcount == 0 ||
+ ds->ds_open_refcount == DS_REF_MAX);
+
+ dprintf_ds(ds, "evicting %s\n", "");
+
+ unique_remove(ds->ds_phys->ds_fsid_guid);
+
+ if (ds->ds_user_ptr != NULL)
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+
+ if (ds->ds_prev) {
+ dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
+ ds->ds_prev = NULL;
+ }
+
+ bplist_close(&ds->ds_deadlist);
+ dsl_dir_close(ds->ds_dir, ds);
+
+ if (list_link_active(&ds->ds_synced_link))
+ list_remove(&dp->dp_synced_objsets, ds);
+
+ mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_deadlist.bpl_lock);
+
+ kmem_free(ds, sizeof (dsl_dataset_t));
+}
+
+static int
+dsl_dataset_get_snapname(dsl_dataset_t *ds)
+{
+ dsl_dataset_phys_t *headphys;
+ int err;
+ dmu_buf_t *headdbuf;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (ds->ds_snapname[0])
+ return (0);
+ if (ds->ds_phys->ds_next_snap_obj == 0)
+ return (0);
+
+ err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ FTAG, &headdbuf);
+ if (err)
+ return (err);
+ headphys = headdbuf->db_data;
+ err = zap_value_search(dp->dp_meta_objset,
+ headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
+ dmu_buf_rele(headdbuf, FTAG);
+ return (err);
+}
+
+int
+dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
+ int mode, void *tag, dsl_dataset_t **dsp)
+{
+ uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+ objset_t *mos = dp->dp_meta_objset;
+ dmu_buf_t *dbuf;
+ dsl_dataset_t *ds;
+ int err;
+
+ ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+ dsl_pool_sync_context(dp));
+
+ err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
+ if (err)
+ return (err);
+ ds = dmu_buf_get_user(dbuf);
+ if (ds == NULL) {
+ dsl_dataset_t *winner;
+
+ ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
+ ds->ds_dbuf = dbuf;
+ ds->ds_object = dsobj;
+ ds->ds_phys = dbuf->db_data;
+
+ mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
+ NULL);
+
+ err = bplist_open(&ds->ds_deadlist,
+ mos, ds->ds_phys->ds_deadlist_obj);
+ if (err == 0) {
+ err = dsl_dir_open_obj(dp,
+ ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
+ }
+ if (err) {
+ /*
+ * we don't really need to close the bplist if we
+ * just opened it.
+ */
+ mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_deadlist.bpl_lock);
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+
+ if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
+ ds->ds_snapname[0] = '\0';
+ if (ds->ds_phys->ds_prev_snap_obj) {
+ err = dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, ds, &ds->ds_prev);
+ }
+ } else {
+ if (snapname) {
+#ifdef ZFS_DEBUG
+ dsl_dataset_phys_t *headphys;
+ dmu_buf_t *headdbuf;
+ err = dmu_bonus_hold(mos,
+ ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ FTAG, &headdbuf);
+ if (err == 0) {
+ headphys = headdbuf->db_data;
+ uint64_t foundobj;
+ err = zap_lookup(dp->dp_meta_objset,
+ headphys->ds_snapnames_zapobj,
+ snapname, sizeof (foundobj), 1,
+ &foundobj);
+ ASSERT3U(foundobj, ==, dsobj);
+ dmu_buf_rele(headdbuf, FTAG);
+ }
+#endif
+ (void) strcat(ds->ds_snapname, snapname);
+ } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
+ err = dsl_dataset_get_snapname(ds);
+ }
+ }
+
+ if (err == 0) {
+ winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
+ dsl_dataset_evict);
+ }
+ if (err || winner) {
+ bplist_close(&ds->ds_deadlist);
+ if (ds->ds_prev) {
+ dsl_dataset_close(ds->ds_prev,
+ DS_MODE_NONE, ds);
+ }
+ dsl_dir_close(ds->ds_dir, ds);
+ mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_deadlist.bpl_lock);
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ if (err) {
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+ ds = winner;
+ } else {
+ uint64_t new =
+ unique_insert(ds->ds_phys->ds_fsid_guid);
+ if (new != ds->ds_phys->ds_fsid_guid) {
+ /* XXX it won't necessarily be synced... */
+ ds->ds_phys->ds_fsid_guid = new;
+ }
+ }
+ }
+ ASSERT3P(ds->ds_dbuf, ==, dbuf);
+ ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
+
+ mutex_enter(&ds->ds_lock);
+ if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
+ (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) &&
+ !DS_MODE_IS_INCONSISTENT(mode)) ||
+ (ds->ds_open_refcount + weight > DS_REF_MAX)) {
+ mutex_exit(&ds->ds_lock);
+ dsl_dataset_close(ds, DS_MODE_NONE, tag);
+ return (EBUSY);
+ }
+ ds->ds_open_refcount += weight;
+ mutex_exit(&ds->ds_lock);
+
+ *dsp = ds;
+ return (0);
+}
+
+int
+dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
+ void *tag, dsl_dataset_t **dsp)
+{
+ dsl_dir_t *dd;
+ dsl_pool_t *dp;
+ const char *tail;
+ uint64_t obj;
+ dsl_dataset_t *ds = NULL;
+ int err = 0;
+
+ err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
+ if (err)
+ return (err);
+
+ dp = dd->dd_pool;
+ obj = dd->dd_phys->dd_head_dataset_obj;
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ if (obj == 0) {
+ /* A dataset with no associated objset */
+ err = ENOENT;
+ goto out;
+ }
+
+ if (tail != NULL) {
+ objset_t *mos = dp->dp_meta_objset;
+
+ err = dsl_dataset_open_obj(dp, obj, NULL,
+ DS_MODE_NONE, tag, &ds);
+ if (err)
+ goto out;
+ obj = ds->ds_phys->ds_snapnames_zapobj;
+ dsl_dataset_close(ds, DS_MODE_NONE, tag);
+ ds = NULL;
+
+ if (tail[0] != '@') {
+ err = ENOENT;
+ goto out;
+ }
+ tail++;
+
+ /* Look for a snapshot */
+ if (!DS_MODE_IS_READONLY(mode)) {
+ err = EROFS;
+ goto out;
+ }
+ dprintf("looking for snapshot '%s'\n", tail);
+ err = zap_lookup(mos, obj, tail, 8, 1, &obj);
+ if (err)
+ goto out;
+ }
+ err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);
+
+out:
+ rw_exit(&dp->dp_config_rwlock);
+ dsl_dir_close(dd, FTAG);
+
+ ASSERT3U((err == 0), ==, (ds != NULL));
+ /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
+
+ *dsp = ds;
+ return (err);
+}
+
+int
+dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
+}
+
+void
+dsl_dataset_name(dsl_dataset_t *ds, char *name)
+{
+ if (ds == NULL) {
+ (void) strcpy(name, "mos");
+ } else {
+ dsl_dir_name(ds->ds_dir, name);
+ VERIFY(0 == dsl_dataset_get_snapname(ds));
+ if (ds->ds_snapname[0]) {
+ (void) strcat(name, "@");
+ if (!MUTEX_HELD(&ds->ds_lock)) {
+ /*
+ * We use a "recursive" mutex so that we
+ * can call dprintf_ds() with ds_lock held.
+ */
+ mutex_enter(&ds->ds_lock);
+ (void) strcat(name, ds->ds_snapname);
+ mutex_exit(&ds->ds_lock);
+ } else {
+ (void) strcat(name, ds->ds_snapname);
+ }
+ }
+ }
+}
+
+void
+dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
+{
+ uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+ mutex_enter(&ds->ds_lock);
+ ASSERT3U(ds->ds_open_refcount, >=, weight);
+ ds->ds_open_refcount -= weight;
+ dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
+ mode, ds->ds_open_refcount);
+ mutex_exit(&ds->ds_lock);
+
+ dmu_buf_rele(ds->ds_dbuf, tag);
+}
+
+void
+dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ dsl_dataset_t *ds;
+ uint64_t dsobj;
+ dsl_dir_t *dd;
+
+ dsl_dir_create_root(mos, ddobjp, tx);
+ VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
+
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ dsphys->ds_dir_obj = dd->dd_object;
+ dsphys->ds_fsid_guid = unique_create();
+ unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ dsphys->ds_snapnames_zapobj =
+ zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = tx->tx_txg;
+ dsphys->ds_deadlist_obj =
+ bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+ dmu_buf_rele(dbuf, FTAG);
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_head_dataset_obj = dsobj;
+ dsl_dir_close(dd, FTAG);
+
+ VERIFY(0 ==
+ dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
+ (void) dmu_objset_create_impl(dp->dp_spa, ds,
+ &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+}
+
+uint64_t
+dsl_dataset_create_sync(dsl_dir_t *pdd,
+ const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = pdd->dd_pool;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ uint64_t dsobj, ddobj;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dir_t *dd;
+
+ ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp);
+ ASSERT(clone_parent == NULL ||
+ clone_parent->ds_phys->ds_num_children > 0);
+ ASSERT(lastname[0] != '@');
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ ddobj = dsl_dir_create_sync(pdd, lastname, tx);
+ VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
+
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ dsphys->ds_dir_obj = dd->dd_object;
+ dsphys->ds_fsid_guid = unique_create();
+ unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ dsphys->ds_snapnames_zapobj =
+ zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = tx->tx_txg;
+ dsphys->ds_deadlist_obj =
+ bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+ if (clone_parent) {
+ dsphys->ds_prev_snap_obj = clone_parent->ds_object;
+ dsphys->ds_prev_snap_txg =
+ clone_parent->ds_phys->ds_creation_txg;
+ dsphys->ds_used_bytes =
+ clone_parent->ds_phys->ds_used_bytes;
+ dsphys->ds_compressed_bytes =
+ clone_parent->ds_phys->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes =
+ clone_parent->ds_phys->ds_uncompressed_bytes;
+ dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
+
+ dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
+ clone_parent->ds_phys->ds_num_children++;
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
+ }
+ dmu_buf_rele(dbuf, FTAG);
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_head_dataset_obj = dsobj;
+ dsl_dir_close(dd, FTAG);
+
+ return (dsobj);
+}
+
+struct destroyarg {
+ dsl_sync_task_group_t *dstg;
+ char *snapname;
+ void *tag;
+ char *failed;
+};
+
+static int
+dsl_snapshot_destroy_one(char *name, void *arg)
+{
+ struct destroyarg *da = arg;
+ dsl_dataset_t *ds;
+ char *cp;
+ int err;
+
+ (void) strcat(name, "@");
+ (void) strcat(name, da->snapname);
+ err = dsl_dataset_open(name,
+ DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+ da->tag, &ds);
+ cp = strchr(name, '@');
+ *cp = '\0';
+ if (err == ENOENT)
+ return (0);
+ if (err) {
+ (void) strcpy(da->failed, name);
+ return (err);
+ }
+
+ dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
+ dsl_dataset_destroy_sync, ds, da->tag, 0);
+ return (0);
+}
+
+/*
+ * Destroy 'snapname' in all descendants of 'fsname'.
+ */
+#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
+int
+dsl_snapshots_destroy(char *fsname, char *snapname)
+{
+ int err;
+ struct destroyarg da;
+ dsl_sync_task_t *dst;
+ spa_t *spa;
+ char *cp;
+
+ cp = strchr(fsname, '/');
+ if (cp) {
+ *cp = '\0';
+ err = spa_open(fsname, &spa, FTAG);
+ *cp = '/';
+ } else {
+ err = spa_open(fsname, &spa, FTAG);
+ }
+ if (err)
+ return (err);
+ da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+ da.snapname = snapname;
+ da.tag = FTAG;
+ da.failed = fsname;
+
+ err = dmu_objset_find(fsname,
+ dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
+
+ if (err == 0)
+ err = dsl_sync_task_group_wait(da.dstg);
+
+ for (dst = list_head(&da.dstg->dstg_tasks); dst;
+ dst = list_next(&da.dstg->dstg_tasks, dst)) {
+ dsl_dataset_t *ds = dst->dst_arg1;
+ if (dst->dst_err) {
+ dsl_dataset_name(ds, fsname);
+ cp = strchr(fsname, '@');
+ *cp = '\0';
+ }
+ /*
+ * If it was successful, destroy_sync would have
+ * closed the ds
+ */
+ if (err)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ }
+
+ dsl_sync_task_group_destroy(da.dstg);
+ spa_close(spa, FTAG);
+ return (err);
+}
+
+int
+dsl_dataset_destroy(const char *name)
+{
+ int err;
+ dsl_sync_task_group_t *dstg;
+ objset_t *os;
+ dsl_dataset_t *ds;
+ dsl_dir_t *dd;
+ uint64_t obj;
+
+ if (strchr(name, '@')) {
+ /* Destroying a snapshot is simpler */
+ err = dsl_dataset_open(name,
+ DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+ FTAG, &ds);
+ if (err)
+ return (err);
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
+ ds, FTAG, 0);
+ if (err)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ return (err);
+ }
+
+ err = dmu_objset_open(name, DMU_OST_ANY,
+ DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
+ if (err)
+ return (err);
+ ds = os->os->os_dsl_dataset;
+ dd = ds->ds_dir;
+
+ /*
+ * Check for errors and mark this ds as inconsistent, in
+ * case we crash while freeing the objects.
+ */
+ err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
+ dsl_dataset_destroy_begin_sync, ds, NULL, 0);
+ if (err) {
+ dmu_objset_close(os);
+ return (err);
+ }
+
+ /*
+ * remove the objects in open context, so that we won't
+ * have too much to do in syncing context.
+ */
+ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
+ ds->ds_phys->ds_prev_snap_txg)) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
+ dmu_tx_hold_bonus(tx, obj);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ /*
+ * Perhaps there is not enough disk
+ * space. Just deal with it from
+ * dsl_dataset_destroy_sync().
+ */
+ dmu_tx_abort(tx);
+ continue;
+ }
+ VERIFY(0 == dmu_object_free(os, obj, tx));
+ dmu_tx_commit(tx);
+ }
+ /* Make sure it's not dirty before we finish destroying it. */
+ txg_wait_synced(dd->dd_pool, 0);
+
+ dmu_objset_close(os);
+ if (err != ESRCH)
+ return (err);
+
+ err = dsl_dataset_open(name,
+ DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+ FTAG, &ds);
+ if (err)
+ return (err);
+
+ err = dsl_dir_open(name, FTAG, &dd, NULL);
+ if (err) {
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ return (err);
+ }
+
+ /*
+ * Blow away the dsl_dir + head dataset.
+ */
+ dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
+ dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
+ dsl_dataset_destroy_sync, ds, FTAG, 0);
+ dsl_sync_task_create(dstg, dsl_dir_destroy_check,
+ dsl_dir_destroy_sync, dd, FTAG, 0);
+ err = dsl_sync_task_group_wait(dstg);
+ dsl_sync_task_group_destroy(dstg);
+ /* if it is successful, *destroy_sync will close the ds+dd */
+ if (err) {
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ dsl_dir_close(dd, FTAG);
+ }
+ return (err);
+}
+
+int
+dsl_dataset_rollback(dsl_dataset_t *ds)
+{
+ ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
+ return (dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
+ ds, NULL, 0));
+}
+
+void *
+dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
+ void *p, dsl_dataset_evict_func_t func)
+{
+ void *old;
+
+ mutex_enter(&ds->ds_lock);
+ old = ds->ds_user_ptr;
+ if (old == NULL) {
+ ds->ds_user_ptr = p;
+ ds->ds_user_evict_func = func;
+ }
+ mutex_exit(&ds->ds_lock);
+ return (old);
+}
+
+void *
+dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
+{
+ return (ds->ds_user_ptr);
+}
+
+
+blkptr_t *
+dsl_dataset_get_blkptr(dsl_dataset_t *ds)
+{
+ return (&ds->ds_phys->ds_bp);
+}
+
+void
+dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* If it's the meta-objset, set dp_meta_rootbp */
+ if (ds == NULL) {
+ tx->tx_pool->dp_meta_rootbp = *bp;
+ } else {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_bp = *bp;
+ }
+}
+
+spa_t *
+dsl_dataset_get_spa(dsl_dataset_t *ds)
+{
+ return (ds->ds_dir->dd_pool->dp_spa);
+}
+
+void
+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp;
+
+ if (ds == NULL) /* this is the meta-objset */
+ return;
+
+ ASSERT(ds->ds_user_ptr != NULL);
+
+ if (ds->ds_phys->ds_next_snap_obj != 0)
+ panic("dirtying snapshot!");
+
+ dp = ds->ds_dir->dd_pool;
+
+ if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+ }
+}
+
+struct killarg {
+ uint64_t *usedp;
+ uint64_t *compressedp;
+ uint64_t *uncompressedp;
+ zio_t *zio;
+ dmu_tx_t *tx;
+};
+
+static int
+kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+ struct killarg *ka = arg;
+ blkptr_t *bp = &bc->bc_blkptr;
+
+ ASSERT3U(bc->bc_errno, ==, 0);
+
+ /*
+ * Since this callback is not called concurrently, no lock is
+ * needed on the accounting values.
+ */
+ *ka->usedp += bp_get_dasize(spa, bp);
+ *ka->compressedp += BP_GET_PSIZE(bp);
+ *ka->uncompressedp += BP_GET_UCSIZE(bp);
+ /* XXX check for EIO? */
+ (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
+ ARC_NOWAIT);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+
+ /*
+ * There must be a previous snapshot. I suppose we could roll
+ * it back to being empty (and re-initialize the upper (ZPL)
+ * layer). But for now there's no way to do this via the user
+ * interface.
+ */
+ if (ds->ds_phys->ds_prev_snap_txg == 0)
+ return (EINVAL);
+
+ /*
+ * This must not be a snapshot.
+ */
+ if (ds->ds_phys->ds_next_snap_obj != 0)
+ return (EINVAL);
+
+ /*
+ * If we made changes this txg, traverse_dsl_dataset won't find
+ * them. Try again.
+ */
+ if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
+ return (EAGAIN);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+ /* Zero out the deadlist. */
+ bplist_close(&ds->ds_deadlist);
+ bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ ds->ds_phys->ds_deadlist_obj =
+ bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+ VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
+ ds->ds_phys->ds_deadlist_obj));
+
+ {
+ /* Free blkptrs that we gave birth to */
+ zio_t *zio;
+ uint64_t used = 0, compressed = 0, uncompressed = 0;
+ struct killarg ka;
+
+ zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
+ ZIO_FLAG_MUSTSUCCEED);
+ ka.usedp = &used;
+ ka.compressedp = &compressed;
+ ka.uncompressedp = &uncompressed;
+ ka.zio = zio;
+ ka.tx = tx;
+ (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+ ADVANCE_POST, kill_blkptr, &ka);
+ (void) zio_wait(zio);
+
+ dsl_dir_diduse_space(ds->ds_dir,
+ -used, -compressed, -uncompressed, tx);
+ }
+
+ /* Change our contents to that of the prev snapshot */
+ ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
+ ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
+ ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
+ ds->ds_phys->ds_compressed_bytes =
+ ds->ds_prev->ds_phys->ds_compressed_bytes;
+ ds->ds_phys->ds_uncompressed_bytes =
+ ds->ds_prev->ds_phys->ds_uncompressed_bytes;
+ ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
+ ds->ds_phys->ds_unique_bytes = 0;
+
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ds->ds_prev->ds_phys->ds_unique_bytes = 0;
+ }
+}
+
+/* ARGSUSED */
+static int
+dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+
+ /*
+ * Can't delete a head dataset if there are snapshots of it.
+ * (Except if the only snapshots are from the branch we cloned
+ * from.)
+ */
+ if (ds->ds_prev != NULL &&
+ ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
+ return (EINVAL);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+
+ /* Mark it as inconsistent on-disk, in case we crash */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+}
+
+/* ARGSUSED */
+static int
+dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+
+ /* Can't delete a branch point. */
+ if (ds->ds_phys->ds_num_children > 1)
+ return (EEXIST);
+
+ /*
+ * Can't delete a head dataset if there are snapshots of it.
+ * (Except if the only snapshots are from the branch we cloned
+ * from.)
+ */
+ if (ds->ds_prev != NULL &&
+ ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
+ return (EINVAL);
+
+ /*
+ * If we made changes this txg, traverse_dsl_dataset won't find
+ * them. Try again.
+ */
+ if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
+ return (EAGAIN);
+
+ /* XXX we should do some i/o error checking... */
+ return (0);
+}
+
+static void
+dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ uint64_t used = 0, compressed = 0, uncompressed = 0;
+ zio_t *zio;
+ int err;
+ int after_branch_point = FALSE;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dataset_t *ds_prev = NULL;
+ uint64_t obj;
+
+ ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
+ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ ASSERT(ds->ds_prev == NULL ||
+ ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
+ ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+
+ ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ obj = ds->ds_object;
+
+ if (ds->ds_phys->ds_prev_snap_obj != 0) {
+ if (ds->ds_prev) {
+ ds_prev = ds->ds_prev;
+ } else {
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_prev));
+ }
+ after_branch_point =
+ (ds_prev->ds_phys->ds_next_snap_obj != obj);
+
+ dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+ if (after_branch_point &&
+ ds->ds_phys->ds_next_snap_obj == 0) {
+ /* This clone is toast. */
+ ASSERT(ds_prev->ds_phys->ds_num_children > 1);
+ ds_prev->ds_phys->ds_num_children--;
+ } else if (!after_branch_point) {
+ ds_prev->ds_phys->ds_next_snap_obj =
+ ds->ds_phys->ds_next_snap_obj;
+ }
+ }
+
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ blkptr_t bp;
+ dsl_dataset_t *ds_next;
+ uint64_t itor = 0;
+
+ spa_scrub_restart(dp->dp_spa, tx->tx_txg);
+
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_next_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_next));
+ ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+
+ dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+ ds_next->ds_phys->ds_prev_snap_obj =
+ ds->ds_phys->ds_prev_snap_obj;
+ ds_next->ds_phys->ds_prev_snap_txg =
+ ds->ds_phys->ds_prev_snap_txg;
+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
+ ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
+
+ /*
+ * Transfer to our deadlist (which will become next's
+ * new deadlist) any entries from next's current
+ * deadlist which were born before prev, and free the
+ * other entries.
+ *
+ * XXX we're doing this long task with the config lock held
+ */
+ while (bplist_iterate(&ds_next->ds_deadlist, &itor,
+ &bp) == 0) {
+ if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
+ VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
+ &bp, tx));
+ if (ds_prev && !after_branch_point &&
+ bp.blk_birth >
+ ds_prev->ds_phys->ds_prev_snap_txg) {
+ ds_prev->ds_phys->ds_unique_bytes +=
+ bp_get_dasize(dp->dp_spa, &bp);
+ }
+ } else {
+ used += bp_get_dasize(dp->dp_spa, &bp);
+ compressed += BP_GET_PSIZE(&bp);
+ uncompressed += BP_GET_UCSIZE(&bp);
+ /* XXX check return value? */
+ (void) arc_free(zio, dp->dp_spa, tx->tx_txg,
+ &bp, NULL, NULL, ARC_NOWAIT);
+ }
+ }
+
+ /* free next's deadlist */
+ bplist_close(&ds_next->ds_deadlist);
+ bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
+
+ /* set next's deadlist to our deadlist */
+ ds_next->ds_phys->ds_deadlist_obj =
+ ds->ds_phys->ds_deadlist_obj;
+ VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
+ ds_next->ds_phys->ds_deadlist_obj));
+ ds->ds_phys->ds_deadlist_obj = 0;
+
+ if (ds_next->ds_phys->ds_next_snap_obj != 0) {
+ /*
+ * Update next's unique to include blocks which
+ * were previously shared by only this snapshot
+ * and it. Those blocks will be born after the
+ * prev snap and before this snap, and will have
+ * died after the next snap and before the one
+ * after that (i.e. be on the snap after next's
+ * deadlist).
+ *
+ * XXX we're doing this long task with the
+ * config lock held
+ */
+ dsl_dataset_t *ds_after_next;
+
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds_next->ds_phys->ds_next_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_after_next));
+ itor = 0;
+ while (bplist_iterate(&ds_after_next->ds_deadlist,
+ &itor, &bp) == 0) {
+ if (bp.blk_birth >
+ ds->ds_phys->ds_prev_snap_txg &&
+ bp.blk_birth <=
+ ds->ds_phys->ds_creation_txg) {
+ ds_next->ds_phys->ds_unique_bytes +=
+ bp_get_dasize(dp->dp_spa, &bp);
+ }
+ }
+
+ dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
+ ASSERT3P(ds_next->ds_prev, ==, NULL);
+ } else {
+ /*
+ * It would be nice to update the head dataset's
+ * unique. To do so we would have to traverse
+ * it for blocks born after ds_prev, which is
+ * pretty expensive just to maintain something
+ * for debugging purposes.
+ */
+ ASSERT3P(ds_next->ds_prev, ==, ds);
+ dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
+ ds_next);
+ if (ds_prev) {
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, ds_next, &ds_next->ds_prev));
+ } else {
+ ds_next->ds_prev = NULL;
+ }
+ }
+ dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
+
+ /*
+ * NB: unique_bytes is not accurate for head objsets
+ * because we don't update it when we delete the most
+ * recent snapshot -- see above comment.
+ */
+ ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
+ } else {
+ /*
+ * There's no next snapshot, so this is a head dataset.
+ * Destroy the deadlist. Unless it's a clone, the
+ * deadlist should be empty. (If it's a clone, it's
+ * safe to ignore the deadlist contents.)
+ */
+ struct killarg ka;
+
+ ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
+ bplist_close(&ds->ds_deadlist);
+ bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ ds->ds_phys->ds_deadlist_obj = 0;
+
+ /*
+ * Free everything that we point to (that's born after
+ * the previous snapshot, if we are a clone)
+ *
+ * XXX we're doing this long task with the config lock held
+ */
+ ka.usedp = &used;
+ ka.compressedp = &compressed;
+ ka.uncompressedp = &uncompressed;
+ ka.zio = zio;
+ ka.tx = tx;
+ err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+ ADVANCE_POST, kill_blkptr, &ka);
+ ASSERT3U(err, ==, 0);
+ }
+
+ err = zio_wait(zio);
+ ASSERT3U(err, ==, 0);
+
+ dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx);
+
+ if (ds->ds_phys->ds_snapnames_zapobj) {
+ err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
+ ASSERT(err == 0);
+ }
+
+ if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
+ /* Erase the link in the dataset */
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
+ /*
+ * dsl_dir_sync_destroy() called us, they'll destroy
+ * the dataset.
+ */
+ } else {
+ /* remove from snapshot namespace */
+ dsl_dataset_t *ds_head;
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_head));
+ VERIFY(0 == dsl_dataset_get_snapname(ds));
+#ifdef ZFS_DEBUG
+ {
+ uint64_t val;
+ err = zap_lookup(mos,
+ ds_head->ds_phys->ds_snapnames_zapobj,
+ ds->ds_snapname, 8, 1, &val);
+ ASSERT3U(err, ==, 0);
+ ASSERT3U(val, ==, obj);
+ }
+#endif
+ err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
+ ds->ds_snapname, tx);
+ ASSERT(err == 0);
+ dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
+ }
+
+ if (ds_prev && ds->ds_prev != ds_prev)
+ dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
+
+ spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
+ VERIFY(0 == dmu_object_free(mos, obj, tx));
+
+}
+
+/* ARGSUSED */
+int
+dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ objset_t *os = arg1;
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ const char *snapname = arg2;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ int err;
+ uint64_t value;
+
+ /*
+ * We don't allow multiple snapshots of the same txg. If there
+ * is already one, try again.
+ */
+ if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
+ return (EAGAIN);
+
+ /*
+ * Check for a conflicting snapshot name.
+ */
+ err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
+ snapname, 8, 1, &value);
+ if (err == 0)
+ return (EEXIST);
+ if (err != ENOENT)
+ return (err);
+
+ ds->ds_trysnap_txg = tx->tx_txg;
+ return (0);
+}
+
+void
+dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ objset_t *os = arg1;
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ const char *snapname = arg2;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ uint64_t dsobj;
+ objset_t *mos = dp->dp_meta_objset;
+ int err;
+
+ spa_scrub_restart(dp->dp_spa, tx->tx_txg);
+ ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ dsphys->ds_dir_obj = ds->ds_dir->dd_object;
+ dsphys->ds_fsid_guid = unique_create();
+ unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
+ dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
+ dsphys->ds_next_snap_obj = ds->ds_object;
+ dsphys->ds_num_children = 1;
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = tx->tx_txg;
+ dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
+ dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
+ dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
+ dsphys->ds_flags = ds->ds_phys->ds_flags;
+ dsphys->ds_bp = ds->ds_phys->ds_bp;
+ dmu_buf_rele(dbuf, FTAG);
+
+ ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
+ if (ds->ds_prev) {
+ ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
+ ds->ds_object ||
+ ds->ds_prev->ds_phys->ds_num_children > 1);
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
+ ds->ds_prev->ds_phys->ds_creation_txg);
+ ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
+ }
+ }
+
+ bplist_close(&ds->ds_deadlist);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
+ ds->ds_phys->ds_prev_snap_obj = dsobj;
+ ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
+ ds->ds_phys->ds_unique_bytes = 0;
+ ds->ds_phys->ds_deadlist_obj =
+ bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+ VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
+ ds->ds_phys->ds_deadlist_obj));
+
+ dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
+ err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
+ snapname, 8, 1, &dsobj, tx);
+ ASSERT(err == 0);
+
+ if (ds->ds_prev)
+ dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, snapname,
+ DS_MODE_NONE, ds, &ds->ds_prev));
+}
+
+void
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(ds->ds_user_ptr != NULL);
+ ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+
+ dsl_dir_dirty(ds->ds_dir, tx);
+ dmu_objset_sync(ds->ds_user_ptr, zio, tx);
+ /* Unneeded? bplist_close(&ds->ds_deadlist); */
+}
+
+void
+dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ dsl_dir_stats(ds->ds_dir, nv);
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
+ ds->ds_phys->ds_creation_time);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
+ ds->ds_phys->ds_creation_txg);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
+ ds->ds_phys->ds_used_bytes);
+
+ if (ds->ds_phys->ds_next_snap_obj) {
+ /*
+ * This is a snapshot; override the dd's space used with
+ * our unique space and compression ratio.
+ */
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
+ ds->ds_phys->ds_unique_bytes);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
+ ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
+ (ds->ds_phys->ds_uncompressed_bytes * 100 /
+ ds->ds_phys->ds_compressed_bytes));
+ }
+}
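+/*
+ * Worked example for the ratio above (illustrative numbers): with
+ * ds_uncompressed_bytes = 300MB and ds_compressed_bytes = 100MB, the
+ * property is (300 * 100) / 100 = 300, i.e. the compression ratio
+ * scaled by 100; if nothing has been written yet (compressed == 0),
+ * the value defaults to 100, a 1:1 ratio.
+ */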
+
+void
+dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
+{
+ stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
+ stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
+ if (ds->ds_phys->ds_next_snap_obj) {
+ stat->dds_is_snapshot = B_TRUE;
+ stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
+ }
+
+ /* clone origin is really a dsl_dir thing... */
+ if (ds->ds_dir->dd_phys->dd_clone_parent_obj) {
+ dsl_dataset_t *ods;
+
+ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
+ VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool,
+ ds->ds_dir->dd_phys->dd_clone_parent_obj,
+ NULL, DS_MODE_NONE, FTAG, &ods));
+ dsl_dataset_name(ods, stat->dds_clone_of);
+ dsl_dataset_close(ods, DS_MODE_NONE, FTAG);
+ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+ }
+}
+
+uint64_t
+dsl_dataset_fsid_guid(dsl_dataset_t *ds)
+{
+ return (ds->ds_phys->ds_fsid_guid);
+}
+
+void
+dsl_dataset_space(dsl_dataset_t *ds,
+ uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+ *refdbytesp = ds->ds_phys->ds_used_bytes;
+ *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
+ *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
+ *availobjsp = DN_MAX_OBJECT - *usedobjsp;
+}
+
+/* ARGSUSED */
+static int
+dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ char *newsnapname = arg2;
+ dsl_dir_t *dd = ds->ds_dir;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dsl_dataset_t *hds;
+ uint64_t val;
+ int err;
+
+ err = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds);
+ if (err)
+ return (err);
+
+ /* new name better not be in use */
+ err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj,
+ newsnapname, 8, 1, &val);
+ dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
+
+ if (err == 0)
+ err = EEXIST;
+ else if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
+static void
+dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ char *newsnapname = arg2;
+ dsl_dir_t *dd = ds->ds_dir;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dsl_dataset_t *hds;
+ int err;
+
+ ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
+
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds));
+
+ VERIFY(0 == dsl_dataset_get_snapname(ds));
+ err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj,
+ ds->ds_snapname, tx);
+ ASSERT3U(err, ==, 0);
+ mutex_enter(&ds->ds_lock);
+ (void) strcpy(ds->ds_snapname, newsnapname);
+ mutex_exit(&ds->ds_lock);
+ err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
+ ds->ds_snapname, 8, 1, &ds->ds_object, tx);
+ ASSERT3U(err, ==, 0);
+
+ dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
+}
+
+#pragma weak dmu_objset_rename = dsl_dataset_rename
+int
+dsl_dataset_rename(const char *oldname, const char *newname)
+{
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ const char *tail;
+ int err;
+
+ err = dsl_dir_open(oldname, FTAG, &dd, &tail);
+ if (err)
+ return (err);
+ if (tail == NULL) {
+ err = dsl_dir_rename(dd, newname);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+ }
+ if (tail[0] != '@') {
+ /* the name ended in a nonexistent component */
+ dsl_dir_close(dd, FTAG);
+ return (ENOENT);
+ }
+
+ dsl_dir_close(dd, FTAG);
+
+ /* new name must be snapshot in same filesystem */
+ tail = strchr(newname, '@');
+ if (tail == NULL)
+ return (EINVAL);
+ tail++;
+ if (strncmp(oldname, newname, tail - newname) != 0)
+ return (EXDEV);
+
+ err = dsl_dataset_open(oldname,
+ DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds);
+ if (err)
+ return (err);
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_snapshot_rename_check,
+ dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
+
+ dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+
+ return (err);
+}
+
+struct promotearg {
+ uint64_t used, comp, uncomp, unique;
+ uint64_t newnext_obj, snapnames_obj;
+};
+
+static int
+dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *hds = arg1;
+ struct promotearg *pa = arg2;
+ dsl_dir_t *dd = hds->ds_dir;
+ dsl_pool_t *dp = hds->ds_dir->dd_pool;
+ dsl_dir_t *pdd = NULL;
+ dsl_dataset_t *ds = NULL;
+ dsl_dataset_t *pivot_ds = NULL;
+ dsl_dataset_t *newnext_ds = NULL;
+ int err;
+ char *name = NULL;
+ uint64_t itor = 0;
+ blkptr_t bp;
+
+ bzero(pa, sizeof (*pa));
+
+ /* Check that it is a clone */
+ if (dd->dd_phys->dd_clone_parent_obj == 0)
+ return (EINVAL);
+
+ /* Since this is so expensive, don't do the preliminary check */
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ if (err = dsl_dataset_open_obj(dp,
+ dd->dd_phys->dd_clone_parent_obj,
+ NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds))
+ goto out;
+ pdd = pivot_ds->ds_dir;
+
+ {
+ dsl_dataset_t *phds;
+ if (err = dsl_dataset_open_obj(dd->dd_pool,
+ pdd->dd_phys->dd_head_dataset_obj,
+ NULL, DS_MODE_NONE, FTAG, &phds))
+ goto out;
+ pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj;
+ dsl_dataset_close(phds, DS_MODE_NONE, FTAG);
+ }
+
+ if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
+ err = EXDEV;
+ goto out;
+ }
+
+ /* find pivot point's new next ds */
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object,
+ NULL, DS_MODE_NONE, FTAG, &newnext_ds));
+ while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) {
+ dsl_dataset_t *prev;
+
+ if (err = dsl_dataset_open_obj(dd->dd_pool,
+ newnext_ds->ds_phys->ds_prev_snap_obj,
+ NULL, DS_MODE_NONE, FTAG, &prev))
+ goto out;
+ dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
+ newnext_ds = prev;
+ }
+ pa->newnext_obj = newnext_ds->ds_object;
+
+ /* compute pivot point's new unique space */
+ while ((err = bplist_iterate(&newnext_ds->ds_deadlist,
+ &itor, &bp)) == 0) {
+ if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg)
+ pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp);
+ }
+ if (err != ENOENT)
+ goto out;
+
+ /* Walk the snapshots that we are moving */
+ name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ ds = pivot_ds;
+ /* CONSTCOND */
+ while (TRUE) {
+ uint64_t val, dlused, dlcomp, dluncomp;
+ dsl_dataset_t *prev;
+
+ /* Check that the snapshot name does not conflict */
+ dsl_dataset_name(ds, name);
+ err = zap_lookup(dd->dd_pool->dp_meta_objset,
+ hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
+ 8, 1, &val);
+ if (err != ENOENT) {
+ if (err == 0)
+ err = EEXIST;
+ goto out;
+ }
+
+ /*
+ * compute space to transfer. Each snapshot gave birth to:
+ * (my used) - (prev's used) + (deadlist's used)
+ */
+ pa->used += ds->ds_phys->ds_used_bytes;
+ pa->comp += ds->ds_phys->ds_compressed_bytes;
+ pa->uncomp += ds->ds_phys->ds_uncompressed_bytes;
+
+ /* If we reach the first snapshot, we're done. */
+ if (ds->ds_phys->ds_prev_snap_obj == 0)
+ break;
+
+ if (err = bplist_space(&ds->ds_deadlist,
+ &dlused, &dlcomp, &dluncomp))
+ goto out;
+ if (err = dsl_dataset_open_obj(dd->dd_pool,
+ ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
+ FTAG, &prev))
+ goto out;
+ pa->used += dlused - prev->ds_phys->ds_used_bytes;
+ pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes;
+ pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes;
+
+ /*
+ * We could be a clone of a clone. If we reach our
+ * parent's branch point, we're done.
+ */
+ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
+ break;
+ }
+ if (ds != pivot_ds)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ ds = prev;
+ }
+
+ /* Check that there is enough space here */
+ err = dsl_dir_transfer_possible(pdd, dd, pa->used);
+
+out:
+ if (ds && ds != pivot_ds)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ if (pivot_ds)
+ dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
+ if (newnext_ds)
+ dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
+ if (name)
+ kmem_free(name, MAXPATHLEN);
+ return (err);
+}
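+/*
+ * Illustrative example of the space computation above: if a snapshot
+ * being moved references 10GB (ds_used_bytes), its previous snapshot
+ * referenced 8GB, and its deadlist accounts for 1GB, it "gave birth to"
+ * 10 - 8 + 1 = 3GB; dsl_dataset_promote_sync() then charges that amount
+ * to the promoted dsl_dir and credits it back to the original parent.
+ */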
+
+static void
+dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *hds = arg1;
+ struct promotearg *pa = arg2;
+ dsl_dir_t *dd = hds->ds_dir;
+ dsl_pool_t *dp = hds->ds_dir->dd_pool;
+ dsl_dir_t *pdd = NULL;
+ dsl_dataset_t *ds, *pivot_ds;
+ char *name;
+
+ ASSERT(dd->dd_phys->dd_clone_parent_obj != 0);
+ ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
+
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ dd->dd_phys->dd_clone_parent_obj,
+ NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds));
+ /*
+ * We need to explicitly open pdd, since pivot_ds's pdd will be
+ * changing.
+ */
+ VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object,
+ NULL, FTAG, &pdd));
+
+ /* move snapshots to this dir */
+ name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ ds = pivot_ds;
+ /* CONSTCOND */
+ while (TRUE) {
+ dsl_dataset_t *prev;
+
+ /* move snap name entry */
+ dsl_dataset_name(ds, name);
+ VERIFY(0 == zap_remove(dp->dp_meta_objset,
+ pa->snapnames_obj, ds->ds_snapname, tx));
+ VERIFY(0 == zap_add(dp->dp_meta_objset,
+ hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
+ 8, 1, &ds->ds_object, tx));
+
+ /* change containing dsl_dir */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object);
+ ds->ds_phys->ds_dir_obj = dd->dd_object;
+ ASSERT3P(ds->ds_dir, ==, pdd);
+ dsl_dir_close(ds->ds_dir, ds);
+ VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
+ NULL, ds, &ds->ds_dir));
+
+ ASSERT3U(dsl_prop_numcb(ds), ==, 0);
+
+ if (ds->ds_phys->ds_prev_snap_obj == 0)
+ break;
+
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
+ FTAG, &prev));
+
+ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
+ break;
+ }
+ if (ds != pivot_ds)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ ds = prev;
+ }
+ if (ds != pivot_ds)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+
+ /* change pivot point's next snap */
+ dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx);
+ pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
+
+ /* change clone parentage */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object);
+ dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj;
+ dmu_buf_will_dirty(pdd->dd_dbuf, tx);
+ pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object;
+
+ /* change space accounting */
+ dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx);
+ dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx);
+ pivot_ds->ds_phys->ds_unique_bytes = pa->unique;
+
+ dsl_dir_close(pdd, FTAG);
+ dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
+ kmem_free(name, MAXPATHLEN);
+}
+
+int
+dsl_dataset_promote(const char *name)
+{
+ dsl_dataset_t *ds;
+ int err;
+ dmu_object_info_t doi;
+ struct promotearg pa;
+
+ err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds);
+ if (err)
+ return (err);
+
+ err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, &doi);
+ if (err) {
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ return (err);
+ }
+
+ /*
+ * Add in 128x the snapnames zapobj size, since we will be moving
+ * a bunch of snapnames to the promoted ds, and dirtying their
+ * bonus buffers.
+ */
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_promote_check,
+ dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ return (err);
+}
+
+/*
+ * Given a pool name and a dataset object number in that pool,
+ * return the name of that dataset.
+ */
+int
+dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
+{
+ spa_t *spa;
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds = NULL;
+ int error;
+
+ if ((error = spa_open(pname, &spa, FTAG)) != 0)
+ return (error);
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ if ((error = dsl_dataset_open_obj(dp, obj,
+ NULL, DS_MODE_NONE, FTAG, &ds)) != 0) {
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
+ return (error);
+ }
+ dsl_dataset_name(ds, buf);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
+
+ return (0);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
new file mode 100644
index 000000000000..97779a2f0468
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
@@ -0,0 +1,1192 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include "zfs_namecheck.h"
+
+static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd);
+static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx);
+
+
+/* ARGSUSED */
+static void
+dsl_dir_evict(dmu_buf_t *db, void *arg)
+{
+ dsl_dir_t *dd = arg;
+ dsl_pool_t *dp = dd->dd_pool;
+ int t;
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
+ ASSERT(dd->dd_tempreserved[t] == 0);
+ ASSERT(dd->dd_space_towrite[t] == 0);
+ }
+
+ ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes);
+
+ if (dd->dd_parent)
+ dsl_dir_close(dd->dd_parent, dd);
+
+ spa_close(dd->dd_pool->dp_spa, dd);
+
+ /*
+ * The props callback list should be empty since they hold the
+ * dir open.
+ */
+ list_destroy(&dd->dd_prop_cbs);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+}
+
+int
+dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag, dsl_dir_t **ddp)
+{
+ dmu_buf_t *dbuf;
+ dsl_dir_t *dd;
+ int err;
+
+ ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+ dsl_pool_sync_context(dp));
+
+ err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
+ if (err)
+ return (err);
+ dd = dmu_buf_get_user(dbuf);
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbuf, &doi);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
+ }
+#endif
+ /* XXX assert bonus buffer size is correct */
+ if (dd == NULL) {
+ dsl_dir_t *winner;
+ int err;
+
+ dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
+ dd->dd_object = ddobj;
+ dd->dd_dbuf = dbuf;
+ dd->dd_pool = dp;
+ dd->dd_phys = dbuf->db_data;
+ dd->dd_used_bytes = dd->dd_phys->dd_used_bytes;
+ mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
+ offsetof(dsl_prop_cb_record_t, cbr_node));
+
+ if (dd->dd_phys->dd_parent_obj) {
+ err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
+ NULL, dd, &dd->dd_parent);
+ if (err) {
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+ if (tail) {
+#ifdef ZFS_DEBUG
+ uint64_t foundobj;
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dd->dd_parent->dd_phys->
+ dd_child_dir_zapobj,
+ tail, sizeof (foundobj), 1, &foundobj);
+ ASSERT(err || foundobj == ddobj);
+#endif
+ (void) strcpy(dd->dd_myname, tail);
+ } else {
+ err = zap_value_search(dp->dp_meta_objset,
+ dd->dd_parent->dd_phys->
+ dd_child_dir_zapobj,
+ ddobj, dd->dd_myname);
+ }
+ if (err) {
+ dsl_dir_close(dd->dd_parent, dd);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+ } else {
+ (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
+ }
+
+ winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
+ dsl_dir_evict);
+ if (winner) {
+ if (dd->dd_parent)
+ dsl_dir_close(dd->dd_parent, dd);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dd = winner;
+ } else {
+ spa_open_ref(dp->dp_spa, dd);
+ }
+ }
+
+ /*
+ * The dsl_dir_t has both open-to-close and instantiate-to-evict
+ * holds on the spa. We need the open-to-close holds because
+ * otherwise the spa_refcnt wouldn't change when we open a
+ * dir which the spa also has open, so we could incorrectly
+ * think it was OK to unload/export/destroy the pool. We need
+ * the instantiate-to-evict hold because the dsl_dir_t has a
+ * pointer to the dd_pool, which has a pointer to the spa_t.
+ */
+ spa_open_ref(dp->dp_spa, tag);
+ ASSERT3P(dd->dd_pool, ==, dp);
+ ASSERT3U(dd->dd_object, ==, ddobj);
+ ASSERT3P(dd->dd_dbuf, ==, dbuf);
+ *ddp = dd;
+ return (0);
+}
+
+void
+dsl_dir_close(dsl_dir_t *dd, void *tag)
+{
+ dprintf_dd(dd, "%s\n", "");
+ spa_close(dd->dd_pool->dp_spa, tag);
+ dmu_buf_rele(dd->dd_dbuf, tag);
+}
+
+/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
+void
+dsl_dir_name(dsl_dir_t *dd, char *buf)
+{
+ if (dd->dd_parent) {
+ dsl_dir_name(dd->dd_parent, buf);
+ (void) strcat(buf, "/");
+ } else {
+ buf[0] = '\0';
+ }
+ if (!MUTEX_HELD(&dd->dd_lock)) {
+ /*
+ * recursive mutex so that we can use
+ * dprintf_dd() with dd_lock held
+ */
+ mutex_enter(&dd->dd_lock);
+ (void) strcat(buf, dd->dd_myname);
+ mutex_exit(&dd->dd_lock);
+ } else {
+ (void) strcat(buf, dd->dd_myname);
+ }
+}
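dsl_dir_name() builds a directory's full name by recursing to the pool root and appending a '/' plus each component on the way back down. A minimal user-space sketch of the same recursion (illustrative only, hypothetical struct, no locking):

#include <stdio.h>
#include <string.h>

/* Illustrative stand-in for dsl_dir_t: just a parent link and a name. */
struct dir {
	struct dir *parent;
	const char *myname;
};

/* Walk to the root first, then append "/name" components on the way back. */
static void
dir_name(const struct dir *d, char *buf)
{
	if (d->parent != NULL) {
		dir_name(d->parent, buf);
		(void) strcat(buf, "/");
	} else {
		buf[0] = '\0';
	}
	(void) strcat(buf, d->myname);
}

int
main(void)
{
	struct dir pool = { NULL, "tank" };
	struct dir home = { &pool, "home" };
	struct dir user = { &home, "user" };
	char buf[256];

	dir_name(&user, buf);
	printf("%s\n", buf);	/* prints "tank/home/user" */
	return (0);
}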
+
+int
+dsl_dir_is_private(dsl_dir_t *dd)
+{
+ int rv = FALSE;
+
+ if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent))
+ rv = TRUE;
+ if (dataset_name_hidden(dd->dd_myname))
+ rv = TRUE;
+ return (rv);
+}
+
+
+static int
+getcomponent(const char *path, char *component, const char **nextp)
+{
+ char *p;
+ if (path == NULL)
+ return (ENOENT);
+ /* This would be a good place to reserve some namespace... */
+ p = strpbrk(path, "/@");
+ if (p && (p[1] == '/' || p[1] == '@')) {
+ /* two separators in a row */
+ return (EINVAL);
+ }
+ if (p == NULL || p == path) {
+ /*
+ * if the first thing is an @ or /, it had better be an
+ * @ and it had better not have any more ats or slashes,
+ * and it had better have something after the @.
+ */
+ if (p != NULL &&
+ (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
+ return (EINVAL);
+ if (strlen(path) >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strcpy(component, path);
+ p = NULL;
+ } else if (p[0] == '/') {
+ if (p-path >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strncpy(component, path, p - path);
+ component[p-path] = '\0';
+ p++;
+ } else if (p[0] == '@') {
+ /*
+ * if the next separator is an @, there better not be
+ * any more slashes.
+ */
+ if (strchr(path, '/'))
+ return (EINVAL);
+ if (p-path >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strncpy(component, path, p - path);
+ component[p-path] = '\0';
+ } else {
+ ASSERT(!"invalid p");
+ }
+ *nextp = p;
+ return (0);
+}
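getcomponent() peels one name component off the front of a dataset path, treating '/' as a directory separator and '@' as the start of a snapshot name, and rejecting doubled separators. A rough user-space sketch of that splitting (illustrative only; the length checks and leading-separator edge cases handled above are omitted):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/*
 * Copy the first path component into 'component' and point *nextp at the
 * remainder (NULL when the path is exhausted).  Mirrors the '/'-and-'@'
 * rules used above, minus the MAXNAMELEN checks.
 */
static int
next_component(const char *path, char *component, const char **nextp)
{
	const char *p;

	if (path == NULL)
		return (ENOENT);
	p = strpbrk(path, "/@");
	if (p != NULL && (p[1] == '/' || p[1] == '@'))
		return (EINVAL);			/* doubled separator */
	if (p == NULL) {
		(void) strcpy(component, path);
		*nextp = NULL;
	} else {
		(void) strncpy(component, path, p - path);
		component[p - path] = '\0';
		*nextp = (*p == '/') ? p + 1 : p;	/* keep the '@' visible */
	}
	return (0);
}

int
main(void)
{
	const char *next = "tank/home/user@monday";
	char comp[256];

	while (next != NULL && next_component(next, comp, &next) == 0) {
		printf("component: %s\n", comp);
		if (next != NULL && next[0] == '@') {
			printf("snapshot:  %s\n", next + 1);
			break;
		}
	}
	return (0);
}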
+
+/*
+ * Same as dsl_dir_open, but ignore the first component of name and use
+ * the spa instead.
+ */
+int
+dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+ dsl_dir_t **ddp, const char **tailp)
+{
+ char buf[MAXNAMELEN];
+ const char *next, *nextnext = NULL;
+ int err;
+ dsl_dir_t *dd;
+ dsl_pool_t *dp;
+ uint64_t ddobj;
+ int openedspa = FALSE;
+
+ dprintf("%s\n", name);
+
+ err = getcomponent(name, buf, &next);
+ if (err)
+ return (err);
+ if (spa == NULL) {
+ err = spa_open(buf, &spa, FTAG);
+ if (err) {
+ dprintf("spa_open(%s) failed\n", buf);
+ return (err);
+ }
+ openedspa = TRUE;
+
+ /* XXX this assertion belongs in spa_open */
+ ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
+ }
+
+ dp = spa_get_dsl(spa);
+
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
+ if (err) {
+ rw_exit(&dp->dp_config_rwlock);
+ if (openedspa)
+ spa_close(spa, FTAG);
+ return (err);
+ }
+
+ while (next != NULL) {
+ dsl_dir_t *child_ds;
+ err = getcomponent(next, buf, &nextnext);
+ if (err)
+ break;
+ ASSERT(next[0] != '\0');
+ if (next[0] == '@')
+ break;
+ dprintf("looking up %s in obj%lld\n",
+ buf, dd->dd_phys->dd_child_dir_zapobj);
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dd->dd_phys->dd_child_dir_zapobj,
+ buf, sizeof (ddobj), 1, &ddobj);
+ if (err) {
+ if (err == ENOENT)
+ err = 0;
+ break;
+ }
+
+ err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
+ if (err)
+ break;
+ dsl_dir_close(dd, tag);
+ dd = child_ds;
+ next = nextnext;
+ }
+ rw_exit(&dp->dp_config_rwlock);
+
+ if (err) {
+ dsl_dir_close(dd, tag);
+ if (openedspa)
+ spa_close(spa, FTAG);
+ return (err);
+ }
+
+ /*
+ * It's an error if there's more than one component left, or
+ * tailp==NULL and there's any component left.
+ */
+ if (next != NULL &&
+ (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
+ /* bad path name */
+ dsl_dir_close(dd, tag);
+ dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
+ err = ENOENT;
+ }
+ if (tailp)
+ *tailp = next;
+ if (openedspa)
+ spa_close(spa, FTAG);
+ *ddp = dd;
+ return (err);
+}
+
+/*
+ * Return the dsl_dir_t, and possibly the last component which couldn't
+ * be found in *tail. Return NULL if the path is bogus, or if
+ * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@'
+ * means that the last component is a snapshot.
+ */
+int
+dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
+{
+ return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
+}
+
+uint64_t
+dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
+{
+ objset_t *mos = pds->dd_pool->dp_meta_objset;
+ uint64_t ddobj;
+ dsl_dir_phys_t *dsphys;
+ dmu_buf_t *dbuf;
+
+ ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
+ DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
+ VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+ name, sizeof (uint64_t), 1, &ddobj, tx));
+ VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+
+ dsphys->dd_creation_time = gethrestime_sec();
+ dsphys->dd_parent_obj = pds->dd_object;
+ dsphys->dd_props_zapobj = zap_create(mos,
+ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+ dsphys->dd_child_dir_zapobj = zap_create(mos,
+ DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
+ dmu_buf_rele(dbuf, FTAG);
+
+ return (ddobj);
+}
+
+/* ARGSUSED */
+int
+dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ int err;
+ uint64_t count;
+
+ /*
+ * There should be exactly two holds, both from
+ * dsl_dataset_destroy: one on the dd directory, and one on its
+	 * head ds. Otherwise, someone is trying to look up something
+ * inside this dir while we want to destroy it. The
+ * config_rwlock ensures that nobody else opens it after we
+ * check.
+ */
+ if (dmu_buf_refcount(dd->dd_dbuf) > 2)
+ return (EBUSY);
+
+ err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count);
+ if (err)
+ return (err);
+ if (count != 0)
+ return (EEXIST);
+
+ return (0);
+}
+
+void
+dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ uint64_t val, obj;
+
+ ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
+ ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
+
+ /* Remove our reservation. */
+ val = 0;
+ dsl_dir_set_reservation_sync(dd, &val, tx);
+ ASSERT3U(dd->dd_used_bytes, ==, 0);
+ ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
+
+ VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
+ VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
+ VERIFY(0 == zap_remove(mos,
+ dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
+
+ obj = dd->dd_object;
+ dsl_dir_close(dd, tag);
+ VERIFY(0 == dmu_object_free(mos, obj, tx));
+}
+
+void
+dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
+{
+ dsl_dir_phys_t *dsp;
+ dmu_buf_t *dbuf;
+ int error;
+
+ *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
+ DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
+
+ error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET,
+ sizeof (uint64_t), 1, ddobjp, tx);
+ ASSERT3U(error, ==, 0);
+
+ VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsp = dbuf->db_data;
+
+ dsp->dd_creation_time = gethrestime_sec();
+ dsp->dd_props_zapobj = zap_create(mos,
+ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+ dsp->dd_child_dir_zapobj = zap_create(mos,
+ DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
+
+ dmu_buf_rele(dbuf, FTAG);
+}
+
+void
+dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
+{
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
+ dsl_dir_space_available(dd, NULL, 0, TRUE));
+
+ mutex_enter(&dd->dd_lock);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
+ dd->dd_phys->dd_quota);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
+ dd->dd_phys->dd_reserved);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
+ dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
+ (dd->dd_phys->dd_uncompressed_bytes * 100 /
+ dd->dd_phys->dd_compressed_bytes));
+ mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_phys->dd_clone_parent_obj) {
+ dsl_dataset_t *ds;
+ char buf[MAXNAMELEN];
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_clone_parent_obj,
+ NULL, DS_MODE_NONE, FTAG, &ds));
+ dsl_dataset_name(ds, buf);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+
+ dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
+ }
+}
+
+void
+dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+
+ ASSERT(dd->dd_phys);
+
+ if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(dd->dd_dbuf, dd);
+ }
+}
+
+static int64_t
+parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
+{
+ uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
+ uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
+ return (new_accounted - old_accounted);
+}
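parent_delta() answers the question: if this directory's usage changes by delta, by how much does the amount charged to its parent change, given that the parent always charges at least dd_reserved? A small stand-alone illustration of the MAX()-based arithmetic (made-up numbers):

#include <inttypes.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

/* Same arithmetic as parent_delta(), with the reservation passed in. */
static int64_t
parent_delta_demo(uint64_t reserved, uint64_t used, int64_t delta)
{
	uint64_t old_accounted = MAX(used, reserved);
	uint64_t new_accounted = MAX(used + delta, reserved);

	return ((int64_t)(new_accounted - old_accounted));
}

int
main(void)
{
	uint64_t mb = 1024 * 1024;

	/* 10MB reserved, 4MB used: a 2MB write stays inside the reservation. */
	printf("%" PRId64 "\n", parent_delta_demo(10 * mb, 4 * mb, 2 * mb));	/* 0 */

	/* The same directory growing by 8MB spills 2MB past the reservation. */
	printf("%" PRId64 "\n", parent_delta_demo(10 * mb, 4 * mb, 8 * mb));	/* 2MB */
	return (0);
}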
+
+void
+dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ mutex_enter(&dd->dd_lock);
+ ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0);
+ dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
+ dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
+ dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
+ dd->dd_phys->dd_used_bytes = dd->dd_used_bytes;
+ mutex_exit(&dd->dd_lock);
+
+ /* release the hold from dsl_dir_dirty */
+ dmu_buf_rele(dd->dd_dbuf, dd);
+}
+
+static uint64_t
+dsl_dir_estimated_space(dsl_dir_t *dd)
+{
+ int64_t space;
+ int i;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ space = dd->dd_phys->dd_used_bytes;
+ ASSERT(space >= 0);
+ for (i = 0; i < TXG_SIZE; i++) {
+ space += dd->dd_space_towrite[i&TXG_MASK];
+ ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
+ }
+ return (space);
+}
+
+/*
+ * How much space would dd have available if ancestor had delta applied
+ * to it? If ondiskonly is set, we're only interested in what's
+ * on-disk, not estimated pending changes.
+ */
+uint64_t
+dsl_dir_space_available(dsl_dir_t *dd,
+ dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
+{
+ uint64_t parentspace, myspace, quota, used;
+
+ /*
+ * If there are no restrictions otherwise, assume we have
+ * unlimited space available.
+ */
+ quota = UINT64_MAX;
+ parentspace = UINT64_MAX;
+
+ if (dd->dd_parent != NULL) {
+ parentspace = dsl_dir_space_available(dd->dd_parent,
+ ancestor, delta, ondiskonly);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ if (dd->dd_phys->dd_quota != 0)
+ quota = dd->dd_phys->dd_quota;
+ if (ondiskonly) {
+ used = dd->dd_used_bytes;
+ } else {
+ used = dsl_dir_estimated_space(dd);
+ }
+ if (dd == ancestor)
+ used += delta;
+
+ if (dd->dd_parent == NULL) {
+ uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
+ quota = MIN(quota, poolsize);
+ }
+
+ if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
+ /*
+ * We have some space reserved, in addition to what our
+ * parent gave us.
+ */
+ parentspace += dd->dd_phys->dd_reserved - used;
+ }
+
+ if (used > quota) {
+ /* over quota */
+ myspace = 0;
+
+ /*
+ * While it's OK to be a little over quota, if
+ * we think we are using more space than there
+ * is in the pool (which is already 1.6% more than
+ * dsl_pool_adjustedsize()), something is very
+ * wrong.
+ */
+ ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa));
+ } else {
+ /*
+ * the lesser of the space provided by our parent and
+ * the space left in our quota
+ */
+ myspace = MIN(parentspace, quota - used);
+ }
+
+ mutex_exit(&dd->dd_lock);
+
+ return (myspace);
+}
+
+struct tempreserve {
+ list_node_t tr_node;
+ dsl_dir_t *tr_ds;
+ uint64_t tr_size;
+};
+
+/*
+ * Reserve space in this dsl_dir, to be used in this tx's txg.
+ * After the space has been dirtied (and thus
+ * dsl_dir_willuse_space() has been called), the reservation should
+ * be canceled, using dsl_dir_tempreserve_clear().
+ */
+static int
+dsl_dir_tempreserve_impl(dsl_dir_t *dd,
+ uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx)
+{
+ uint64_t txg = tx->tx_txg;
+ uint64_t est_used, quota, parent_rsrv;
+ int edquot = EDQUOT;
+ int txgidx = txg & TXG_MASK;
+ int i;
+ struct tempreserve *tr;
+
+ ASSERT3U(txg, !=, 0);
+ ASSERT3S(asize, >=, 0);
+
+ mutex_enter(&dd->dd_lock);
+ /*
+ * Check against the dsl_dir's quota. We don't add in the delta
+ * when checking for over-quota because they get one free hit.
+ */
+ est_used = dsl_dir_estimated_space(dd);
+ for (i = 0; i < TXG_SIZE; i++)
+ est_used += dd->dd_tempreserved[i];
+
+ quota = UINT64_MAX;
+
+ if (dd->dd_phys->dd_quota)
+ quota = dd->dd_phys->dd_quota;
+
+ /*
+ * If this transaction will result in a net free of space, we want
+ * to let it through, but we have to be careful: the space that it
+ * frees won't become available until *after* this txg syncs.
+ * Therefore, to ensure that it's possible to remove files from
+ * a full pool without inducing transient overcommits, we throttle
+ * netfree transactions against a quota that is slightly larger,
+ * but still within the pool's allocation slop. In cases where
+ * we're very close to full, this will allow a steady trickle of
+ * removes to get through.
+ */
+ if (dd->dd_parent == NULL) {
+ uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
+ if (poolsize < quota) {
+ quota = poolsize;
+ edquot = ENOSPC;
+ }
+ } else if (netfree) {
+ quota = UINT64_MAX;
+ }
+
+ /*
+	 * If they are requesting more space and our current estimate
+	 * is over quota, they get to try again unless the actual
+	 * on-disk usage is over quota and there are no pending changes
+	 * (which may free up space for us).
+ */
+ if (asize > 0 && est_used > quota) {
+ if (dd->dd_space_towrite[txg & TXG_MASK] != 0 ||
+ dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 ||
+ dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 ||
+ dd->dd_used_bytes < quota)
+ edquot = ERESTART;
+ dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
+ "quota=%lluK tr=%lluK err=%d\n",
+ dd->dd_used_bytes>>10, est_used>>10,
+ quota>>10, asize>>10, edquot);
+ mutex_exit(&dd->dd_lock);
+ return (edquot);
+ }
+
+ /* We need to up our estimated delta before dropping dd_lock */
+ dd->dd_tempreserved[txgidx] += asize;
+
+ parent_rsrv = parent_delta(dd, est_used, asize);
+ mutex_exit(&dd->dd_lock);
+
+ tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_ds = dd;
+ tr->tr_size = asize;
+ list_insert_tail(tr_list, tr);
+
+ /* see if it's OK with our parent */
+ if (dd->dd_parent && parent_rsrv) {
+ return (dsl_dir_tempreserve_impl(dd->dd_parent,
+ parent_rsrv, netfree, tr_list, tx));
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Reserve space in this dsl_dir, to be used in this tx's txg.
+ * After the space has been dirtied (and thus
+ * dsl_dir_willuse_space() has been called), the reservation should
+ * be canceled, using dsl_dir_tempreserve_clear().
+ */
+int
+dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
+ uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx)
+{
+ int err = 0;
+ list_t *tr_list;
+
+ tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(tr_list, sizeof (struct tempreserve),
+ offsetof(struct tempreserve, tr_node));
+ ASSERT3S(asize, >=, 0);
+ ASSERT3S(fsize, >=, 0);
+
+ err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
+ tr_list, tx);
+
+ if (err == 0) {
+ struct tempreserve *tr;
+
+ err = arc_tempreserve_space(lsize);
+ if (err == 0) {
+ tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_ds = NULL;
+ tr->tr_size = lsize;
+ list_insert_tail(tr_list, tr);
+ }
+ }
+
+ if (err)
+ dsl_dir_tempreserve_clear(tr_list, tx);
+ else
+ *tr_cookiep = tr_list;
+ return (err);
+}
+
+/*
+ * Clear a temporary reservation that we previously made with
+ * dsl_dir_tempreserve_space().
+ */
+void
+dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
+{
+ int txgidx = tx->tx_txg & TXG_MASK;
+ list_t *tr_list = tr_cookie;
+ struct tempreserve *tr;
+
+ ASSERT3U(tx->tx_txg, !=, 0);
+
+ while (tr = list_head(tr_list)) {
+ if (tr->tr_ds == NULL) {
+ arc_tempreserve_clear(tr->tr_size);
+ } else {
+ mutex_enter(&tr->tr_ds->dd_lock);
+ ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
+ tr->tr_size);
+ tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
+ mutex_exit(&tr->tr_ds->dd_lock);
+ }
+ list_remove(tr_list, tr);
+ kmem_free(tr, sizeof (struct tempreserve));
+ }
+
+ kmem_free(tr_list, sizeof (list_t));
+}
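The reserve/clear pair is meant to bracket the dirtying of data within a single txg: the caller reserves before dirtying and clears once dsl_dir_willuse_space() has taken over the accounting. A toy stand-alone model of the per-txg counters (illustrative only; the real code walks the whole ancestor chain and also reserves ARC space):

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

#define	TXG_SIZE	4
#define	TXG_MASK	(TXG_SIZE - 1)

/* Toy model of the per-txg in-flight reservation counters. */
static uint64_t tempreserved[TXG_SIZE];

static void
tempreserve(uint64_t txg, uint64_t asize)
{
	tempreserved[txg & TXG_MASK] += asize;
}

static void
tempreserve_clear(uint64_t txg, uint64_t asize)
{
	assert(tempreserved[txg & TXG_MASK] >= asize);
	tempreserved[txg & TXG_MASK] -= asize;
}

int
main(void)
{
	uint64_t txg = 37;

	tempreserve(txg, 128 * 1024);		/* before dirtying the data */
	/* ... dirty buffers; the willuse_space accounting takes over ... */
	tempreserve_clear(txg, 128 * 1024);	/* reservation no longer needed */

	printf("slot %d now holds %" PRIu64 " bytes\n",
	    (int)(txg & TXG_MASK), tempreserved[txg & TXG_MASK]);
	return (0);
}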
+
+/*
+ * Call in open context when we think we're going to write/free space,
+ * e.g. when dirtying data. Be conservative (i.e. it is OK to write less
+ * than this or free more than this, but don't write more or free less).
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+{
+ int64_t parent_space;
+ uint64_t est_used;
+
+ mutex_enter(&dd->dd_lock);
+ if (space > 0)
+ dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
+
+ est_used = dsl_dir_estimated_space(dd);
+ parent_space = parent_delta(dd, est_used, space);
+ mutex_exit(&dd->dd_lock);
+
+ /* Make sure that we clean up dd_space_to* */
+ dsl_dir_dirty(dd, tx);
+
+ /* XXX this is potentially expensive and unnecessary... */
+ if (parent_space && dd->dd_parent)
+ dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
+}
+
+/* call from syncing context when we actually write/free space for this dd */
+void
+dsl_dir_diduse_space(dsl_dir_t *dd,
+ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
+{
+ int64_t accounted_delta;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dsl_dir_dirty(dd, tx);
+
+ mutex_enter(&dd->dd_lock);
+ accounted_delta = parent_delta(dd, dd->dd_used_bytes, used);
+ ASSERT(used >= 0 || dd->dd_used_bytes >= -used);
+ ASSERT(compressed >= 0 ||
+ dd->dd_phys->dd_compressed_bytes >= -compressed);
+ ASSERT(uncompressed >= 0 ||
+ dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
+ dd->dd_used_bytes += used;
+ dd->dd_phys->dd_uncompressed_bytes += uncompressed;
+ dd->dd_phys->dd_compressed_bytes += compressed;
+ mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_parent != NULL) {
+ dsl_dir_diduse_space(dd->dd_parent,
+ accounted_delta, compressed, uncompressed, tx);
+ }
+}
+
+/* ARGSUSED */
+static int
+dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ uint64_t *quotap = arg2;
+ uint64_t new_quota = *quotap;
+ int err = 0;
+ uint64_t towrite;
+
+ if (new_quota == 0)
+ return (0);
+
+ mutex_enter(&dd->dd_lock);
+ /*
+ * If we are doing the preliminary check in open context, and
+ * there are pending changes, then don't fail it, since the
+	 * pending changes could under-estimate the amount of space to be
+ * freed up.
+ */
+ towrite = dd->dd_space_towrite[0] + dd->dd_space_towrite[1] +
+ dd->dd_space_towrite[2] + dd->dd_space_towrite[3];
+ if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
+ (new_quota < dd->dd_phys->dd_reserved ||
+ new_quota < dsl_dir_estimated_space(dd))) {
+ err = ENOSPC;
+ }
+ mutex_exit(&dd->dd_lock);
+ return (err);
+}
+
+static void
+dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ uint64_t *quotap = arg2;
+ uint64_t new_quota = *quotap;
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ mutex_enter(&dd->dd_lock);
+ dd->dd_phys->dd_quota = new_quota;
+ mutex_exit(&dd->dd_lock);
+}
+
+int
+dsl_dir_set_quota(const char *ddname, uint64_t quota)
+{
+ dsl_dir_t *dd;
+ int err;
+
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err)
+ return (err);
+ /*
+ * If someone removes a file, then tries to set the quota, we
+ * want to make sure the file freeing takes effect.
+ */
+ txg_wait_open(dd->dd_pool, 0);
+
+ err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
+ dsl_dir_set_quota_sync, dd, &quota, 0);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+}
+
+/* ARGSUSED */
+static int
+dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ uint64_t *reservationp = arg2;
+ uint64_t new_reservation = *reservationp;
+ uint64_t used, avail;
+ int64_t delta;
+
+ if (new_reservation > INT64_MAX)
+ return (EOVERFLOW);
+
+ /*
+ * If we are doing the preliminary check in open context, the
+ * space estimates may be inaccurate.
+ */
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ mutex_enter(&dd->dd_lock);
+ used = dd->dd_used_bytes;
+ delta = MAX(used, new_reservation) -
+ MAX(used, dd->dd_phys->dd_reserved);
+ mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_parent) {
+ avail = dsl_dir_space_available(dd->dd_parent,
+ NULL, 0, FALSE);
+ } else {
+ avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
+ }
+
+ if (delta > 0 && delta > avail)
+ return (ENOSPC);
+ if (delta > 0 && dd->dd_phys->dd_quota > 0 &&
+ new_reservation > dd->dd_phys->dd_quota)
+ return (ENOSPC);
+ return (0);
+}
+
+static void
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ uint64_t *reservationp = arg2;
+ uint64_t new_reservation = *reservationp;
+ uint64_t used;
+ int64_t delta;
+
+ mutex_enter(&dd->dd_lock);
+ used = dd->dd_used_bytes;
+ delta = MAX(used, new_reservation) -
+ MAX(used, dd->dd_phys->dd_reserved);
+ mutex_exit(&dd->dd_lock);
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_reserved = new_reservation;
+
+ if (dd->dd_parent != NULL) {
+ /* Roll up this additional usage into our ancestors */
+ dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
+ }
+}
+
+int
+dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
+{
+ dsl_dir_t *dd;
+ int err;
+
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err)
+ return (err);
+ err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
+ dsl_dir_set_reservation_sync, dd, &reservation, 0);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+}
+
+static dsl_dir_t *
+closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
+{
+ for (; ds1; ds1 = ds1->dd_parent) {
+ dsl_dir_t *dd;
+ for (dd = ds2; dd; dd = dd->dd_parent) {
+ if (ds1 == dd)
+ return (dd);
+ }
+ }
+ return (NULL);
+}
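closest_common_ancestor() walks every ancestor of ds1 and, for each, scans ds2's ancestor chain for a match; the first hit is the nearest common ancestor. A stand-alone sketch of the same search over a toy parent-linked structure (hypothetical names):

#include <stdio.h>

struct node {
	struct node *parent;
	const char *name;
};

/* First ancestor of a (including a itself) that also appears above b. */
static struct node *
common_ancestor(struct node *a, struct node *b)
{
	for (; a != NULL; a = a->parent) {
		struct node *n;
		for (n = b; n != NULL; n = n->parent) {
			if (n == a)
				return (a);
		}
	}
	return (NULL);
}

int
main(void)
{
	struct node root = { NULL, "tank" };
	struct node home = { &root, "home" };
	struct node usr1 = { &home, "alice" };
	struct node proj = { &root, "projects" };

	printf("%s\n", common_ancestor(&usr1, &proj)->name);	/* "tank" */
	printf("%s\n", common_ancestor(&usr1, &home)->name);	/* "home" */
	return (0);
}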
+
+/*
+ * If delta is applied to dd, how much of that delta would be applied to
+ * ancestor? Syncing context only.
+ */
+static int64_t
+would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
+{
+ if (dd == ancestor)
+ return (delta);
+
+ mutex_enter(&dd->dd_lock);
+ delta = parent_delta(dd, dd->dd_used_bytes, delta);
+ mutex_exit(&dd->dd_lock);
+ return (would_change(dd->dd_parent, delta, ancestor));
+}
+
+struct renamearg {
+ dsl_dir_t *newparent;
+ const char *mynewname;
+};
+
+/* ARGSUSED */
+static int
+dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct renamearg *ra = arg2;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ int err;
+ uint64_t val;
+
+ /* There should be 2 references: the open and the dirty */
+ if (dmu_buf_refcount(dd->dd_dbuf) > 2)
+ return (EBUSY);
+
+ /* check for existing name */
+ err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
+ ra->mynewname, 8, 1, &val);
+ if (err == 0)
+ return (EEXIST);
+ if (err != ENOENT)
+ return (err);
+
+ if (ra->newparent != dd->dd_parent) {
+ /* is there enough space? */
+ uint64_t myspace =
+ MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
+
+ /* no rename into our descendant */
+ if (closest_common_ancestor(dd, ra->newparent) == dd)
+ return (EINVAL);
+
+ if (err = dsl_dir_transfer_possible(dd->dd_parent,
+ ra->newparent, myspace))
+ return (err);
+ }
+
+ return (0);
+}
+
+static void
+dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct renamearg *ra = arg2;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ int err;
+
+ ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
+
+ if (ra->newparent != dd->dd_parent) {
+ uint64_t myspace =
+ MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
+
+ dsl_dir_diduse_space(dd->dd_parent, -myspace,
+ -dd->dd_phys->dd_compressed_bytes,
+ -dd->dd_phys->dd_uncompressed_bytes, tx);
+ dsl_dir_diduse_space(ra->newparent, myspace,
+ dd->dd_phys->dd_compressed_bytes,
+ dd->dd_phys->dd_uncompressed_bytes, tx);
+ }
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ /* remove from old parent zapobj */
+ err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+ dd->dd_myname, tx);
+ ASSERT3U(err, ==, 0);
+
+ (void) strcpy(dd->dd_myname, ra->mynewname);
+ dsl_dir_close(dd->dd_parent, dd);
+ dd->dd_phys->dd_parent_obj = ra->newparent->dd_object;
+ VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
+ ra->newparent->dd_object, NULL, dd, &dd->dd_parent));
+
+ /* add to new parent zapobj */
+ err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
+ dd->dd_myname, 8, 1, &dd->dd_object, tx);
+ ASSERT3U(err, ==, 0);
+}
+
+int
+dsl_dir_rename(dsl_dir_t *dd, const char *newname)
+{
+ struct renamearg ra;
+ int err;
+
+ /* new parent should exist */
+ err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname);
+ if (err)
+ return (err);
+
+ /* can't rename to different pool */
+ if (dd->dd_pool != ra.newparent->dd_pool) {
+ err = ENXIO;
+ goto out;
+ }
+
+ /* new name should not already exist */
+ if (ra.mynewname == NULL) {
+ err = EEXIST;
+ goto out;
+ }
+
+
+ err = dsl_sync_task_do(dd->dd_pool,
+ dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
+
+out:
+ dsl_dir_close(ra.newparent, FTAG);
+ return (err);
+}
+
+int
+dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
+{
+ dsl_dir_t *ancestor;
+ int64_t adelta;
+ uint64_t avail;
+
+ ancestor = closest_common_ancestor(sdd, tdd);
+ adelta = would_change(sdd, -space, ancestor);
+ avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
+ if (avail < space)
+ return (ENOSPC);
+
+ return (0);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
new file mode 100644
index 000000000000..7046254db8b7
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -0,0 +1,255 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+
+static int
+dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp)
+{
+ uint64_t obj;
+ int err;
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
+ MOS_DIR_NAME, sizeof (obj), 1, &obj);
+ if (err)
+ return (err);
+
+ return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp));
+}
+
+static dsl_pool_t *
+dsl_pool_open_impl(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp;
+ blkptr_t *bp = spa_get_rootblkptr(spa);
+
+ dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
+ dp->dp_spa = spa;
+ dp->dp_meta_rootbp = *bp;
+ rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
+ txg_init(dp, txg);
+
+ txg_list_create(&dp->dp_dirty_datasets,
+ offsetof(dsl_dataset_t, ds_dirty_link));
+ txg_list_create(&dp->dp_dirty_dirs,
+ offsetof(dsl_dir_t, dd_dirty_link));
+ txg_list_create(&dp->dp_sync_tasks,
+ offsetof(dsl_sync_task_group_t, dstg_node));
+ list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t),
+ offsetof(dsl_dataset_t, ds_synced_link));
+
+ return (dp);
+}
+
+int
+dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
+{
+ int err;
+ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+ objset_impl_t *osi;
+
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
+ if (err)
+ goto out;
+ dp->dp_meta_objset = &osi->os;
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
+ &dp->dp_root_dir_obj);
+ if (err)
+ goto out;
+
+ err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir);
+ if (err)
+ goto out;
+
+ err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir);
+ if (err)
+ goto out;
+
+out:
+ rw_exit(&dp->dp_config_rwlock);
+ if (err)
+ dsl_pool_close(dp);
+ else
+ *dpp = dp;
+
+ return (err);
+}
+
+void
+dsl_pool_close(dsl_pool_t *dp)
+{
+ /* drop our reference from dsl_pool_open() */
+ if (dp->dp_mos_dir)
+ dsl_dir_close(dp->dp_mos_dir, dp);
+ if (dp->dp_root_dir)
+ dsl_dir_close(dp->dp_root_dir, dp);
+
+ /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
+ if (dp->dp_meta_objset)
+ dmu_objset_evict(NULL, dp->dp_meta_objset->os);
+
+ txg_list_destroy(&dp->dp_dirty_datasets);
+ txg_list_destroy(&dp->dp_dirty_dirs);
+ list_destroy(&dp->dp_synced_objsets);
+
+ arc_flush();
+ txg_fini(dp);
+ rw_destroy(&dp->dp_config_rwlock);
+ kmem_free(dp, sizeof (dsl_pool_t));
+}
+
+dsl_pool_t *
+dsl_pool_create(spa_t *spa, uint64_t txg)
+{
+ int err;
+ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+ dp->dp_meta_objset = &dmu_objset_create_impl(spa,
+ NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
+
+ /* create the pool directory */
+ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
+ ASSERT3U(err, ==, 0);
+
+ /* create and open the root dir */
+ dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
+ VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir));
+
+ /* create and open the meta-objset dir */
+ (void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx);
+ VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir));
+
+ dmu_tx_commit(tx);
+
+ return (dp);
+}
+
+void
+dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
+{
+ zio_t *zio;
+ dmu_tx_t *tx;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ dsl_sync_task_group_t *dstg;
+ objset_impl_t *mosi = dp->dp_meta_objset->os;
+ int err;
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+ if (!list_link_active(&ds->ds_synced_link))
+ list_insert_tail(&dp->dp_synced_objsets, ds);
+ else
+ dmu_buf_rele(ds->ds_dbuf, ds);
+ dsl_dataset_sync(ds, zio, tx);
+ }
+ err = zio_wait(zio);
+ ASSERT(err == 0);
+
+ while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
+ dsl_sync_task_group_sync(dstg, tx);
+ while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
+ dsl_dir_sync(dd, tx);
+
+ if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
+ list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dmu_objset_sync(mosi, zio, tx);
+ err = zio_wait(zio);
+ ASSERT(err == 0);
+ dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ }
+
+ dmu_tx_commit(tx);
+}
+
+void
+dsl_pool_zil_clean(dsl_pool_t *dp)
+{
+ dsl_dataset_t *ds;
+
+ while (ds = list_head(&dp->dp_synced_objsets)) {
+ list_remove(&dp->dp_synced_objsets, ds);
+ ASSERT(ds->ds_user_ptr != NULL);
+ zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
+ dmu_buf_rele(ds->ds_dbuf, ds);
+ }
+}
+
+/*
+ * TRUE if the current thread is the tx_sync_thread or if we
+ * are being called from SPA context during pool initialization.
+ */
+int
+dsl_pool_sync_context(dsl_pool_t *dp)
+{
+ return (curthread == dp->dp_tx.tx_sync_thread ||
+ spa_get_dsl(dp->dp_spa) == NULL);
+}
+
+uint64_t
+dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
+{
+ uint64_t space, resv;
+
+ /*
+ * Reserve about 1.6% (1/64), or at least 32MB, for allocation
+ * efficiency.
+ * XXX The intent log is not accounted for, so it must fit
+ * within this slop.
+ *
+ * If we're trying to assess whether it's OK to do a free,
+ * cut the reservation in half to allow forward progress
+ * (e.g. make it possible to rm(1) files from a full pool).
+ */
+ space = spa_get_dspace(dp->dp_spa);
+ resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
+ if (netfree)
+ resv >>= 1;
+
+ return (space - resv);
+}
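dsl_pool_adjustedsize() hides a slop of 1/64 of the pool, but at least half of SPA_MINDEVSIZE, from allocations, and halves that slop when the caller is freeing space. A stand-alone illustration of the arithmetic (it assumes SPA_MINDEVSIZE is 64MB, as the "32MB" in the comment above implies):

#include <inttypes.h>
#include <stdio.h>

#define	MAX(a, b)		((a) > (b) ? (a) : (b))
#define	SPA_MINDEVSIZE		(64ULL << 20)	/* assumed 64MB, per spa.h */

/* Same slop computation as dsl_pool_adjustedsize(). */
static uint64_t
adjustedsize(uint64_t space, int netfree)
{
	uint64_t resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);

	if (netfree)
		resv >>= 1;
	return (space - resv);
}

int
main(void)
{
	uint64_t gb = 1ULL << 30;

	/* A 100GB pool keeps ~1.56GB of slop for writes, ~0.78GB for frees. */
	printf("%" PRIu64 "\n", adjustedsize(100 * gb, 0));
	printf("%" PRIu64 "\n", adjustedsize(100 * gb, 1));

	/* Small pools are clamped to the 32MB (16MB for frees) minimum. */
	printf("%" PRIu64 "\n", adjustedsize(1 * gb, 0));
	return (0);
}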
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
new file mode 100644
index 000000000000..2fff66d06b1e
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
@@ -0,0 +1,501 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/spa.h>
+#include <sys/zio_checksum.h> /* for the default checksum value */
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+static int
+dodefault(const char *propname, int intsz, int numint, void *buf)
+{
+ zfs_prop_t prop;
+
+ if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL ||
+ zfs_prop_readonly(prop))
+ return (ENOENT);
+
+ if (zfs_prop_get_type(prop) == prop_type_string) {
+ if (intsz != 1)
+ return (EOVERFLOW);
+ (void) strncpy(buf, zfs_prop_default_string(prop), numint);
+ } else {
+ if (intsz != 8 || numint < 1)
+ return (EOVERFLOW);
+
+ *(uint64_t *)buf = zfs_prop_default_numeric(prop);
+ }
+
+ return (0);
+}
+
+static int
+dsl_prop_get_impl(dsl_dir_t *dd, const char *propname,
+ int intsz, int numint, void *buf, char *setpoint)
+{
+ int err = ENOENT;
+ zfs_prop_t prop;
+
+ if (setpoint)
+ setpoint[0] = '\0';
+
+ prop = zfs_name_to_prop(propname);
+
+ /*
+ * Note: dd may be NULL, therefore we shouldn't dereference it
+	 * outside this loop.
+ */
+ for (; dd != NULL; dd = dd->dd_parent) {
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
+ propname, intsz, numint, buf);
+ if (err != ENOENT) {
+ if (setpoint)
+ dsl_dir_name(dd, setpoint);
+ break;
+ }
+
+ /*
+ * Break out of this loop for non-inheritable properties.
+ */
+ if (prop != ZFS_PROP_INVAL &&
+ !zfs_prop_inheritable(prop))
+ break;
+ }
+ if (err == ENOENT)
+ err = dodefault(propname, intsz, numint, buf);
+
+ return (err);
+}
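dsl_prop_get_impl() implements property inheritance: walk from the dataset's directory up through its ancestors, take the first explicitly set value, and fall back to the built-in default at the top. A stand-alone sketch of that walk over a toy structure (hypothetical names and a made-up numeric property; the real code looks each value up in the directory's props ZAP):

#include <stdio.h>

struct dir {
	struct dir *parent;
	const char *name;
	int has_value;		/* property set locally? */
	unsigned long value;
};

/* Walk up the parent chain; the first explicit setting wins, else the default. */
static unsigned long
prop_get(const struct dir *dd, unsigned long def, const char **setpoint)
{
	for (; dd != NULL; dd = dd->parent) {
		if (dd->has_value) {
			*setpoint = dd->name;
			return (dd->value);
		}
	}
	*setpoint = "default";
	return (def);
}

int
main(void)
{
	struct dir pool = { NULL, "tank", 1, 9 };
	struct dir home = { &pool, "tank/home", 0, 0 };
	struct dir user = { &home, "tank/home/alice", 0, 0 };
	const char *src;
	unsigned long v = prop_get(&user, 1, &src);

	printf("value=%lu inherited from %s\n", v, src);	/* 9, tank */
	return (0);
}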
+
+/*
+ * Register interest in the named property. We'll call the callback
+ * once to notify it of the current property value, and again each time
+ * the property changes, until this callback is unregistered.
+ *
+ * Return 0 on success, errno if the prop is not an integer value.
+ */
+int
+dsl_prop_register(dsl_dataset_t *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ uint64_t value;
+ dsl_prop_cb_record_t *cbr;
+ int err;
+ int need_rwlock;
+
+ need_rwlock = !RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock);
+ if (need_rwlock)
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+
+ err = dsl_prop_get_impl(dd, propname, 8, 1, &value, NULL);
+ if (err != 0) {
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+ return (err);
+ }
+
+ cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
+ cbr->cbr_ds = ds;
+ cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
+ (void) strcpy((char *)cbr->cbr_propname, propname);
+ cbr->cbr_func = callback;
+ cbr->cbr_arg = cbarg;
+ mutex_enter(&dd->dd_lock);
+ list_insert_head(&dd->dd_prop_cbs, cbr);
+ mutex_exit(&dd->dd_lock);
+
+ cbr->cbr_func(cbr->cbr_arg, value);
+
+ VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object,
+ NULL, cbr, &dd));
+ if (need_rwlock)
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+ /* Leave dataset open until this callback is unregistered */
+ return (0);
+}
+
+int
+dsl_prop_get_ds(dsl_dir_t *dd, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ int err;
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ err = dsl_prop_get_impl(dd, propname, intsz, numints, buf, setpoint);
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+
+ return (err);
+}
+
+int
+dsl_prop_get(const char *ddname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ dsl_dir_t *dd;
+ const char *tail;
+ int err;
+
+ err = dsl_dir_open(ddname, FTAG, &dd, &tail);
+ if (err)
+ return (err);
+ if (tail && tail[0] != '@') {
+ dsl_dir_close(dd, FTAG);
+ return (ENOENT);
+ }
+
+ err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint);
+
+ dsl_dir_close(dd, FTAG);
+ return (err);
+}
+
+/*
+ * Get the current property value. It may have changed by the time this
+ * function returns, so it is NOT safe to follow up with
+ * dsl_prop_register() and assume that the value has not changed in
+ * between.
+ *
+ * Return 0 on success, ENOENT if ddname is invalid.
+ */
+int
+dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint)
+{
+ return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
+}
+
+/*
+ * Unregister this callback. Return 0 on success, ENOENT if ddname is
+ * invalid, ENOMSG if no matching callback registered.
+ */
+int
+dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_cb_record_t *cbr;
+
+ mutex_enter(&dd->dd_lock);
+ for (cbr = list_head(&dd->dd_prop_cbs);
+ cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+ if (cbr->cbr_ds == ds &&
+ cbr->cbr_func == callback &&
+ cbr->cbr_arg == cbarg &&
+ strcmp(cbr->cbr_propname, propname) == 0)
+ break;
+ }
+
+ if (cbr == NULL) {
+ mutex_exit(&dd->dd_lock);
+ return (ENOMSG);
+ }
+
+ list_remove(&dd->dd_prop_cbs, cbr);
+ mutex_exit(&dd->dd_lock);
+ kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
+ kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+
+ /* Clean up from dsl_prop_register */
+ dsl_dir_close(dd, cbr);
+ return (0);
+}
+
+/*
+ * Return the number of callbacks that are registered for this dataset.
+ */
+int
+dsl_prop_numcb(dsl_dataset_t *ds)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_cb_record_t *cbr;
+ int num = 0;
+
+ mutex_enter(&dd->dd_lock);
+ for (cbr = list_head(&dd->dd_prop_cbs);
+ cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+ if (cbr->cbr_ds == ds)
+ num++;
+ }
+ mutex_exit(&dd->dd_lock);
+
+ return (num);
+}
+
+static void
+dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
+ const char *propname, uint64_t value, int first)
+{
+ dsl_dir_t *dd;
+ dsl_prop_cb_record_t *cbr;
+ objset_t *mos = dp->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int err;
+
+ ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+ err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
+ if (err)
+ return;
+
+ if (!first) {
+ /*
+ * If the prop is set here, then this change is not
+ * being inherited here or below; stop the recursion.
+ */
+ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
+ 8, 1, &value);
+ if (err == 0) {
+ dsl_dir_close(dd, FTAG);
+ return;
+ }
+ ASSERT3U(err, ==, ENOENT);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ for (cbr = list_head(&dd->dd_prop_cbs);
+ cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+ if (strcmp(cbr->cbr_propname, propname) == 0) {
+ cbr->cbr_func(cbr->cbr_arg, value);
+ }
+ }
+ mutex_exit(&dd->dd_lock);
+
+ for (zap_cursor_init(&zc, mos,
+ dd->dd_phys->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ /* XXX recursion could blow stack; esp. za! */
+ dsl_prop_changed_notify(dp, za.za_first_integer,
+ propname, value, FALSE);
+ }
+ zap_cursor_fini(&zc);
+ dsl_dir_close(dd, FTAG);
+}
+
+struct prop_set_arg {
+ const char *name;
+ int intsz;
+ int numints;
+ const void *buf;
+};
+
+
+static void
+dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct prop_set_arg *psa = arg2;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
+ uint64_t intval;
+ int isint;
+
+ isint = (dodefault(psa->name, 8, 1, &intval) == 0);
+
+ if (psa->numints == 0) {
+ int err = zap_remove(mos, zapobj, psa->name, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ if (isint) {
+ VERIFY(0 == dsl_prop_get_impl(dd->dd_parent,
+ psa->name, 8, 1, &intval, NULL));
+ }
+ } else {
+ VERIFY(0 == zap_update(mos, zapobj, psa->name,
+ psa->intsz, psa->numints, psa->buf, tx));
+ if (isint)
+ intval = *(uint64_t *)psa->buf;
+ }
+
+ if (isint) {
+ dsl_prop_changed_notify(dd->dd_pool,
+ dd->dd_object, psa->name, intval, TRUE);
+ }
+}
+
+int
+dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
+ int intsz, int numints, const void *buf)
+{
+ struct prop_set_arg psa;
+
+ psa.name = propname;
+ psa.intsz = intsz;
+ psa.numints = numints;
+ psa.buf = buf;
+
+ return (dsl_sync_task_do(dd->dd_pool,
+ NULL, dsl_prop_set_sync, dd, &psa, 2));
+}
+
+int
+dsl_prop_set(const char *ddname, const char *propname,
+ int intsz, int numints, const void *buf)
+{
+ dsl_dir_t *dd;
+ int err;
+
+ /*
+ * We must do these checks before we get to the syncfunc, since
+ * it can't fail.
+ */
+ if (strlen(propname) >= ZAP_MAXNAMELEN)
+ return (ENAMETOOLONG);
+ if (intsz * numints >= ZAP_MAXVALUELEN)
+ return (E2BIG);
+
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err)
+ return (err);
+ err = dsl_prop_set_dd(dd, propname, intsz, numints, buf);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+}
+
+/*
+ * Iterate over all properties for this dataset and return them in an nvlist.
+ */
+int
+dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
+{
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ dsl_dir_t *dd = ds->ds_dir;
+ int err = 0;
+ dsl_pool_t *dp;
+ objset_t *mos;
+
+ if (dsl_dataset_is_snapshot(ds)) {
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ return (0);
+ }
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ dp = dd->dd_pool;
+ mos = dp->dp_meta_objset;
+
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ for (; dd != NULL; dd = dd->dd_parent) {
+ char setpoint[MAXNAMELEN];
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ dsl_dir_name(dd, setpoint);
+
+ for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_props_zapobj);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ nvlist_t *propval;
+ zfs_prop_t prop;
+ /*
+ * Skip non-inheritable properties.
+ */
+ if ((prop = zfs_name_to_prop(za.za_name)) !=
+ ZFS_PROP_INVAL && !zfs_prop_inheritable(prop) &&
+ dd != ds->ds_dir)
+ continue;
+
+ if (nvlist_lookup_nvlist(*nvp, za.za_name,
+ &propval) == 0)
+ continue;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ if (za.za_integer_length == 1) {
+ /*
+ * String property
+ */
+ char *tmp = kmem_alloc(za.za_num_integers,
+ KM_SLEEP);
+ err = zap_lookup(mos,
+ dd->dd_phys->dd_props_zapobj,
+ za.za_name, 1, za.za_num_integers,
+ tmp);
+ if (err != 0) {
+ kmem_free(tmp, za.za_num_integers);
+ break;
+ }
+ VERIFY(nvlist_add_string(propval,
+ ZFS_PROP_VALUE, tmp) == 0);
+ kmem_free(tmp, za.za_num_integers);
+ } else {
+ /*
+ * Integer property
+ */
+ ASSERT(za.za_integer_length == 8);
+ (void) nvlist_add_uint64(propval,
+ ZFS_PROP_VALUE, za.za_first_integer);
+ }
+
+ VERIFY(nvlist_add_string(propval,
+ ZFS_PROP_SOURCE, setpoint) == 0);
+ VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
+ propval) == 0);
+ nvlist_free(propval);
+ }
+ zap_cursor_fini(&zc);
+
+ if (err != ENOENT)
+ break;
+ err = 0;
+ }
+ rw_exit(&dp->dp_config_rwlock);
+
+ return (err);
+}
+
+void
+dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
+{
+ nvlist_t *propval;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(propval, ZFS_PROP_VALUE, value) == 0);
+ VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
+ nvlist_free(propval);
+}
+
+void
+dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
+{
+ nvlist_t *propval;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(propval, ZFS_PROP_VALUE, value) == 0);
+ VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
+ nvlist_free(propval);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
new file mode 100644
index 000000000000..17deb569c4ab
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
@@ -0,0 +1,196 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+
+#define DST_AVG_BLKSHIFT 14
+
+/* ARGSUSED */
+static int
+dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ return (0);
+}
+
+dsl_sync_task_group_t *
+dsl_sync_task_group_create(dsl_pool_t *dp)
+{
+ dsl_sync_task_group_t *dstg;
+
+ dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP);
+ list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t),
+ offsetof(dsl_sync_task_t, dst_node));
+ dstg->dstg_pool = dp;
+
+ return (dstg);
+}
+
+void
+dsl_sync_task_create(dsl_sync_task_group_t *dstg,
+ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
+ void *arg1, void *arg2, int blocks_modified)
+{
+ dsl_sync_task_t *dst;
+
+ if (checkfunc == NULL)
+ checkfunc = dsl_null_checkfunc;
+ dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP);
+ dst->dst_checkfunc = checkfunc;
+ dst->dst_syncfunc = syncfunc;
+ dst->dst_arg1 = arg1;
+ dst->dst_arg2 = arg2;
+ list_insert_tail(&dstg->dstg_tasks, dst);
+
+ dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT;
+}
+
+int
+dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg)
+{
+ dmu_tx_t *tx;
+ uint64_t txg;
+ dsl_sync_task_t *dst;
+
+top:
+ tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir);
+ VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+ txg = dmu_tx_get_txg(tx);
+
+ /* Do a preliminary error check. */
+ dstg->dstg_err = 0;
+ rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER);
+ for (dst = list_head(&dstg->dstg_tasks); dst;
+ dst = list_next(&dstg->dstg_tasks, dst)) {
+#ifdef ZFS_DEBUG
+ /*
+		 * Only check half the time; otherwise, the sync-context
+		 * check would almost never fail.
+ */
+ if (spa_get_random(2) == 0)
+ continue;
+#endif
+ dst->dst_err =
+ dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
+ if (dst->dst_err)
+ dstg->dstg_err = dst->dst_err;
+ }
+ rw_exit(&dstg->dstg_pool->dp_config_rwlock);
+
+ if (dstg->dstg_err) {
+ dmu_tx_commit(tx);
+ return (dstg->dstg_err);
+ }
+
+ VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+
+ dmu_tx_commit(tx);
+
+ txg_wait_synced(dstg->dstg_pool, txg);
+
+ if (dstg->dstg_err == EAGAIN)
+ goto top;
+
+ return (dstg->dstg_err);
+}
+
+void
+dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg)
+{
+ dsl_sync_task_t *dst;
+
+ while (dst = list_head(&dstg->dstg_tasks)) {
+ list_remove(&dstg->dstg_tasks, dst);
+ kmem_free(dst, sizeof (dsl_sync_task_t));
+ }
+ kmem_free(dstg, sizeof (dsl_sync_task_group_t));
+}
+
+void
+dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
+{
+ dsl_sync_task_t *dst;
+ void *tr_cookie;
+
+ ASSERT3U(dstg->dstg_err, ==, 0);
+
+ /*
+ * Check for sufficient space.
+ */
+ dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir,
+ dstg->dstg_space, dstg->dstg_space * 3, 0, &tr_cookie, tx);
+ /* don't bother trying again */
+ if (dstg->dstg_err == ERESTART)
+ dstg->dstg_err = EAGAIN;
+ if (dstg->dstg_err)
+ return;
+
+ /*
+ * Check for errors by calling checkfuncs.
+ */
+ rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER);
+ for (dst = list_head(&dstg->dstg_tasks); dst;
+ dst = list_next(&dstg->dstg_tasks, dst)) {
+ dst->dst_err =
+ dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
+ if (dst->dst_err)
+ dstg->dstg_err = dst->dst_err;
+ }
+
+ if (dstg->dstg_err == 0) {
+ /*
+ * Execute sync tasks.
+ */
+ for (dst = list_head(&dstg->dstg_tasks); dst;
+ dst = list_next(&dstg->dstg_tasks, dst)) {
+ dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx);
+ }
+ }
+ rw_exit(&dstg->dstg_pool->dp_config_rwlock);
+
+ dsl_dir_tempreserve_clear(tr_cookie, tx);
+}
+
+int
+dsl_sync_task_do(dsl_pool_t *dp,
+ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
+ void *arg1, void *arg2, int blocks_modified)
+{
+ dsl_sync_task_group_t *dstg;
+ int err;
+
+ dstg = dsl_sync_task_group_create(dp);
+ dsl_sync_task_create(dstg, checkfunc, syncfunc,
+ arg1, arg2, blocks_modified);
+ err = dsl_sync_task_group_wait(dstg);
+ dsl_sync_task_group_destroy(dstg);
+ return (err);
+}
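dsl_sync_task_do() packages a check function that may fail with a sync function that must not: the check runs once as a preliminary test in open context, runs again authoritatively in syncing context, and only then is the sync function executed, with EAGAIN restarting the whole sequence. A stand-alone sketch of that check-then-sync retry shape (illustrative only; the real code queues the group on a txg list and waits for the sync thread):

#include <errno.h>
#include <stdio.h>

typedef int (*checkfunc_t)(void *);
typedef void (*syncfunc_t)(void *);

/*
 * Shape of the check-then-sync loop: the check may veto the operation,
 * the sync func only runs once the check has passed in syncing context,
 * and EAGAIN sends us back around for another try.
 */
static int
sync_task_do(checkfunc_t check, syncfunc_t sync, void *arg)
{
	int err;

	for (;;) {
		err = check(arg);		/* preliminary, open context */
		if (err != 0)
			return (err);
		/* ...queue for the sync thread; it re-runs check, then sync... */
		err = check(arg);		/* authoritative, syncing context */
		if (err == 0) {
			sync(arg);
			return (0);
		}
		if (err != EAGAIN)
			return (err);
	}
}

static int
demo_check(void *arg)
{
	int *remaining = arg;

	return (*remaining > 0 ? 0 : ENOSPC);	/* e.g. a space check */
}

static void
demo_sync(void *arg)
{
	int *remaining = arg;

	(*remaining)--;				/* the actual on-disk change */
}

int
main(void)
{
	int space = 1;

	printf("first:  %d\n", sync_task_do(demo_check, demo_sync, &space));
	printf("second: %d\n", sync_task_do(demo_check, demo_sync, &space));
	return (0);
}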
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c b/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c
new file mode 100644
index 000000000000..edda3c9a9d3d
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c
@@ -0,0 +1,145 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/byteorder.h>
+#include <sys/spa.h>
+
+void
+fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 += ip[0];
+ a1 += ip[1];
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+void
+fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 += BSWAP_64(ip[0]);
+ a1 += BSWAP_64(ip[1]);
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+void
+fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ for (a = b = c = d = 0; ip < ipend; ip++) {
+ a += ip[0];
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+void
+fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ for (a = b = c = d = 0; ip < ipend; ip++) {
+ a += BSWAP_32(ip[0]);
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+void
+fletcher_4_incremental_native(const void *buf, uint64_t size,
+ zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = zcp->zc_word[0];
+ b = zcp->zc_word[1];
+ c = zcp->zc_word[2];
+ d = zcp->zc_word[3];
+
+ for (; ip < ipend; ip++) {
+ a += ip[0];
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+void
+fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
+ zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = zcp->zc_word[0];
+ b = zcp->zc_word[1];
+ c = zcp->zc_word[2];
+ d = zcp->zc_word[3];
+
+ for (; ip < ipend; ip++) {
+ a += BSWAP_32(ip[0]);
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
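The incremental variants carry the four running sums in the checksum words between calls, so feeding a buffer in pieces gives the same result as one pass over the whole thing. A stand-alone demonstration of that equivalence (local stand-ins for the kernel types; same inner loop as fletcher_4_native()):

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint64_t zc_word[4]; } cksum_t;	/* stand-in for zio_cksum_t */

/* Same running sums as fletcher_4_native()/fletcher_4_incremental_native(). */
static void
fletcher4(const void *buf, uint64_t size, cksum_t *zcp, int incremental)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a = 0, b = 0, c = 0, d = 0;

	if (incremental) {
		a = zcp->zc_word[0];
		b = zcp->zc_word[1];
		c = zcp->zc_word[2];
		d = zcp->zc_word[3];
	}
	for (; ip < ipend; ip++) {
		a += ip[0];
		b += a;
		c += b;
		d += c;
	}
	zcp->zc_word[0] = a;
	zcp->zc_word[1] = b;
	zcp->zc_word[2] = c;
	zcp->zc_word[3] = d;
}

int
main(void)
{
	uint32_t data[256];
	cksum_t whole, parts;
	int i;

	for (i = 0; i < 256; i++)
		data[i] = (uint32_t)(i * 2654435761u);	/* arbitrary test data */

	fletcher4(data, sizeof (data), &whole, 0);

	memset(&parts, 0, sizeof (parts));
	fletcher4(data, 100 * sizeof (uint32_t), &parts, 1);
	fletcher4(data + 100, 156 * sizeof (uint32_t), &parts, 1);

	assert(memcmp(&whole, &parts, sizeof (whole)) == 0);
	printf("ok: %016" PRIx64 "\n", whole.zc_word[3]);
	return (0);
}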
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c b/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c
new file mode 100644
index 000000000000..b257d4af753c
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/zmod.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <strings.h>
+#endif
+
+size_t
+gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ size_t dstlen = d_len;
+
+ ASSERT(d_len <= s_len);
+
+ if (z_compress_level(d_start, &dstlen, s_start, s_len, n) != Z_OK) {
+ if (d_len != s_len)
+ return (s_len);
+
+ bcopy(s_start, d_start, s_len);
+ return (s_len);
+ }
+
+ return (dstlen);
+}
+
+/*ARGSUSED*/
+int
+gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ size_t dstlen = d_len;
+
+ ASSERT(d_len >= s_len);
+
+ if (z_uncompress(d_start, &dstlen, s_start, s_len) != Z_OK)
+ return (-1);
+
+ return (0);
+}
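gzip_compress() treats any failure, including output that will not fit in d_len, as "store the block uncompressed" and signals that by returning s_len. A user-space sketch of the same fall-back pattern, assuming zlib's compress2()/uncompress() in place of the kernel z_compress_level()/z_uncompress() wrappers (build with -lz):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

/*
 * User-space analogue of gzip_compress(): try to squeeze s_len bytes into
 * d_len; on any failure fall back to the uncompressed data and report
 * s_len so the caller knows no compression happened.
 */
static size_t
try_compress(const void *src, void *dst, size_t s_len, size_t d_len, int level)
{
	uLongf dstlen = d_len;

	if (compress2(dst, &dstlen, src, s_len, level) != Z_OK) {
		if (d_len == s_len)
			memcpy(dst, src, s_len);
		return (s_len);
	}
	return (dstlen);
}

int
main(void)
{
	char src[4096], packed[4096], unpacked[4096];
	uLongf outlen = sizeof (unpacked);
	size_t clen;

	memset(src, 'A', sizeof (src));		/* highly compressible */
	clen = try_compress(src, packed, sizeof (src), sizeof (packed), 6);
	printf("compressed %zu to %zu bytes\n", sizeof (src), clen);

	if (clen < sizeof (src)) {
		if (uncompress((Bytef *)unpacked, &outlen,
		    (Bytef *)packed, clen) != Z_OK)
			return (1);
		printf("round trip %s\n",
		    memcmp(src, unpacked, sizeof (src)) == 0 ? "ok" : "FAILED");
	}
	return (0);
}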
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c b/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
new file mode 100644
index 000000000000..a88b85c7ec25
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
@@ -0,0 +1,129 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * We keep our own copy of this algorithm for 2 main reasons:
+ * 1. If we didn't, anyone modifying common/os/compress.c would
+ * directly break our on disk format
+ * 2. Our version of lzjb does not have a number of checks that the
+ * common/os version needs and uses
+ * In particular, we are adding the "feature" that compress() can
+ * take a destination buffer size and return the source length (rather
+ * than compressed data) if the data will not compress to d_len or less.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+
+#define MATCH_BITS 6
+#define MATCH_MIN 3
+#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
+#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
+#define LEMPEL_SIZE 256
+
+/*ARGSUSED*/
+size_t
+lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *cpy, *copymap;
+ int copymask = 1 << (NBBY - 1);
+ int mlen, offset;
+ uint16_t *hp;
+ uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */
+
+ while (src < (uchar_t *)s_start + s_len) {
+ if ((copymask <<= 1) == (1 << NBBY)) {
+ if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
+ if (d_len != s_len)
+ return (s_len);
+ mlen = s_len;
+ for (src = s_start, dst = d_start; mlen; mlen--)
+ *dst++ = *src++;
+ return (s_len);
+ }
+ copymask = 1;
+ copymap = dst;
+ *dst++ = 0;
+ }
+ if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
+ *dst++ = *src++;
+ continue;
+ }
+ hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
+ (LEMPEL_SIZE - 1)];
+ offset = (intptr_t)(src - *hp) & OFFSET_MASK;
+ *hp = (uint16_t)(uintptr_t)src;
+ cpy = src - offset;
+ if (cpy >= (uchar_t *)s_start && cpy != src &&
+ src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
+ *copymap |= copymask;
+ for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
+ if (src[mlen] != cpy[mlen])
+ break;
+ *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
+ (offset >> NBBY);
+ *dst++ = (uchar_t)offset;
+ src += mlen;
+ } else {
+ *dst++ = *src++;
+ }
+ }
+ return (dst - (uchar_t *)d_start);
+}
+
+/*ARGSUSED*/
+int
+lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *d_end = (uchar_t *)d_start + d_len;
+ uchar_t *cpy, copymap;
+ int copymask = 1 << (NBBY - 1);
+
+ while (dst < d_end) {
+ if ((copymask <<= 1) == (1 << NBBY)) {
+ copymask = 1;
+ copymap = *src++;
+ }
+ if (copymap & copymask) {
+ int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
+ int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
+ src += 2;
+ if ((cpy = dst - offset) < (uchar_t *)d_start)
+ return (-1);
+ while (--mlen >= 0 && dst < d_end)
+ *dst++ = *cpy++;
+ } else {
+ *dst++ = *src++;
+ }
+ }
+ return (0);
+}
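
The compressed stream above is a sequence of items grouped eight at a time
behind a "copymap" byte: a clear bit means the next item is a literal byte, a
set bit means it is a two-byte match token whose high MATCH_BITS bits carry
(mlen - MATCH_MIN) and whose remaining 16 - MATCH_BITS bits carry the backward
offset. A small round-trip sketch of that token layout (constants copied from
the #defines above):

#include <assert.h>
#include <stdint.h>

#define NBBY        8
#define MATCH_BITS  6
#define MATCH_MIN   3
#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)

int
main(void)
{
    int mlen = 10, offset = 300;    /* copy 10 bytes from 300 bytes back */
    uint8_t b0, b1;

    /* Pack the token the same way lzjb_compress() writes it. */
    b0 = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | (offset >> NBBY);
    b1 = (uint8_t)offset;

    /* Unpack it the same way lzjb_decompress() reads it back. */
    assert(((b0 >> (NBBY - MATCH_BITS)) + MATCH_MIN) == mlen);
    assert((((b0 << NBBY) | b1) & OFFSET_MASK) == offset);
    return (0);
}
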
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
new file mode 100644
index 000000000000..0dba134cef9b
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -0,0 +1,1023 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/space_map.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+uint64_t metaslab_aliquot = 512ULL << 10;
+
+/*
+ * ==========================================================================
+ * Metaslab classes
+ * ==========================================================================
+ */
+metaslab_class_t *
+metaslab_class_create(void)
+{
+ metaslab_class_t *mc;
+
+ mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+
+ mc->mc_rotor = NULL;
+
+ return (mc);
+}
+
+void
+metaslab_class_destroy(metaslab_class_t *mc)
+{
+ metaslab_group_t *mg;
+
+ while ((mg = mc->mc_rotor) != NULL) {
+ metaslab_class_remove(mc, mg);
+ metaslab_group_destroy(mg);
+ }
+
+ kmem_free(mc, sizeof (metaslab_class_t));
+}
+
+void
+metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
+{
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT(mg->mg_class == NULL);
+
+ if ((mgprev = mc->mc_rotor) == NULL) {
+ mg->mg_prev = mg;
+ mg->mg_next = mg;
+ } else {
+ mgnext = mgprev->mg_next;
+ mg->mg_prev = mgprev;
+ mg->mg_next = mgnext;
+ mgprev->mg_next = mg;
+ mgnext->mg_prev = mg;
+ }
+ mc->mc_rotor = mg;
+ mg->mg_class = mc;
+}
+
+void
+metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
+{
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT(mg->mg_class == mc);
+
+ mgprev = mg->mg_prev;
+ mgnext = mg->mg_next;
+
+ if (mg == mgnext) {
+ mc->mc_rotor = NULL;
+ } else {
+ mc->mc_rotor = mgnext;
+ mgprev->mg_next = mgnext;
+ mgnext->mg_prev = mgprev;
+ }
+
+ mg->mg_prev = NULL;
+ mg->mg_next = NULL;
+ mg->mg_class = NULL;
+}
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+static int
+metaslab_compare(const void *x1, const void *x2)
+{
+ const metaslab_t *m1 = x1;
+ const metaslab_t *m2 = x2;
+
+ if (m1->ms_weight < m2->ms_weight)
+ return (1);
+ if (m1->ms_weight > m2->ms_weight)
+ return (-1);
+
+ /*
+ * If the weights are identical, use the offset to force uniqueness.
+ */
+ if (m1->ms_map.sm_start < m2->ms_map.sm_start)
+ return (-1);
+ if (m1->ms_map.sm_start > m2->ms_map.sm_start)
+ return (1);
+
+ ASSERT3P(m1, ==, m2);
+
+ return (0);
+}
+
+metaslab_group_t *
+metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
+{
+ metaslab_group_t *mg;
+
+ mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
+ mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&mg->mg_metaslab_tree, metaslab_compare,
+ sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
+ mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
+ mg->mg_vd = vd;
+ metaslab_class_add(mc, mg);
+
+ return (mg);
+}
+
+void
+metaslab_group_destroy(metaslab_group_t *mg)
+{
+ avl_destroy(&mg->mg_metaslab_tree);
+ mutex_destroy(&mg->mg_lock);
+ kmem_free(mg, sizeof (metaslab_group_t));
+}
+
+static void
+metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
+{
+ mutex_enter(&mg->mg_lock);
+ ASSERT(msp->ms_group == NULL);
+ msp->ms_group = mg;
+ msp->ms_weight = 0;
+ avl_add(&mg->mg_metaslab_tree, msp);
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+ mutex_enter(&mg->mg_lock);
+ ASSERT(msp->ms_group == mg);
+ avl_remove(&mg->mg_metaslab_tree, msp);
+ msp->ms_group = NULL;
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+ /*
+ * Although in principle the weight can be any value, in
+ * practice we do not use values in the range [1, 510].
+ */
+ ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ mutex_enter(&mg->mg_lock);
+ ASSERT(msp->ms_group == mg);
+ avl_remove(&mg->mg_metaslab_tree, msp);
+ msp->ms_weight = weight;
+ avl_add(&mg->mg_metaslab_tree, msp);
+ mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static void
+metaslab_ff_load(space_map_t *sm)
+{
+ ASSERT(sm->sm_ppd == NULL);
+ sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+}
+
+static void
+metaslab_ff_unload(space_map_t *sm)
+{
+ kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+ sm->sm_ppd = NULL;
+}
+
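+/*
+ * metaslab_ff_alloc() keeps one cursor per alignment class in sm_ppd:
+ * 'size & -size' isolates the lowest set bit of the size (the largest
+ * power of two dividing it, i.e. the block's natural alignment), and
+ * highbit() of that value selects the cursor, so successive searches
+ * for blocks of the same alignment resume where the last one stopped
+ * instead of rescanning the map from the start.  For example,
+ * size = 0x6000 (24K) has size & -size == 0x2000, selecting cursor
+ * index highbit(0x2000) - 1 == 13.
+ */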
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t align = size & -size;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+ space_seg_t *ss, ssearch;
+ avl_index_t where;
+
+ ssearch.ss_start = *cursor;
+ ssearch.ss_end = *cursor + size;
+
+ ss = avl_find(t, &ssearch, &where);
+ if (ss == NULL)
+ ss = avl_nearest(t, where, AVL_AFTER);
+
+ while (ss != NULL) {
+ uint64_t offset = P2ROUNDUP(ss->ss_start, align);
+
+ if (offset + size <= ss->ss_end) {
+ *cursor = offset + size;
+ return (offset);
+ }
+ ss = AVL_NEXT(t, ss);
+ }
+
+ /*
+ * If we know we've searched the whole map (*cursor == 0), give up.
+ * Otherwise, reset the cursor to the beginning and try again.
+ */
+ if (*cursor == 0)
+ return (-1ULL);
+
+ *cursor = 0;
+ return (metaslab_ff_alloc(sm, size));
+}
+
+/* ARGSUSED */
+static void
+metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ /* No need to update cursor */
+}
+
+/* ARGSUSED */
+static void
+metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ /* No need to update cursor */
+}
+
+static space_map_ops_t metaslab_ff_ops = {
+ metaslab_ff_load,
+ metaslab_ff_unload,
+ metaslab_ff_alloc,
+ metaslab_ff_claim,
+ metaslab_ff_free
+};
+
+/*
+ * ==========================================================================
+ * Metaslabs
+ * ==========================================================================
+ */
+metaslab_t *
+metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
+ uint64_t start, uint64_t size, uint64_t txg)
+{
+ vdev_t *vd = mg->mg_vd;
+ metaslab_t *msp;
+
+ msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+ mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ msp->ms_smo_syncing = *smo;
+
+ /*
+ * We create the main space map here, but we don't create the
+ * allocmaps and freemaps until metaslab_sync_done(). This serves
+ * two purposes: it allows metaslab_sync_done() to detect the
+ * addition of new space; and for debugging, it ensures that we'd take
+ * a data fault on any attempt to use this metaslab before it's ready.
+ */
+ space_map_create(&msp->ms_map, start, size,
+ vd->vdev_ashift, &msp->ms_lock);
+
+ metaslab_group_add(mg, msp);
+
+ /*
+ * If we're opening an existing pool (txg == 0) or creating
+ * a new one (txg == TXG_INITIAL), all space is available now.
+ * If we're adding space to an existing pool, the new space
+ * does not become available until after this txg has synced.
+ */
+ if (txg <= TXG_INITIAL)
+ metaslab_sync_done(msp, 0);
+
+ if (txg != 0) {
+ /*
+ * The vdev is dirty, but the metaslab isn't -- it just needs
+ * to have metaslab_sync_done() invoked from vdev_sync_done().
+ * [We could just dirty the metaslab, but that would cause us
+ * to allocate a space map object for it, which is wasteful
+ * and would mess up the locality logic in metaslab_weight().]
+ */
+ ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
+ vdev_dirty(vd, 0, NULL, txg);
+ vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
+ }
+
+ return (msp);
+}
+
+void
+metaslab_fini(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ int t;
+
+ vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
+ -msp->ms_smo.smo_alloc);
+
+ metaslab_group_remove(mg, msp);
+
+ mutex_enter(&msp->ms_lock);
+
+ space_map_unload(&msp->ms_map);
+ space_map_destroy(&msp->ms_map);
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ space_map_destroy(&msp->ms_allocmap[t]);
+ space_map_destroy(&msp->ms_freemap[t]);
+ }
+
+ mutex_exit(&msp->ms_lock);
+ mutex_destroy(&msp->ms_lock);
+
+ kmem_free(msp, sizeof (metaslab_t));
+}
+
+#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
+#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
+#define METASLAB_ACTIVE_MASK \
+ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+#define METASLAB_SMO_BONUS_MULTIPLIER 2
+
+static uint64_t
+metaslab_weight(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ space_map_t *sm = &msp->ms_map;
+ space_map_obj_t *smo = &msp->ms_smo;
+ vdev_t *vd = mg->mg_vd;
+ uint64_t weight, space;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The baseline weight is the metaslab's free space.
+ */
+ space = sm->sm_size - smo->smo_alloc;
+ weight = space;
+
+ /*
+ * Modern disks have uniform bit density and constant angular velocity.
+ * Therefore, the outer recording zones are faster (higher bandwidth)
+ * than the inner zones by the ratio of outer to inner track diameter,
+ * which is typically around 2:1. We account for this by assigning
+ * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
+ * In effect, this means that we'll select the metaslab with the most
+ * free bandwidth rather than simply the one with the most free space.
+ */
+ weight = 2 * weight -
+ ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
+ ASSERT(weight >= space && weight <= 2 * space);
+
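+ /*
+ * Worked example of the bandwidth weighting above: with
+ * vdev_ms_count = 100 and space = 1G of free space, metaslab 0
+ * weighs 2G, metaslab 50 weighs 1.5G, and metaslab 99 weighs
+ * 1.01G, so the multiplier falls smoothly from 2x at the outer
+ * edge of the disk to just above 1x at the inner edge.
+ */
+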
+ /*
+ * For locality, assign higher weight to metaslabs we've used before.
+ */
+ if (smo->smo_object != 0)
+ weight *= METASLAB_SMO_BONUS_MULTIPLIER;
+ ASSERT(weight >= space &&
+ weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
+
+ /*
+ * If this metaslab is one we're actively using, adjust its weight to
+ * make it preferable to any inactive metaslab so we'll polish it off.
+ */
+ weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+
+ return (weight);
+}
+
+static int
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
+{
+ space_map_t *sm = &msp->ms_map;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
+ int error = space_map_load(sm, &metaslab_ff_ops,
+ SM_FREE, &msp->ms_smo,
+ msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
+ if (error) {
+ metaslab_group_sort(msp->ms_group, msp, 0);
+ return (error);
+ }
+ metaslab_group_sort(msp->ms_group, msp,
+ msp->ms_weight | activation_weight);
+ }
+ ASSERT(sm->sm_loaded);
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+
+ return (0);
+}
+
+static void
+metaslab_passivate(metaslab_t *msp, uint64_t size)
+{
+ /*
+ * If size < SPA_MINBLOCKSIZE, then we will not allocate from
+ * this metaslab again. In that case, it had better be empty,
+ * or we would be leaving space on the table.
+ */
+ ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
+ metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
+ ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
+}
+
+/*
+ * Write a metaslab to disk in the context of the specified transaction group.
+ */
+void
+metaslab_sync(metaslab_t *msp, uint64_t txg)
+{
+ vdev_t *vd = msp->ms_group->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
+ space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
+ space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+ space_map_t *sm = &msp->ms_map;
+ space_map_obj_t *smo = &msp->ms_smo_syncing;
+ dmu_buf_t *db;
+ dmu_tx_t *tx;
+ int t;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ /*
+ * The only state that can actually be changing concurrently with
+ * metaslab_sync() is the metaslab's ms_map. No other thread can
+ * be modifying this txg's allocmap, freemap, freed_map, or smo.
+ * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
+ * We drop it whenever we call into the DMU, because the DMU
+ * can call down to us (e.g. via zio_free()) at any time.
+ */
+ mutex_enter(&msp->ms_lock);
+
+ if (smo->smo_object == 0) {
+ ASSERT(smo->smo_objsize == 0);
+ ASSERT(smo->smo_alloc == 0);
+ mutex_exit(&msp->ms_lock);
+ smo->smo_object = dmu_object_alloc(mos,
+ DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
+ DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
+ ASSERT(smo->smo_object != 0);
+ dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
+ (sm->sm_start >> vd->vdev_ms_shift),
+ sizeof (uint64_t), &smo->smo_object, tx);
+ mutex_enter(&msp->ms_lock);
+ }
+
+ space_map_walk(freemap, space_map_add, freed_map);
+
+ if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
+ 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
+ /*
+ * The in-core space map representation is twice as compact
+ * as the on-disk one, so it's time to condense the latter
+ * by generating a pure allocmap from first principles.
+ *
+ * This metaslab is 100% allocated,
+ * minus the content of the in-core map (sm),
+ * minus what's been freed this txg (freed_map),
+ * minus allocations from txgs in the future
+ * (because they haven't been committed yet).
+ */
+ space_map_vacate(allocmap, NULL, NULL);
+ space_map_vacate(freemap, NULL, NULL);
+
+ space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
+
+ space_map_walk(sm, space_map_remove, allocmap);
+ space_map_walk(freed_map, space_map_remove, allocmap);
+
+ for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+ space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
+ space_map_remove, allocmap);
+
+ mutex_exit(&msp->ms_lock);
+ space_map_truncate(smo, mos, tx);
+ mutex_enter(&msp->ms_lock);
+ }
+
+ space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
+ space_map_sync(freemap, SM_FREE, smo, mos, tx);
+
+ mutex_exit(&msp->ms_lock);
+
+ VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(db->db_size, ==, sizeof (*smo));
+ bcopy(smo, db->db_data, db->db_size);
+ dmu_buf_rele(db, FTAG);
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Called after a transaction group has completely synced to mark
+ * all of the metaslab's free space as usable.
+ */
+void
+metaslab_sync_done(metaslab_t *msp, uint64_t txg)
+{
+ space_map_obj_t *smo = &msp->ms_smo;
+ space_map_obj_t *smosync = &msp->ms_smo_syncing;
+ space_map_t *sm = &msp->ms_map;
+ space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ int t;
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * If this metaslab is just becoming available, initialize its
+ * allocmaps and freemaps and add its capacity to the vdev.
+ */
+ if (freed_map->sm_size == 0) {
+ for (t = 0; t < TXG_SIZE; t++) {
+ space_map_create(&msp->ms_allocmap[t], sm->sm_start,
+ sm->sm_size, sm->sm_shift, sm->sm_lock);
+ space_map_create(&msp->ms_freemap[t], sm->sm_start,
+ sm->sm_size, sm->sm_shift, sm->sm_lock);
+ }
+ vdev_space_update(vd, sm->sm_size, 0);
+ }
+
+ vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc);
+
+ ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
+ ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
+
+ /*
+ * If there's a space_map_load() in progress, wait for it to complete
+ * so that we have a consistent view of the in-core space map.
+ * Then, add everything we freed in this txg to the map.
+ */
+ space_map_load_wait(sm);
+ space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);
+
+ *smo = *smosync;
+
+ /*
+ * If the map is loaded but no longer active, evict it as soon as all
+ * future allocations have synced. (If we unloaded it now and then
+ * reloaded it a moment later, the map wouldn't reflect those allocations.)
+ */
+ if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
+ int evictable = 1;
+
+ for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+ if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
+ evictable = 0;
+
+ if (evictable)
+ space_map_unload(sm);
+ }
+
+ metaslab_group_sort(mg, msp, metaslab_weight(msp));
+
+ mutex_exit(&msp->ms_lock);
+}
+
+static uint64_t
+metaslab_distance(metaslab_t *msp, dva_t *dva)
+{
+ uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
+ uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
+ uint64_t start = msp->ms_map.sm_start >> ms_shift;
+
+ if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
+ return (1ULL << 63);
+
+ if (offset < start)
+ return ((start - offset) << ms_shift);
+ if (offset > start)
+ return ((offset - start) << ms_shift);
+ return (0);
+}
+
+static uint64_t
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
+ uint64_t min_distance, dva_t *dva, int d)
+{
+ metaslab_t *msp = NULL;
+ uint64_t offset = -1ULL;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ uint64_t activation_weight;
+ uint64_t target_distance;
+ int i;
+
+ activation_weight = METASLAB_WEIGHT_PRIMARY;
+ for (i = 0; i < d; i++)
+ if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
+ activation_weight = METASLAB_WEIGHT_SECONDARY;
+
+ for (;;) {
+ mutex_enter(&mg->mg_lock);
+ for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
+ if (msp->ms_weight < size) {
+ mutex_exit(&mg->mg_lock);
+ return (-1ULL);
+ }
+
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY)
+ break;
+
+ target_distance = min_distance +
+ (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
+
+ for (i = 0; i < d; i++)
+ if (metaslab_distance(msp, &dva[i]) <
+ target_distance)
+ break;
+ if (i == d)
+ break;
+ }
+ mutex_exit(&mg->mg_lock);
+ if (msp == NULL)
+ return (-1ULL);
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Ensure that the metaslab we have selected is still
+ * capable of handling our request. It's possible that
+ * another thread may have changed the weight while we
+ * were blocked on the metaslab lock.
+ */
+ if (msp->ms_weight < size) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
+ activation_weight == METASLAB_WEIGHT_PRIMARY) {
+ metaslab_passivate(msp,
+ msp->ms_weight & ~METASLAB_ACTIVE_MASK);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ if (metaslab_activate(msp, activation_weight) != 0) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+ break;
+
+ metaslab_passivate(msp, size - 1);
+
+ mutex_exit(&msp->ms_lock);
+ }
+
+ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+ vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
+
+ space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+
+ mutex_exit(&msp->ms_lock);
+
+ return (offset);
+}
+
+/*
+ * Allocate a block for the specified i/o.
+ */
+static int
+metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
+ dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid)
+{
+ metaslab_group_t *mg, *rotor;
+ metaslab_class_t *mc;
+ vdev_t *vd;
+ int dshift = 3;
+ int all_zero;
+ uint64_t offset = -1ULL;
+ uint64_t asize;
+ uint64_t distance;
+
+ ASSERT(!DVA_IS_VALID(&dva[d]));
+
+ mc = spa_metaslab_class_select(spa);
+
+ /*
+ * Start at the rotor and loop through all mgs until we find something.
+ * Note that there's no locking on mc_rotor or mc_allocated because
+ * nothing actually breaks if we miss a few updates -- we just won't
+ * allocate quite as evenly. It all balances out over time.
+ *
+ * If we are doing ditto or log blocks, try to spread them across
+ * consecutive vdevs. If we're forced to reuse a vdev before we've
+ * allocated all of our ditto blocks, then try to spread them out on
+ * that vdev as much as possible. If it turns out to not be possible,
+ * gradually lower our standards until anything becomes acceptable.
+ * Also, allocating on consecutive vdevs (as opposed to random vdevs)
+ * gives us hope of containing our fault domains to something we're
+ * able to reason about. Otherwise, any two top-level vdev failures
+ * will guarantee the loss of data. With consecutive allocation,
+ * only two adjacent top-level vdev failures will result in data loss.
+ *
+ * If we are doing gang blocks (hintdva is non-NULL), try to keep
+ * ourselves on the same vdev as our gang block header. That
+ * way, we can hope for locality in vdev_cache, plus it makes our
+ * fault domains something tractable.
+ */
+ if (hintdva) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
+ if (hintdva_avoid)
+ mg = vd->vdev_mg->mg_next;
+ else
+ mg = vd->vdev_mg;
+ } else if (d != 0) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
+ mg = vd->vdev_mg->mg_next;
+ } else {
+ mg = mc->mc_rotor;
+ }
+ rotor = mg;
+
+top:
+ all_zero = B_TRUE;
+ do {
+ vd = mg->mg_vd;
+
+ distance = vd->vdev_asize >> dshift;
+ if (distance <= (1ULL << vd->vdev_ms_shift))
+ distance = 0;
+ else
+ all_zero = B_FALSE;
+
+ asize = vdev_psize_to_asize(vd, psize);
+ ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
+
+ offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+ if (offset != -1ULL) {
+ /*
+ * If we've just selected this metaslab group,
+ * figure out whether the corresponding vdev is
+ * over- or under-used relative to the pool,
+ * and set an allocation bias to even it out.
+ */
+ if (mc->mc_allocated == 0) {
+ vdev_stat_t *vs = &vd->vdev_stat;
+ uint64_t alloc, space;
+ int64_t vu, su;
+
+ alloc = spa_get_alloc(spa);
+ space = spa_get_space(spa);
+
+ /*
+ * Determine percent used in units of 0..1024.
+ * (This is just to avoid floating point.)
+ */
+ vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
+ su = (alloc << 10) / (space + 1);
+
+ /*
+ * Bias by at most +/- 25% of the aliquot.
+ */
+ mg->mg_bias = ((su - vu) *
+ (int64_t)mg->mg_aliquot) / (1024 * 4);
+ }
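+
+ /*
+ * Worked example: if this vdev is only 30% full (vu ~= 307) while
+ * the pool as a whole is 50% full (su ~= 512), the bias comes to
+ * (512 - 307) * aliquot / 4096, roughly +5% of the aliquot, so the
+ * under-used vdev absorbs slightly more than its usual share before
+ * the rotor moves on.
+ */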
+
+ if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
+ mg->mg_aliquot + mg->mg_bias) {
+ mc->mc_rotor = mg->mg_next;
+ mc->mc_allocated = 0;
+ }
+
+ DVA_SET_VDEV(&dva[d], vd->vdev_id);
+ DVA_SET_OFFSET(&dva[d], offset);
+ DVA_SET_GANG(&dva[d], 0);
+ DVA_SET_ASIZE(&dva[d], asize);
+
+ return (0);
+ }
+ mc->mc_rotor = mg->mg_next;
+ mc->mc_allocated = 0;
+ } while ((mg = mg->mg_next) != rotor);
+
+ if (!all_zero) {
+ dshift++;
+ ASSERT(dshift < 64);
+ goto top;
+ }
+
+ bzero(&dva[d], sizeof (dva_t));
+
+ return (ENOSPC);
+}
+
+/*
+ * Free the block represented by DVA in the context of the specified
+ * transaction group.
+ */
+static void
+metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd;
+ metaslab_t *msp;
+
+ ASSERT(DVA_IS_VALID(dva));
+
+ if (txg > spa_freeze_txg(spa))
+ return;
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
+ (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
+ cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
+ (u_longlong_t)vdev, (u_longlong_t)offset);
+ ASSERT(0);
+ return;
+ }
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ mutex_enter(&msp->ms_lock);
+
+ if (now) {
+ space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
+ offset, size);
+ space_map_free(&msp->ms_map, offset, size);
+ } else {
+ if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
+ space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
+
+ /*
+ * verify that this region is actually allocated in
+ * either a ms_allocmap or the ms_map
+ */
+ if (msp->ms_map.sm_loaded) {
+ boolean_t allocd = B_FALSE;
+ int i;
+
+ if (!space_map_contains(&msp->ms_map, offset, size)) {
+ allocd = B_TRUE;
+ } else {
+ for (i = 0; i < TXG_CONCURRENT_STATES; i++) {
+ space_map_t *sm = &msp->ms_allocmap
+ [(txg - i) & TXG_MASK];
+ if (space_map_contains(sm,
+ offset, size)) {
+ allocd = B_TRUE;
+ break;
+ }
+ }
+ }
+
+ if (!allocd) {
+ zfs_panic_recover("freeing free segment "
+ "(vdev=%llu offset=%llx size=%llx)",
+ (longlong_t)vdev, (longlong_t)offset,
+ (longlong_t)size);
+ }
+ }
+
+ }
+
+ mutex_exit(&msp->ms_lock);
+}
+
+/*
+ * Intent log support: upon opening the pool after a crash, notify the SPA
+ * of blocks that the intent log has allocated for immediate write, but
+ * which are still considered free by the SPA because the last transaction
+ * group didn't commit yet.
+ */
+static int
+metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd;
+ metaslab_t *msp;
+ int error;
+
+ ASSERT(DVA_IS_VALID(dva));
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
+ (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
+ return (ENXIO);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ mutex_enter(&msp->ms_lock);
+
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+ if (error) {
+ mutex_exit(&msp->ms_lock);
+ return (error);
+ }
+
+ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
+
+ space_map_claim(&msp->ms_map, offset, size);
+ space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+
+ mutex_exit(&msp->ms_lock);
+
+ return (0);
+}
+
+int
+metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas,
+ uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid)
+{
+ dva_t *dva = bp->blk_dva;
+ dva_t *hintdva = hintbp->blk_dva;
+ int d;
+ int error = 0;
+
+ ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
+ ASSERT(BP_GET_NDVAS(bp) == 0);
+ ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
+
+ for (d = 0; d < ndvas; d++) {
+ error = metaslab_alloc_dva(spa, psize, dva, d, hintdva,
+ txg, hintbp_avoid);
+ if (error) {
+ for (d--; d >= 0; d--) {
+ metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
+ bzero(&dva[d], sizeof (dva_t));
+ }
+ return (error);
+ }
+ }
+ ASSERT(error == 0);
+ ASSERT(BP_GET_NDVAS(bp) == ndvas);
+
+ return (0);
+}
+
+void
+metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+ int d;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ for (d = 0; d < ndvas; d++)
+ metaslab_free_dva(spa, &dva[d], txg, now);
+}
+
+int
+metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+ int d, error;
+ int last_error = 0;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ for (d = 0; d < ndvas; d++)
+ if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
+ last_error = error;
+
+ return (last_error);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c
new file mode 100644
index 000000000000..411ed46e13d7
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#if defined(DEBUG) || !defined(_KERNEL)
+
+#ifdef _KERNEL
+int reference_tracking_enable = FALSE; /* runs out of memory too easily */
+#else
+int reference_tracking_enable = TRUE;
+#endif
+int reference_history = 4; /* tunable */
+
+static kmem_cache_t *reference_cache;
+static kmem_cache_t *reference_history_cache;
+
+void
+refcount_init(void)
+{
+ reference_cache = kmem_cache_create("reference_cache",
+ sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ reference_history_cache = kmem_cache_create("reference_history_cache",
+ sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+refcount_fini(void)
+{
+ kmem_cache_destroy(reference_cache);
+ kmem_cache_destroy(reference_history_cache);
+}
+
+void
+refcount_create(refcount_t *rc)
+{
+ list_create(&rc->rc_list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&rc->rc_removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+refcount_destroy_many(refcount_t *rc, uint64_t number)
+{
+ reference_t *ref;
+
+ ASSERT(rc->rc_count == number);
+ while (ref = list_head(&rc->rc_list)) {
+ list_remove(&rc->rc_list, ref);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_list);
+
+ while (ref = list_head(&rc->rc_removed)) {
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache, ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_removed);
+ mutex_destroy(&rc->rc_mtx);
+}
+
+void
+refcount_destroy(refcount_t *rc)
+{
+ refcount_destroy_many(rc, 0);
+}
+
+int
+refcount_is_zero(refcount_t *rc)
+{
+ ASSERT(rc->rc_count >= 0);
+ return (rc->rc_count == 0);
+}
+
+int64_t
+refcount_count(refcount_t *rc)
+{
+ ASSERT(rc->rc_count >= 0);
+ return (rc->rc_count);
+}
+
+int64_t
+refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
+{
+ reference_t *ref;
+ int64_t count;
+
+ if (reference_tracking_enable) {
+ ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
+ ref->ref_holder = holder;
+ ref->ref_number = number;
+ }
+ mutex_enter(&rc->rc_mtx);
+ ASSERT(rc->rc_count >= 0);
+ if (reference_tracking_enable)
+ list_insert_head(&rc->rc_list, ref);
+ rc->rc_count += number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+
+ return (count);
+}
+
+int64_t
+refcount_add(refcount_t *rc, void *holder)
+{
+ return (refcount_add_many(rc, 1, holder));
+}
+
+int64_t
+refcount_remove_many(refcount_t *rc, uint64_t number, void *holder)
+{
+ reference_t *ref;
+ int64_t count;
+
+ mutex_enter(&rc->rc_mtx);
+ ASSERT(rc->rc_count >= number);
+
+ if (!reference_tracking_enable) {
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder && ref->ref_number == number) {
+ list_remove(&rc->rc_list, ref);
+ if (reference_history > 0) {
+ ref->ref_removed =
+ kmem_cache_alloc(reference_history_cache,
+ KM_SLEEP);
+ list_insert_head(&rc->rc_removed, ref);
+ rc->rc_removed_count++;
+ if (rc->rc_removed_count >= reference_history) {
+ ref = list_tail(&rc->rc_removed);
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache,
+ ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ rc->rc_removed_count--;
+ }
+ } else {
+ kmem_cache_free(reference_cache, ref);
+ }
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+ }
+ panic("No such hold %p on refcount %llx", holder,
+ (u_longlong_t)(uintptr_t)rc);
+ return (-1);
+}
+
+int64_t
+refcount_remove(refcount_t *rc, void *holder)
+{
+ return (refcount_remove_many(rc, 1, holder));
+}
+
+#endif
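
When reference tracking is enabled, every hold is tagged with an opaque holder
pointer and must be released with the same pointer, which is what lets
refcount_remove_many() panic on a mismatched release. A small self-contained
model of that idea (simplified, userland-only, with hypothetical names; the
kernel version above adds caching, removal history, and locking):

#include <assert.h>
#include <stdlib.h>

struct hold { void *holder; struct hold *next; };
struct trc { long count; struct hold *holds; };

static void
trc_add(struct trc *rc, void *holder)
{
    struct hold *h = malloc(sizeof (*h));

    h->holder = holder;
    h->next = rc->holds;
    rc->holds = h;
    rc->count++;
}

static void
trc_remove(struct trc *rc, void *holder)
{
    struct hold **hp;

    for (hp = &rc->holds; *hp != NULL; hp = &(*hp)->next) {
        if ((*hp)->holder == holder) {
            struct hold *h = *hp;

            *hp = h->next;
            free(h);
            rc->count--;
            return;
        }
    }
    abort();    /* no such hold: the condition the kernel panics on */
}

int
main(void)
{
    struct trc rc = { 0, NULL };
    int tag_a, tag_b;

    trc_add(&rc, &tag_a);
    trc_add(&rc, &tag_b);
    trc_remove(&rc, &tag_a);    /* must match the tag used to add */
    assert(rc.count == 1);
    trc_remove(&rc, &tag_b);
    assert(rc.count == 0 && rc.holds == NULL);
    return (0);
}
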
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c
new file mode 100644
index 000000000000..ce5c26131af5
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c
@@ -0,0 +1,131 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * SHA-256 checksum, as specified in FIPS 180-2, available at:
+ * http://csrc.nist.gov/cryptval
+ *
+ * This is a very compact implementation of SHA-256.
+ * It is designed to be simple and portable, not to be fast.
+ */
+
+/*
+ * The literal definitions according to FIPS 180-2 would be:
+ *
+ * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
+ * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+ *
+ * We use logical equivalents which require one less op.
+ */
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))
+#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s)))
+#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
+#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
+#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
+#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
+
+static const uint32_t SHA256_K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+static void
+SHA256Transform(uint32_t *H, const uint8_t *cp)
+{
+ uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
+
+ for (t = 0; t < 16; t++, cp += 4)
+ W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3];
+
+ for (t = 16; t < 64; t++)
+ W[t] = sigma1(W[t - 2]) + W[t - 7] +
+ sigma0(W[t - 15]) + W[t - 16];
+
+ a = H[0]; b = H[1]; c = H[2]; d = H[3];
+ e = H[4]; f = H[5]; g = H[6]; h = H[7];
+
+ for (t = 0; t < 64; t++) {
+ T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
+ T2 = SIGMA0(a) + Maj(a, b, c);
+ h = g; g = f; f = e; e = d + T1;
+ d = c; c = b; b = a; a = T1 + T2;
+ }
+
+ H[0] += a; H[1] += b; H[2] += c; H[3] += d;
+ H[4] += e; H[5] += f; H[6] += g; H[7] += h;
+}
+
+void
+zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+ 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+ uint8_t pad[128];
+ int padsize = size & 63;
+ int i;
+
+ for (i = 0; i < size - padsize; i += 64)
+ SHA256Transform(H, (uint8_t *)buf + i);
+
+ for (i = 0; i < padsize; i++)
+ pad[i] = ((uint8_t *)buf)[i];
+
+ for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
+ pad[padsize] = 0;
+
+ for (i = 0; i < 8; i++)
+ pad[padsize++] = (size << 3) >> (56 - 8 * i);
+
+ for (i = 0; i < padsize; i += 64)
+ SHA256Transform(H, pad + i);
+
+ ZIO_SET_CHECKSUM(zcp,
+ (uint64_t)H[0] << 32 | H[1],
+ (uint64_t)H[2] << 32 | H[3],
+ (uint64_t)H[4] << 32 | H[5],
+ (uint64_t)H[6] << 32 | H[7]);
+}
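
The Ch and Maj forms above are the usual one-op-shorter rewrites of the
FIPS 180-2 definitions; since both are purely bitwise, checking every
single-bit input is enough to establish the equivalence. A quick exhaustive
check (macros repeated here so the sketch stands alone):

#include <assert.h>
#include <stdint.h>

#define Ch(x, y, z)  ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))

int
main(void)
{
    uint32_t x, y, z;

    /* Bitwise functions: verifying all single-bit inputs verifies all. */
    for (x = 0; x <= 1; x++)
        for (y = 0; y <= 1; y++)
            for (z = 0; z <= 1; z++) {
                assert(Ch(x, y, z) ==
                    (((x) & (y)) ^ ((~(x)) & (z))));
                assert(Maj(x, y, z) ==
                    (((x) & (y)) | ((x) & (z)) | ((y) & (z))));
            }
    return (0);
}
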
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c
new file mode 100644
index 000000000000..c218f72de36f
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -0,0 +1,3265 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file contains all the routines used when modifying on-disk SPA state.
+ * This includes opening, importing, destroying, and exporting a pool, as
+ * well as syncing it.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/fs/zfs.h>
+#include <sys/callb.h>
+
+int zio_taskq_threads = 8;
+
+/*
+ * ==========================================================================
+ * SPA state manipulation (open/create/destroy/import/export)
+ * ==========================================================================
+ */
+
+static int
+spa_error_entry_compare(const void *a, const void *b)
+{
+ spa_error_entry_t *sa = (spa_error_entry_t *)a;
+ spa_error_entry_t *sb = (spa_error_entry_t *)b;
+ int ret;
+
+ ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
+ sizeof (zbookmark_t));
+
+ if (ret < 0)
+ return (-1);
+ else if (ret > 0)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Utility function which retrieves copies of the current logs and
+ * re-initializes them in the process.
+ */
+void
+spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
+{
+ ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
+
+ bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
+ bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+}
+
+/*
+ * Activate an uninitialized pool.
+ */
+static void
+spa_activate(spa_t *spa)
+{
+ int t;
+
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+ spa->spa_state = POOL_STATE_ACTIVE;
+
+ spa->spa_normal_class = metaslab_class_create();
+
+ for (t = 0; t < ZIO_TYPES; t++) {
+ spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
+ zio_taskq_threads, maxclsyspri, 50, INT_MAX,
+ TASKQ_PREPOPULATE);
+ spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
+ zio_taskq_threads, maxclsyspri, 50, INT_MAX,
+ TASKQ_PREPOPULATE);
+ }
+
+ rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
+
+ mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&spa->spa_dirty_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_dirty_node));
+
+ txg_list_create(&spa->spa_vdev_txg_list,
+ offsetof(struct vdev, vdev_txg_node));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+}
+
+/*
+ * Opposite of spa_activate().
+ */
+static void
+spa_deactivate(spa_t *spa)
+{
+ int t;
+
+ ASSERT(spa->spa_sync_on == B_FALSE);
+ ASSERT(spa->spa_dsl_pool == NULL);
+ ASSERT(spa->spa_root_vdev == NULL);
+
+ ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
+
+ txg_list_destroy(&spa->spa_vdev_txg_list);
+
+ list_destroy(&spa->spa_dirty_list);
+
+ for (t = 0; t < ZIO_TYPES; t++) {
+ taskq_destroy(spa->spa_zio_issue_taskq[t]);
+ taskq_destroy(spa->spa_zio_intr_taskq[t]);
+ spa->spa_zio_issue_taskq[t] = NULL;
+ spa->spa_zio_intr_taskq[t] = NULL;
+ }
+
+ metaslab_class_destroy(spa->spa_normal_class);
+ spa->spa_normal_class = NULL;
+
+ /*
+ * If this was part of an import or the open otherwise failed, we may
+ * still have errors left in the queues. Empty them just in case.
+ */
+ spa_errlog_drain(spa);
+
+ avl_destroy(&spa->spa_errlist_scrub);
+ avl_destroy(&spa->spa_errlist_last);
+
+ rw_destroy(&spa->spa_traverse_lock);
+ mutex_destroy(&spa->spa_uberblock_lock);
+ mutex_destroy(&spa->spa_errlog_lock);
+ mutex_destroy(&spa->spa_errlist_lock);
+ mutex_destroy(&spa->spa_config_lock.scl_lock);
+ cv_destroy(&spa->spa_config_lock.scl_cv);
+ mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
+ mutex_destroy(&spa->spa_history_lock);
+ mutex_destroy(&spa->spa_props_lock);
+
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+}
+
+/*
+ * Verify a pool configuration and construct the vdev tree. This creates all
+ * the necessary vdevs in the specified layout, with each vdev in the CLOSED
+ * state, prepping the pool before open/creation/import.
+ * All vdev validation is done by the vdev_alloc() routine.
+ */
+static int
+spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
+ uint_t id, int atype)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ int error;
+
+ if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
+ return (error);
+
+ if ((*vdp)->vdev_ops->vdev_op_leaf)
+ return (0);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+ vdev_free(*vdp);
+ *vdp = NULL;
+ return (EINVAL);
+ }
+
+ for (c = 0; c < children; c++) {
+ vdev_t *vd;
+ if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
+ atype)) != 0) {
+ vdev_free(*vdp);
+ *vdp = NULL;
+ return (error);
+ }
+ }
+
+ ASSERT(*vdp != NULL);
+
+ return (0);
+}
+
+/*
+ * Opposite of spa_load().
+ */
+static void
+spa_unload(spa_t *spa)
+{
+ int i;
+
+ /*
+ * Stop async tasks.
+ */
+ spa_async_suspend(spa);
+
+ /*
+ * Stop syncing.
+ */
+ if (spa->spa_sync_on) {
+ txg_sync_stop(spa->spa_dsl_pool);
+ spa->spa_sync_on = B_FALSE;
+ }
+
+ /*
+ * Wait for any outstanding prefetch I/O to complete.
+ */
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * Close the dsl pool.
+ */
+ if (spa->spa_dsl_pool) {
+ dsl_pool_close(spa->spa_dsl_pool);
+ spa->spa_dsl_pool = NULL;
+ }
+
+ /*
+ * Close all vdevs.
+ */
+ if (spa->spa_root_vdev)
+ vdev_free(spa->spa_root_vdev);
+ ASSERT(spa->spa_root_vdev == NULL);
+
+ for (i = 0; i < spa->spa_nspares; i++)
+ vdev_free(spa->spa_spares[i]);
+ if (spa->spa_spares) {
+ kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
+ spa->spa_spares = NULL;
+ }
+ if (spa->spa_sparelist) {
+ nvlist_free(spa->spa_sparelist);
+ spa->spa_sparelist = NULL;
+ }
+
+ spa->spa_async_suspended = 0;
+}
+
+/*
+ * Load (or re-load) the current list of vdevs describing the active spares for
+ * this pool. When this is called, we have some form of basic information in
+ * 'spa_sparelist'. We parse this into vdevs, try to open them, and then
+ * re-generate a more complete list including status information.
+ */
+static void
+spa_load_spares(spa_t *spa)
+{
+ nvlist_t **spares;
+ uint_t nspares;
+ int i;
+ vdev_t *vd, *tvd;
+
+ /*
+ * First, close and free any existing spare vdevs.
+ */
+ for (i = 0; i < spa->spa_nspares; i++) {
+ vd = spa->spa_spares[i];
+
+ /* Undo the call to spa_spare_add() below */
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
+ tvd->vdev_isspare)
+ spa_spare_remove(tvd);
+ vdev_close(vd);
+ vdev_free(vd);
+ }
+
+ if (spa->spa_spares)
+ kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
+
+ if (spa->spa_sparelist == NULL)
+ nspares = 0;
+ else
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+
+ spa->spa_nspares = (int)nspares;
+ spa->spa_spares = NULL;
+
+ if (nspares == 0)
+ return;
+
+ /*
+ * Construct the array of vdevs, opening them to get status in the
+ * process. For each spare, there are potentially two different vdev_t
+ * structures associated with it: one in the list of spares (used only
+ * for basic validation purposes) and one in the active vdev
+ * configuration (if it's spared in). During this phase we open and
+ * validate each vdev on the spare list. If the vdev also exists in the
+ * active configuration, then we also mark this vdev as an active spare.
+ */
+ spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < spa->spa_nspares; i++) {
+ VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
+ VDEV_ALLOC_SPARE) == 0);
+ ASSERT(vd != NULL);
+
+ spa->spa_spares[i] = vd;
+
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
+ if (!tvd->vdev_isspare)
+ spa_spare_add(tvd);
+
+ /*
+ * We only mark the spare active if we were successfully
+ * able to load the vdev. Otherwise, importing a pool
+ * with a bad active spare would result in strange
+ * behavior, because multiple pools would think the spare
+ * is actively in use.
+ *
+ * There is a vulnerability here to an equally bizarre
+ * circumstance, where a dead active spare is later
+ * brought back to life (onlined or otherwise). Given
+ * the rarity of this scenario, and the extra complexity
+ * it adds, we ignore the possibility.
+ */
+ if (!vdev_is_dead(tvd))
+ spa_spare_activate(tvd);
+ }
+
+ if (vdev_open(vd) != 0)
+ continue;
+
+ vd->vdev_top = vd;
+ (void) vdev_validate_spare(vd);
+ }
+
+ /*
+ * Recompute the stashed list of spares, with status information
+ * this time.
+ */
+ VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < spa->spa_nspares; i++)
+ spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
+ B_TRUE, B_TRUE);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
+ spares, spa->spa_nspares) == 0);
+ for (i = 0; i < spa->spa_nspares; i++)
+ nvlist_free(spares[i]);
+ kmem_free(spares, spa->spa_nspares * sizeof (void *));
+}
+
+static int
+load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
+{
+ dmu_buf_t *db;
+ char *packed = NULL;
+ size_t nvsize = 0;
+ int error;
+ *value = NULL;
+
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
+ nvsize = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db, FTAG);
+
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+ error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
+ if (error == 0)
+ error = nvlist_unpack(packed, nvsize, value, 0);
+ kmem_free(packed, nvsize);
+
+ return (error);
+}
+
+/*
+ * Load an existing storage pool, using the pool's builtin spa_config as a
+ * source of configuration information.
+ */
+static int
+spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
+{
+ int error = 0;
+ nvlist_t *nvroot = NULL;
+ vdev_t *rvd;
+ uberblock_t *ub = &spa->spa_uberblock;
+ uint64_t config_cache_txg = spa->spa_config_txg;
+ uint64_t pool_guid;
+ uint64_t version;
+ zio_t *zio;
+
+ spa->spa_load_state = state;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
+ nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * Versioning wasn't explicitly added to the label until later, so if
+ * it's not present treat it as the initial version.
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
+ version = ZFS_VERSION_INITIAL;
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &spa->spa_config_txg);
+
+ if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0)) {
+ error = EEXIST;
+ goto out;
+ }
+
+ spa->spa_load_guid = pool_guid;
+
+ /*
+ * Parse the configuration into a vdev tree. We explicitly set the
+ * value that will be returned by spa_version() since parsing the
+ * configuration requires knowing the version number.
+ */
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa->spa_ubsync.ub_version = version;
+ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
+ spa_config_exit(spa, FTAG);
+
+ if (error != 0)
+ goto out;
+
+ ASSERT(spa->spa_root_vdev == rvd);
+ ASSERT(spa_guid(spa) == pool_guid);
+
+ /*
+ * Try to open all vdevs, loading each label in the process.
+ */
+ if (vdev_open(rvd) != 0) {
+ error = ENXIO;
+ goto out;
+ }
+
+ /*
+ * Validate the labels for all leaf vdevs. We need to grab the config
+ * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
+ * flag.
+ */
+ spa_config_enter(spa, RW_READER, FTAG);
+ error = vdev_validate(rvd);
+ spa_config_exit(spa, FTAG);
+
+ if (error != 0) {
+ error = EBADF;
+ goto out;
+ }
+
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+ error = ENXIO;
+ goto out;
+ }
+
+ /*
+ * Find the best uberblock.
+ */
+ bzero(ub, sizeof (uberblock_t));
+
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+ vdev_uberblock_load(zio, rvd, ub);
+ error = zio_wait(zio);
+
+ /*
+ * If we weren't able to find a single valid uberblock, return failure.
+ */
+ if (ub->ub_txg == 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = ENXIO;
+ goto out;
+ }
+
+ /*
+ * If the pool is newer than the code, we can't open it.
+ */
+ if (ub->ub_version > ZFS_VERSION) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_VERSION_NEWER);
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /*
+ * If the vdev guid sum doesn't match the uberblock, we have an
+ * incomplete configuration.
+ */
+ if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_GUID_SUM);
+ error = ENXIO;
+ goto out;
+ }
+
+ /*
+ * Initialize internal SPA structures.
+ */
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
+ error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+ if (error) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ goto out;
+ }
+ spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+
+ if (zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+ sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ if (!mosconfig) {
+ nvlist_t *newconfig;
+
+ if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ spa_config_set(spa, newconfig);
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_activate(spa);
+
+ return (spa_load(spa, newconfig, state, B_TRUE));
+ }
+
+ if (zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+ sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ /*
+ * Load the bit that tells us to use the new accounting function
+ * (raid-z deflation). If we have an older pool, this will not
+ * be present.
+ */
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ /*
+ * Load the persistent error log. If we have an older pool, this will
+ * not be present.
+ */
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
+ sizeof (uint64_t), 1, &spa->spa_errlog_last);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
+ sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ /*
+ * Load the history object. If we have an older pool, this
+ * will not be present.
+ */
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
+ sizeof (uint64_t), 1, &spa->spa_history);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ /*
+ * Load any hot spares for this pool.
+ */
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+ if (error == 0) {
+ ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
+ if (load_nvlist(spa, spa->spa_spares_object,
+ &spa->spa_sparelist) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_load_spares(spa);
+ spa_config_exit(spa, FTAG);
+ }
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
+
+ if (error && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ if (error == 0) {
+ (void) zap_lookup(spa->spa_meta_objset,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZFS_PROP_BOOTFS),
+ sizeof (uint64_t), 1, &spa->spa_bootfs);
+ }
+
+ /*
+ * Load the vdev state for all toplevel vdevs.
+ */
+ vdev_load(rvd);
+
+ /*
+ * Propagate the leaf DTLs we just loaded all the way up the tree.
+ */
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * Check the state of the root vdev. If it can't be opened, it
+ * indicates one or more toplevel vdevs are faulted.
+ */
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+ error = ENXIO;
+ goto out;
+ }
+
+ if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
+ dmu_tx_t *tx;
+ int need_update = B_FALSE;
+ int c;
+
+ /*
+ * Claim log blocks that haven't been committed yet.
+ * This must all happen in a single txg.
+ */
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa),
+ spa_first_txg(spa));
+ (void) dmu_objset_find(spa->spa_name,
+ zil_claim, tx, DS_FIND_CHILDREN);
+ dmu_tx_commit(tx);
+
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+
+ /*
+ * Wait for all claims to sync.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * If the config cache is stale, or we have uninitialized
+ * metaslabs (see spa_vdev_add()), then update the config.
+ */
+ if (config_cache_txg != spa->spa_config_txg ||
+ state == SPA_LOAD_IMPORT)
+ need_update = B_TRUE;
+
+ for (c = 0; c < rvd->vdev_children; c++)
+ if (rvd->vdev_child[c]->vdev_ms_array == 0)
+ need_update = B_TRUE;
+
+ /*
+		 * Update the config cache asynchronously in case we're the
+ * root pool, in which case the config cache isn't writable yet.
+ */
+ if (need_update)
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ }
+
+ error = 0;
+out:
+ if (error && error != EBADF)
+ zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
+ spa->spa_load_state = SPA_LOAD_NONE;
+ spa->spa_ena = 0;
+
+ return (error);
+}
+
+/*
+ * Pool Open/Import
+ *
+ * The import case is identical to an open except that the configuration is sent
+ * down from userland, instead of being grabbed from the configuration cache. For
+ * the case of an open, the pool configuration will exist in the
+ * POOL_STATE_UNINITIALIZED state.
+ *
+ * The stats information (gen/count/ustats) is used to gather vdev statistics at
+ * the same time we open the pool, without having to keep around the spa_t in
+ * some ambiguous state.
+ */
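+/*
+ * A minimal usage sketch of the open/close contract (hypothetical caller,
+ * shown for illustration only; the pool name is made up):
+ *
+ *	spa_t *spa;
+ *
+ *	if (spa_open("tank", &spa, FTAG) == 0) {
+ *		... operate on the pool ...
+ *		spa_close(spa, FTAG);
+ *	}
+ */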
+static int
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
+{
+ spa_t *spa;
+ int error;
+ int loaded = B_FALSE;
+ int locked = B_FALSE;
+
+ *spapp = NULL;
+
+ /*
+ * As disgusting as this is, we need to support recursive calls to this
+ * function because dsl_dir_open() is called during spa_load(), and ends
+ * up calling spa_open() again. The real fix is to figure out how to
+ * avoid dsl_dir_open() calling this in the first place.
+ */
+ if (mutex_owner(&spa_namespace_lock) != curthread) {
+ mutex_enter(&spa_namespace_lock);
+ locked = B_TRUE;
+ }
+
+ if ((spa = spa_lookup(pool)) == NULL) {
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+
+ spa_activate(spa);
+
+ error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
+
+ if (error == EBADF) {
+ /*
+			 * If vdev_validate() returns failure (indicated by
+			 * EBADF), it means one of the vdevs indicates that the
+			 * pool has been exported or destroyed. If this is the
+			 * case, the config cache is out of sync and we should
+			 * remove the pool from the namespace.
+ */
+ zfs_post_ok(spa, NULL);
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ spa_config_sync();
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+
+ if (error) {
+ /*
+ * We can't open the pool, but we still have useful
+ * information: the state of each vdev after the
+ * attempted vdev_open(). Return this to the user.
+ */
+ if (config != NULL && spa->spa_root_vdev != NULL) {
+ spa_config_enter(spa, RW_READER, FTAG);
+ *config = spa_config_generate(spa, NULL, -1ULL,
+ B_TRUE);
+ spa_config_exit(spa, FTAG);
+ }
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa->spa_last_open_failed = B_TRUE;
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ *spapp = NULL;
+ return (error);
+ } else {
+ zfs_post_ok(spa, NULL);
+ spa->spa_last_open_failed = B_FALSE;
+ }
+
+ loaded = B_TRUE;
+ }
+
+ spa_open_ref(spa, tag);
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+
+ *spapp = spa;
+
+ if (config != NULL) {
+ spa_config_enter(spa, RW_READER, FTAG);
+ *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ spa_config_exit(spa, FTAG);
+ }
+
+ /*
+ * If we just loaded the pool, resilver anything that's out of date.
+ */
+ if (loaded && (spa_mode & FWRITE))
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+int
+spa_open(const char *name, spa_t **spapp, void *tag)
+{
+ return (spa_open_common(name, spapp, tag, NULL));
+}
+
+/*
+ * Lookup the given spa_t, incrementing the inject count in the process,
+ * preventing it from being exported or destroyed.
+ */
+spa_t *
+spa_inject_addref(char *name)
+{
+ spa_t *spa;
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(name)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (NULL);
+ }
+ spa->spa_inject_ref++;
+ mutex_exit(&spa_namespace_lock);
+
+ return (spa);
+}
+
+void
+spa_inject_delref(spa_t *spa)
+{
+ mutex_enter(&spa_namespace_lock);
+ spa->spa_inject_ref--;
+ mutex_exit(&spa_namespace_lock);
+}
+
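+/*
+ * Add the list of hot spares to the given pool config nvlist, marking any
+ * spare that is currently in active use as VDEV_AUX_SPARED.
+ */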
+static void
+spa_add_spares(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t **spares;
+ uint_t i, nspares;
+ nvlist_t *nvroot;
+ uint64_t guid;
+ vdev_stat_t *vs;
+ uint_t vsc;
+ uint64_t pool;
+
+ if (spa->spa_nspares == 0)
+ return;
+
+ VERIFY(nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+ if (nspares != 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+
+ /*
+ * Go through and find any spares which have since been
+ * repurposed as an active spare. If this is the case, update
+ * their status appropriately.
+ */
+ for (i = 0; i < nspares; i++) {
+ VERIFY(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &guid) == 0);
+ if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
+ VERIFY(nvlist_lookup_uint64_array(
+ spares[i], ZPOOL_CONFIG_STATS,
+ (uint64_t **)&vs, &vsc) == 0);
+ vs->vs_state = VDEV_STATE_CANT_OPEN;
+ vs->vs_aux = VDEV_AUX_SPARED;
+ }
+ }
+ }
+}
+
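+/*
+ * Generate the pool's configuration for userland consumption, including the
+ * error count, hot spare list, and alternate root (even for faulted pools).
+ */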
+int
+spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
+{
+ int error;
+ spa_t *spa;
+
+ *config = NULL;
+ error = spa_open_common(name, &spa, FTAG, config);
+
+ if (spa && *config != NULL) {
+ VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
+ spa_get_errlog_size(spa)) == 0);
+
+ spa_add_spares(spa, *config);
+ }
+
+ /*
+ * We want to get the alternate root even for faulted pools, so we cheat
+ * and call spa_lookup() directly.
+ */
+ if (altroot) {
+ if (spa == NULL) {
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(name);
+ if (spa)
+ spa_altroot(spa, altroot, buflen);
+ else
+ altroot[0] = '\0';
+ spa = NULL;
+ mutex_exit(&spa_namespace_lock);
+ } else {
+ spa_altroot(spa, altroot, buflen);
+ }
+ }
+
+ if (spa != NULL)
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+/*
+ * Validate that the 'spares' array is well formed. We must have an array of
+ * nvlists, each of which describes a valid leaf vdev. If this is an import (mode
+ * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long
+ * as they are well-formed.
+ */
+static int
+spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
+{
+ nvlist_t **spares;
+ uint_t i, nspares;
+ vdev_t *vd;
+ int error;
+
+ /*
+ * It's acceptable to have no spares specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) != 0)
+ return (0);
+
+ if (nspares == 0)
+ return (EINVAL);
+
+ /*
+ * Make sure the pool is formatted with a version that supports hot
+ * spares.
+ */
+ if (spa_version(spa) < ZFS_VERSION_SPARES)
+ return (ENOTSUP);
+
+ /*
+ * Set the pending spare list so we correctly handle device in-use
+ * checking.
+ */
+ spa->spa_pending_spares = spares;
+ spa->spa_pending_nspares = nspares;
+
+ for (i = 0; i < nspares; i++) {
+ if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
+ mode)) != 0)
+ goto out;
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ vdev_free(vd);
+ error = EINVAL;
+ goto out;
+ }
+
+ vd->vdev_top = vd;
+
+ if ((error = vdev_open(vd)) == 0 &&
+ (error = vdev_label_init(vd, crtxg,
+ VDEV_LABEL_SPARE)) == 0) {
+ VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ }
+
+ vdev_free(vd);
+
+ if (error && mode != VDEV_ALLOC_SPARE)
+ goto out;
+ else
+ error = 0;
+ }
+
+out:
+ spa->spa_pending_spares = NULL;
+ spa->spa_pending_nspares = 0;
+ return (error);
+}
+
+/*
+ * Pool Creation
+ */
+int
+spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
+{
+ spa_t *spa;
+ vdev_t *rvd;
+ dsl_pool_t *dp;
+ dmu_tx_t *tx;
+ int c, error = 0;
+ uint64_t txg = TXG_INITIAL;
+ nvlist_t **spares;
+ uint_t nspares;
+
+ /*
+ * If this pool already exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (EEXIST);
+ }
+
+ /*
+ * Allocate a new spa_t structure.
+ */
+ spa = spa_add(pool, altroot);
+ spa_activate(spa);
+
+ spa->spa_uberblock.ub_txg = txg - 1;
+ spa->spa_uberblock.ub_version = ZFS_VERSION;
+ spa->spa_ubsync = spa->spa_uberblock;
+
+ /*
+ * Create the root vdev.
+ */
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+ ASSERT(error != 0 || rvd != NULL);
+ ASSERT(error != 0 || spa->spa_root_vdev == rvd);
+
+ if (error == 0 && rvd->vdev_children == 0)
+ error = EINVAL;
+
+ if (error == 0 &&
+ (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
+ (error = spa_validate_spares(spa, nvroot, txg,
+ VDEV_ALLOC_ADD)) == 0) {
+ for (c = 0; c < rvd->vdev_children; c++)
+ vdev_init(rvd->vdev_child[c], txg);
+ vdev_config_dirty(rvd);
+ }
+
+ spa_config_exit(spa, FTAG);
+
+ if (error != 0) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Get the list of spares, if specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_load_spares(spa);
+ spa_config_exit(spa, FTAG);
+ spa->spa_sync_spares = B_TRUE;
+ }
+
+ spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
+ spa->spa_meta_objset = dp->dp_meta_objset;
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * Create the pool config object.
+ */
+ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, 1 << 14,
+ DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
+
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+ sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool config");
+ }
+
+ /* Newly created pools are always deflated. */
+ spa->spa_deflate = TRUE;
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add deflate");
+ }
+
+ /*
+ * Create the deferred-free bplist object. Turn off compression
+ * because sync-to-convergence takes longer if the blocksize
+ * keeps changing.
+ */
+ spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
+ 1 << 14, tx);
+ dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
+ ZIO_COMPRESS_OFF, tx);
+
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+ sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add bplist");
+ }
+
+ /*
+ * Create the pool's history object.
+ */
+ spa_history_create_obj(spa, tx);
+
+ dmu_tx_commit(tx);
+
+ spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+
+ /*
+ * We explicitly wait for the first transaction to complete so that our
+ * bean counters are appropriately updated.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ spa_config_sync();
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Import the given pool into the system. We set up the necessary spa_t and
+ * then call spa_load() to do the dirty work.
+ */
+int
+spa_import(const char *pool, nvlist_t *config, const char *altroot)
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *nvroot;
+ nvlist_t **spares;
+ uint_t nspares;
+
+ if (!(spa_mode & FWRITE))
+ return (EROFS);
+
+ /*
+ * If a pool with this name exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (EEXIST);
+ }
+
+ /*
+ * Create and initialize the spa structure.
+ */
+ spa = spa_add(pool, altroot);
+ spa_activate(spa);
+
+ /*
+ * Pass off the heavy lifting to spa_load().
+ * Pass TRUE for mosconfig because the user-supplied config
+ * is actually the one to trust when doing an import.
+ */
+ error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ /*
+ * Toss any existing sparelist, as it doesn't have any validity anymore,
+ * and conflicts with spa_has_spare().
+ */
+ if (spa->spa_sparelist) {
+ nvlist_free(spa->spa_sparelist);
+ spa->spa_sparelist = NULL;
+ spa_load_spares(spa);
+ }
+
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ if (error == 0)
+ error = spa_validate_spares(spa, nvroot, -1ULL,
+ VDEV_ALLOC_SPARE);
+ spa_config_exit(spa, FTAG);
+
+ if (error != 0) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Override any spares as specified by the user, as these may have
+ * correct device names/devids, etc.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ if (spa->spa_sparelist)
+ VERIFY(nvlist_remove(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
+ else
+ VERIFY(nvlist_alloc(&spa->spa_sparelist,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_load_spares(spa);
+ spa_config_exit(spa, FTAG);
+ spa->spa_sync_spares = B_TRUE;
+ }
+
+ /*
+ * Update the config cache to include the newly-imported pool.
+ */
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+
+ mutex_exit(&spa_namespace_lock);
+
+ /*
+ * Resilver anything that's out of date.
+ */
+ if (spa_mode & FWRITE)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define TRYIMPORT_NAME "$import"
+
+nvlist_t *
+spa_tryimport(nvlist_t *tryconfig)
+{
+ nvlist_t *config = NULL;
+ char *poolname;
+ spa_t *spa;
+ uint64_t state;
+
+ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
+ return (NULL);
+
+ if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
+ return (NULL);
+
+ /*
+ * Create and initialize the spa structure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_add(TRYIMPORT_NAME, NULL);
+ spa_activate(spa);
+
+ /*
+ * Pass off the heavy lifting to spa_load().
+ * Pass TRUE for mosconfig because the user-supplied config
+ * is actually the one to trust when doing an import.
+ */
+ (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
+
+ /*
+ * If 'tryconfig' was at least parsable, return the current config.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ spa_config_enter(spa, RW_READER, FTAG);
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ spa_config_exit(spa, FTAG);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ poolname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ state) == 0);
+
+ /*
+ * Add the list of hot spares.
+ */
+ spa_add_spares(spa, config);
+ }
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+
+ return (config);
+}
+
+/*
+ * Pool export/destroy
+ *
+ * The act of destroying or exporting a pool is very simple. We make sure there
+ * is no more pending I/O and any references to the pool are gone. Then, we
+ * update the pool state and sync all the labels to disk, removing the
+ * configuration from the cache afterwards.
+ */
+static int
+spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
+{
+ spa_t *spa;
+
+ if (oldconfig)
+ *oldconfig = NULL;
+
+ if (!(spa_mode & FWRITE))
+ return (EROFS);
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(pool)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+
+ /*
+ * Put a hold on the pool, drop the namespace lock, stop async tasks,
+ * reacquire the namespace lock, and see if we can export.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ /*
+ * The pool will be in core if it's openable,
+ * in which case we can modify its state.
+ */
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
+ /*
+ * Objsets may be open only because they're dirty, so we
+ * have to force it to sync before checking spa_refcnt.
+ */
+ spa_scrub_suspend(spa);
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * A pool cannot be exported or destroyed if there are active
+ * references. If we are resetting a pool, allow references by
+ * fault injection handlers.
+ */
+ if (!spa_refcount_zero(spa) ||
+ (spa->spa_inject_ref != 0 &&
+ new_state != POOL_STATE_UNINITIALIZED)) {
+ spa_scrub_resume(spa);
+ spa_async_resume(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (EBUSY);
+ }
+
+ spa_scrub_resume(spa);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
+
+ /*
+ * We want this to be reflected on every label,
+ * so mark them all dirty. spa_unload() will do the
+ * final sync that pushes these changes out.
+ */
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa->spa_state = new_state;
+ spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_config_exit(spa, FTAG);
+ }
+ }
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+
+ if (oldconfig && spa->spa_config)
+ VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
+
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ spa_remove(spa);
+ spa_config_sync();
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Destroy a storage pool.
+ */
+int
+spa_destroy(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
+}
+
+/*
+ * Export a storage pool.
+ */
+int
+spa_export(char *pool, nvlist_t **oldconfig)
+{
+ return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
+}
+
+/*
+ * Similar to spa_export(), this unloads the spa_t without actually removing it
+ * from the namespace in any way.
+ */
+int
+spa_reset(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
+}
+
+
+/*
+ * ==========================================================================
+ * Device manipulation
+ * ==========================================================================
+ */
+
+/*
+ * Add capacity to a storage pool.
+ */
+int
+spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
+{
+ uint64_t txg;
+ int c, error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *tvd;
+ nvlist_t **spares;
+ uint_t i, nspares;
+
+ txg = spa_vdev_enter(spa);
+
+ if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
+ VDEV_ALLOC_ADD)) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ spa->spa_pending_vdev = vd;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) != 0)
+ nspares = 0;
+
+ if (vd->vdev_children == 0 && nspares == 0) {
+ spa->spa_pending_vdev = NULL;
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+ }
+
+ if (vd->vdev_children != 0) {
+ if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
+ spa->spa_pending_vdev = NULL;
+ return (spa_vdev_exit(spa, vd, txg, error));
+ }
+ }
+
+ /*
+ * We must validate the spares after checking the children. Otherwise,
+ * vdev_inuse() will blindly overwrite the spare.
+ */
+ if ((error = spa_validate_spares(spa, nvroot, txg,
+ VDEV_ALLOC_ADD)) != 0) {
+ spa->spa_pending_vdev = NULL;
+ return (spa_vdev_exit(spa, vd, txg, error));
+ }
+
+ spa->spa_pending_vdev = NULL;
+
+ /*
+ * Transfer each new top-level vdev from vd to rvd.
+ */
+ for (c = 0; c < vd->vdev_children; c++) {
+ tvd = vd->vdev_child[c];
+ vdev_remove_child(vd, tvd);
+ tvd->vdev_id = rvd->vdev_children;
+ vdev_add_child(rvd, tvd);
+ vdev_config_dirty(tvd);
+ }
+
+ if (nspares != 0) {
+ if (spa->spa_sparelist != NULL) {
+ nvlist_t **oldspares;
+ uint_t oldnspares;
+ nvlist_t **newspares;
+
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);
+
+ newspares = kmem_alloc(sizeof (void *) *
+ (nspares + oldnspares), KM_SLEEP);
+ for (i = 0; i < oldnspares; i++)
+ VERIFY(nvlist_dup(oldspares[i],
+ &newspares[i], KM_SLEEP) == 0);
+ for (i = 0; i < nspares; i++)
+ VERIFY(nvlist_dup(spares[i],
+ &newspares[i + oldnspares],
+ KM_SLEEP) == 0);
+
+ VERIFY(nvlist_remove(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, newspares,
+ nspares + oldnspares) == 0);
+ for (i = 0; i < oldnspares + nspares; i++)
+ nvlist_free(newspares[i]);
+ kmem_free(newspares, (oldnspares + nspares) *
+ sizeof (void *));
+ } else {
+ VERIFY(nvlist_alloc(&spa->spa_sparelist,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ }
+
+ spa_load_spares(spa);
+ spa->spa_sync_spares = B_TRUE;
+ }
+
+ /*
+ * We have to be careful when adding new vdevs to an existing pool.
+ * If other threads start allocating from these vdevs before we
+ * sync the config cache, and we lose power, then upon reboot we may
+ * fail to open the pool because there are DVAs that the config cache
+ * can't translate. Therefore, we first add the vdevs without
+ * initializing metaslabs; sync the config cache (via spa_vdev_exit());
+ * and then let spa_config_update() initialize the new metaslabs.
+ *
+ * spa_load() checks for added-but-not-initialized vdevs, so that
+ * if we lose power at any point in this sequence, the remaining
+ * steps will be completed the next time we load the pool.
+ */
+ (void) spa_vdev_exit(spa, vd, txg, 0);
+
+ mutex_enter(&spa_namespace_lock);
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Attach a device to a mirror. The arguments are the path to any device
+ * in the mirror, and the nvroot for the new device. If the path specifies
+ * a device that is not mirrored, we automatically insert the mirror vdev.
+ *
+ * If 'replacing' is specified, the new device is intended to replace the
+ * existing device; in this case the two devices are made into their own
+ * mirror using the 'replacing' vdev, which is functionally identical to
+ * the mirror vdev (it actually reuses all the same ops) but has a few
+ * extra rules: you can't attach to it after it's been created, and upon
+ * completion of resilvering, the first disk (the one being replaced)
+ * is automatically detached.
+ */
+int
+spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
+{
+ uint64_t txg, open_txg;
+ int error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
+ vdev_ops_t *pvops;
+
+ txg = spa_vdev_enter(spa);
+
+ oldvd = vdev_lookup_by_guid(rvd, guid);
+
+ if (oldvd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!oldvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = oldvd->vdev_parent;
+
+ if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
+ VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ newvd = newrootvd->vdev_child[0];
+
+ if (!newvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, error));
+
+ if (!replacing) {
+ /*
+ * For attach, the only allowable parent is a mirror or the root
+ * vdev.
+ */
+ if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_root_ops)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ pvops = &vdev_mirror_ops;
+ } else {
+ /*
+ * Active hot spares can only be replaced by inactive hot
+ * spares.
+ */
+ if (pvd->vdev_ops == &vdev_spare_ops &&
+ pvd->vdev_child[1] == oldvd &&
+ !spa_has_spare(spa, newvd->vdev_guid))
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * If the source is a hot spare, and the parent isn't already a
+ * spare, then we want to create a new hot spare. Otherwise, we
+ * want to create a replacing vdev. The user is not allowed to
+ * attach to a spared vdev child unless the 'isspare' state is
+ * the same (spare replaces spare, non-spare replaces
+ * non-spare).
+ */
+ if (pvd->vdev_ops == &vdev_replacing_ops)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ else if (pvd->vdev_ops == &vdev_spare_ops &&
+ newvd->vdev_isspare != oldvd->vdev_isspare)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ else if (pvd->vdev_ops != &vdev_spare_ops &&
+ newvd->vdev_isspare)
+ pvops = &vdev_spare_ops;
+ else
+ pvops = &vdev_replacing_ops;
+ }
+
+ /*
+ * Compare the new device size with the replaceable/attachable
+ * device size.
+ */
+ if (newvd->vdev_psize < vdev_get_rsize(oldvd))
+ return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
+
+ /*
+ * The new device cannot have a higher alignment requirement
+ * than the top-level vdev.
+ */
+ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
+ return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
+
+ /*
+ * If this is an in-place replacement, update oldvd's path and devid
+ * to make it distinguishable from newvd, and unopenable from now on.
+ */
+ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ spa_strfree(oldvd->vdev_path);
+ oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ KM_SLEEP);
+ (void) sprintf(oldvd->vdev_path, "%s/%s",
+ newvd->vdev_path, "old");
+ if (oldvd->vdev_devid != NULL) {
+ spa_strfree(oldvd->vdev_devid);
+ oldvd->vdev_devid = NULL;
+ }
+ }
+
+ /*
+ * If the parent is not a mirror, or if we're replacing, insert the new
+ * mirror/replacing/spare vdev above oldvd.
+ */
+ if (pvd->vdev_ops != pvops)
+ pvd = vdev_add_parent(oldvd, pvops);
+
+ ASSERT(pvd->vdev_top->vdev_parent == rvd);
+ ASSERT(pvd->vdev_ops == pvops);
+ ASSERT(oldvd->vdev_parent == pvd);
+
+ /*
+ * Extract the new device from its root and add it to pvd.
+ */
+ vdev_remove_child(newrootvd, newvd);
+ newvd->vdev_id = pvd->vdev_children;
+ vdev_add_child(pvd, newvd);
+
+ /*
+ * If newvd is smaller than oldvd, but larger than its rsize,
+ * the addition of newvd may have decreased our parent's asize.
+ */
+ pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
+
+ tvd = newvd->vdev_top;
+ ASSERT(pvd->vdev_top == tvd);
+ ASSERT(tvd->vdev_parent == rvd);
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
+ * upward when spa_vdev_exit() calls vdev_dtl_reassess().
+ */
+ open_txg = txg + TXG_CONCURRENT_STATES - 1;
+
+ mutex_enter(&newvd->vdev_dtl_lock);
+ space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
+ open_txg - TXG_INITIAL + 1);
+ mutex_exit(&newvd->vdev_dtl_lock);
+
+ if (newvd->vdev_isspare)
+ spa_spare_activate(newvd);
+
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, newvd, txg);
+
+ (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
+
+ /*
+ * Kick off a resilver to update newvd.
+ */
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+/*
+ * Detach a device from a mirror or replacing vdev.
+ * If 'replace_done' is specified, only detach if the parent
+ * is a replacing vdev.
+ */
+int
+spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
+{
+ uint64_t txg;
+ int c, t, error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *pvd, *cvd, *tvd;
+ boolean_t unspare = B_FALSE;
+ uint64_t unspare_guid;
+
+ txg = spa_vdev_enter(spa);
+
+ vd = vdev_lookup_by_guid(rvd, guid);
+
+ if (vd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = vd->vdev_parent;
+
+ /*
+ * If replace_done is specified, only remove this device if it's
+ * the first child of a replacing vdev. For the 'spare' vdev, either
+ * disk can be removed.
+ */
+ if (replace_done) {
+ if (pvd->vdev_ops == &vdev_replacing_ops) {
+ if (vd->vdev_id != 0)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ } else if (pvd->vdev_ops != &vdev_spare_ops) {
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ }
+ }
+
+ ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
+ spa_version(spa) >= ZFS_VERSION_SPARES);
+
+ /*
+ * Only mirror, replacing, and spare vdevs support detach.
+ */
+ if (pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * If there's only one replica, you can't detach it.
+ */
+ if (pvd->vdev_children <= 1)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * If all siblings have non-empty DTLs, this device may have the only
+ * valid copy of the data, which means we cannot safely detach it.
+ *
+ * XXX -- as in the vdev_offline() case, we really want a more
+ * precise DTL check.
+ */
+ for (c = 0; c < pvd->vdev_children; c++) {
+ uint64_t dirty;
+
+ cvd = pvd->vdev_child[c];
+ if (cvd == vd)
+ continue;
+ if (vdev_is_dead(cvd))
+ continue;
+ mutex_enter(&cvd->vdev_dtl_lock);
+ dirty = cvd->vdev_dtl_map.sm_space |
+ cvd->vdev_dtl_scrub.sm_space;
+ mutex_exit(&cvd->vdev_dtl_lock);
+ if (!dirty)
+ break;
+ }
+
+ /*
+ * If we are a replacing or spare vdev, then we can always detach the
+ * latter child, as that is how one cancels the operation.
+ */
+ if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
+ c == pvd->vdev_children)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * If we are detaching the original disk from a spare, then it implies
+ * that the spare should become a real disk, and be removed from the
+ * active spare list for the pool.
+ */
+ if (pvd->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_id == 0)
+ unspare = B_TRUE;
+
+ /*
+ * Erase the disk labels so the disk can be used for other things.
+ * This must be done after all other error cases are handled,
+ * but before we disembowel vd (so we can still do I/O to it).
+ * But if we can't do it, don't treat the error as fatal --
+ * it may be that the unwritability of the disk is the reason
+ * it's being detached!
+ */
+ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ /*
+ * Remove vd from its parent and compact the parent's children.
+ */
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ /*
+ * Remember one of the remaining children so we can get tvd below.
+ */
+ cvd = pvd->vdev_child[0];
+
+ /*
+ * If we need to remove the remaining child from the list of hot spares,
+ * do it now, marking the vdev as no longer a spare in the process. We
+ * must do this before vdev_remove_parent(), because that can change the
+ * GUID if it creates a new toplevel GUID.
+ */
+ if (unspare) {
+ ASSERT(cvd->vdev_isspare);
+ spa_spare_remove(cvd);
+ unspare_guid = cvd->vdev_guid;
+ }
+
+ /*
+ * If the parent mirror/replacing vdev only has one child,
+ * the parent is no longer needed. Remove it from the tree.
+ */
+ if (pvd->vdev_children == 1)
+ vdev_remove_parent(cvd);
+
+ /*
+ * We don't set tvd until now because the parent we just removed
+ * may have been the previous top-level vdev.
+ */
+ tvd = cvd->vdev_top;
+ ASSERT(tvd->vdev_parent == rvd);
+
+ /*
+ * Reevaluate the parent vdev state.
+ */
+ vdev_propagate_state(cvd->vdev_parent);
+
+ /*
+ * If the device we just detached was smaller than the others, it may be
+ * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init()
+ * can't fail because the existing metaslabs are already in core, so
+ * there's nothing to read from disk.
+ */
+ VERIFY(vdev_metaslab_init(tvd, txg) == 0);
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
+ * vd->vdev_detached is set and free vd's DTL object in syncing context.
+ * But first make sure we're not on any *other* txg's DTL list, to
+ * prevent vd from being accessed after it's freed.
+ */
+ for (t = 0; t < TXG_SIZE; t++)
+ (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+ vd->vdev_detached = B_TRUE;
+ vdev_dirty(tvd, VDD_DTL, vd, txg);
+
+ error = spa_vdev_exit(spa, vd, txg, 0);
+
+ /*
+ * If this was the removal of the original device in a hot spare vdev,
+ * then we want to go through and remove the device from the hot spare
+ * list of every other pool.
+ */
+ if (unspare) {
+ spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa->spa_state != POOL_STATE_ACTIVE)
+ continue;
+
+ (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+ }
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (error);
+}
+
+/*
+ * Remove a device from the pool. Currently, this supports removing only hot
+ * spares.
+ */
+int
+spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
+{
+ vdev_t *vd;
+ nvlist_t **spares, *nv, **newspares;
+ uint_t i, j, nspares;
+ int ret = 0;
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ vd = spa_lookup_by_guid(spa, guid);
+
+ nv = NULL;
+ if (spa->spa_spares != NULL &&
+ nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ for (i = 0; i < nspares; i++) {
+ uint64_t theguid;
+
+ VERIFY(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &theguid) == 0);
+ if (theguid == guid) {
+ nv = spares[i];
+ break;
+ }
+ }
+ }
+
+ /*
+ * We only support removing a hot spare, and only if it's not currently
+ * in use in this pool.
+ */
+ if (nv == NULL && vd == NULL) {
+ ret = ENOENT;
+ goto out;
+ }
+
+ if (nv == NULL && vd != NULL) {
+ ret = ENOTSUP;
+ goto out;
+ }
+
+ if (!unspare && nv != NULL && vd != NULL) {
+ ret = EBUSY;
+ goto out;
+ }
+
+ if (nspares == 1) {
+ newspares = NULL;
+ } else {
+ newspares = kmem_alloc((nspares - 1) * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0, j = 0; i < nspares; i++) {
+ if (spares[i] != nv)
+ VERIFY(nvlist_dup(spares[i],
+ &newspares[j++], KM_SLEEP) == 0);
+ }
+ }
+
+ VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
+ newspares, nspares - 1) == 0);
+ for (i = 0; i < nspares - 1; i++)
+ nvlist_free(newspares[i]);
+ kmem_free(newspares, (nspares - 1) * sizeof (void *));
+ spa_load_spares(spa);
+ spa->spa_sync_spares = B_TRUE;
+
+out:
+ spa_config_exit(spa, FTAG);
+
+ return (ret);
+}
+
+/*
+ * Find any device that's done replacing, so we can detach it.
+ */
+static vdev_t *
+spa_vdev_replace_done_hunt(vdev_t *vd)
+{
+ vdev_t *newvd, *oldvd;
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
+ if (oldvd != NULL)
+ return (oldvd);
+ }
+
+ if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
+ oldvd = vd->vdev_child[0];
+ newvd = vd->vdev_child[1];
+
+ mutex_enter(&newvd->vdev_dtl_lock);
+ if (newvd->vdev_dtl_map.sm_space == 0 &&
+ newvd->vdev_dtl_scrub.sm_space == 0) {
+ mutex_exit(&newvd->vdev_dtl_lock);
+ return (oldvd);
+ }
+ mutex_exit(&newvd->vdev_dtl_lock);
+ }
+
+ return (NULL);
+}
+
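+/*
+ * Detach each device whose replacement has completed; when the replacement
+ * happened under a hot spare, the original spare is detached as well.
+ */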
+static void
+spa_vdev_replace_done(spa_t *spa)
+{
+ vdev_t *vd;
+ vdev_t *pvd;
+ uint64_t guid;
+ uint64_t pguid = 0;
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
+ guid = vd->vdev_guid;
+ /*
+ * If we have just finished replacing a hot spared device, then
+ * we need to detach the parent's first child (the original hot
+ * spare) as well.
+ */
+ pvd = vd->vdev_parent;
+ if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ pvd->vdev_id == 0) {
+ ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
+ ASSERT(pvd->vdev_parent->vdev_children == 2);
+ pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
+ }
+ spa_config_exit(spa, FTAG);
+ if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
+ return;
+ if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
+ return;
+ spa_config_enter(spa, RW_READER, FTAG);
+ }
+
+ spa_config_exit(spa, FTAG);
+}
+
+/*
+ * Update the stored path for this vdev. Dirty the vdev configuration, relying
+ * on spa_vdev_enter/exit() to synchronize the labels and cache.
+ */
+int
+spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ rvd = spa->spa_root_vdev;
+
+ txg = spa_vdev_enter(spa);
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
+ /*
+ * Determine if this is a reference to a hot spare. In that
+ * case, update the path as stored in the spare list.
+ */
+ nvlist_t **spares;
+ uint_t i, nspares;
+ if (spa->spa_sparelist != NULL) {
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+ for (i = 0; i < nspares; i++) {
+ uint64_t theguid;
+ VERIFY(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &theguid) == 0);
+ if (theguid == guid)
+ break;
+ }
+
+ if (i == nspares)
+ return (spa_vdev_exit(spa, NULL, txg, ENOENT));
+
+ VERIFY(nvlist_add_string(spares[i],
+ ZPOOL_CONFIG_PATH, newpath) == 0);
+ spa_load_spares(spa);
+ spa->spa_sync_spares = B_TRUE;
+ return (spa_vdev_exit(spa, NULL, txg, 0));
+ } else {
+ return (spa_vdev_exit(spa, NULL, txg, ENOENT));
+ }
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = spa_strdup(newpath);
+
+ vdev_config_dirty(vd->vdev_top);
+
+ return (spa_vdev_exit(spa, NULL, txg, 0));
+}
+
+/*
+ * ==========================================================================
+ * SPA Scrubbing
+ * ==========================================================================
+ */
+
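+/*
+ * Completion callback for scrub/resilver I/O: free the data buffer, charge
+ * any error against the vdev, and wake up waiters once the number of
+ * in-flight I/Os drops below the limit.
+ */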
+static void
+spa_scrub_io_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ zio_data_buf_free(zio->io_data, zio->io_size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
+ spa->spa_scrub_errors++;
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
+ cv_broadcast(&spa->spa_scrub_io_cv);
+
+ ASSERT(spa->spa_scrub_inflight >= 0);
+
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
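+/*
+ * Issue an asynchronous scrub/resilver read of the given block, throttling
+ * so that no more than spa_scrub_maxinflight I/Os are outstanding at once.
+ */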
+static void
+spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
+ zbookmark_t *zb)
+{
+ size_t size = BP_GET_LSIZE(bp);
+ void *data;
+
+ mutex_enter(&spa->spa_scrub_lock);
+ /*
+ * Do not give too much work to vdev(s).
+ */
+ while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ }
+ spa->spa_scrub_inflight++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ data = zio_data_buf_alloc(size);
+
+ if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */
+
+ flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
+
+ zio_nowait(zio_read(NULL, spa, bp, data, size,
+ spa_scrub_io_done, NULL, priority, flags, zb));
+}
+
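+/*
+ * Per-block callback for the pool traversal: account for errors, update the
+ * per-vdev scrub statistics, and issue a scrub or resilver read as needed.
+ */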
+/* ARGSUSED */
+static int
+spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
+{
+ blkptr_t *bp = &bc->bc_blkptr;
+ vdev_t *vd = spa->spa_root_vdev;
+ dva_t *dva = bp->blk_dva;
+ int needs_resilver = B_FALSE;
+ int d;
+
+ if (bc->bc_errno) {
+ /*
+ * We can't scrub this block, but we can continue to scrub
+ * the rest of the pool. Note the error and move along.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_errors++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ return (ERESTART);
+ }
+
+ ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
+
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
+
+ ASSERT(vd != NULL);
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(1M) status can make useful progress reports.
+ */
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
+ mutex_exit(&vd->vdev_stat_lock);
+
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
+ if (DVA_GET_GANG(&dva[d])) {
+ /*
+ * Gang members may be spread across multiple
+ * vdevs, so the best we can do is look at the
+ * pool-wide DTL.
+ * XXX -- it would be better to change our
+ * allocation policy to ensure that this can't
+ * happen.
+ */
+ vd = spa->spa_root_vdev;
+ }
+ if (vdev_dtl_contains(&vd->vdev_dtl_map,
+ bp->blk_birth, 1))
+ needs_resilver = B_TRUE;
+ }
+ }
+
+ if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
+ spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_SCRUB, &bc->bc_bookmark);
+ else if (needs_resilver)
+ spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
+ ZIO_FLAG_RESILVER, &bc->bc_bookmark);
+
+ return (0);
+}
+
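+/*
+ * Body of the scrub/resilver thread: drive the pool traversal, honoring
+ * suspend/stop/restart requests, then reassess DTLs and update scrub
+ * statistics once the traversal completes.
+ */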
+static void
+spa_scrub_thread(void *arg)
+{
+ spa_t *spa = arg;
+ callb_cpr_t cprinfo;
+ traverse_handle_t *th = spa->spa_scrub_th;
+ vdev_t *rvd = spa->spa_root_vdev;
+ pool_scrub_type_t scrub_type = spa->spa_scrub_type;
+ int error = 0;
+ boolean_t complete;
+
+ CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
+
+ /*
+ * If we're restarting due to a snapshot create/delete,
+ * wait for that to complete.
+ */
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ dprintf("start %s mintxg=%llu maxtxg=%llu\n",
+ scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
+ spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ vdev_reopen(rvd); /* purge all vdev caches */
+ vdev_config_dirty(rvd); /* rewrite all disk labels */
+ vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
+ spa_config_exit(spa, FTAG);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_errors = 0;
+ spa->spa_scrub_active = 1;
+ ASSERT(spa->spa_scrub_inflight == 0);
+
+ while (!spa->spa_scrub_stop) {
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ while (spa->spa_scrub_suspended) {
+ spa->spa_scrub_active = 0;
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_active = 1;
+ }
+ CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
+
+ if (spa->spa_scrub_restart_txg != 0)
+ break;
+
+ mutex_exit(&spa->spa_scrub_lock);
+ error = traverse_more(th);
+ mutex_enter(&spa->spa_scrub_lock);
+ if (error != EAGAIN)
+ break;
+ }
+
+ while (spa->spa_scrub_inflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+
+ spa->spa_scrub_active = 0;
+ cv_broadcast(&spa->spa_scrub_cv);
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
+ * AND the spa config lock to synchronize with any config changes
+ * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
+ */
+ if (spa->spa_scrub_restart_txg != 0)
+ error = ERESTART;
+
+ if (spa->spa_scrub_stop)
+ error = EINTR;
+
+ /*
+ * Even if there were uncorrectable errors, we consider the scrub
+ * completed. The downside is that if there is a transient error during
+ * a resilver, we won't resilver the data properly to the target. But
+ * if the damage is permanent (more likely) we will resilver forever,
+ * which isn't really acceptable. Since there is enough information for
+ * the user to know what has failed and why, this seems like a more
+ * tractable approach.
+ */
+ complete = (error == 0);
+
+ dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
+ scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
+ spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
+ error, spa->spa_scrub_errors, spa->spa_scrub_stop);
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /*
+ * If the scrub/resilver completed, update all DTLs to reflect this.
+ * Whether it succeeded or not, vacate all temporary scrub DTLs.
+ */
+ vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
+ complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
+ vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
+ spa_errlog_rotate(spa);
+
+ spa_config_exit(spa, FTAG);
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
+
+ /*
+ * If we were told to restart, our final act is to start a new scrub.
+ */
+ if (error == ERESTART)
+ spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
+ SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
+
+ spa->spa_scrub_type = POOL_SCRUB_NONE;
+ spa->spa_scrub_active = 0;
+ spa->spa_scrub_thread = NULL;
+ cv_broadcast(&spa->spa_scrub_cv);
+ CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */
+ thread_exit();
+}
+
+void
+spa_scrub_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_suspended++;
+ while (spa->spa_scrub_active) {
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ }
+ while (spa->spa_scrub_inflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+void
+spa_scrub_resume(spa_t *spa)
+{
+ mutex_enter(&spa->spa_scrub_lock);
+ ASSERT(spa->spa_scrub_suspended != 0);
+ if (--spa->spa_scrub_suspended == 0)
+ cv_broadcast(&spa->spa_scrub_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+void
+spa_scrub_restart(spa_t *spa, uint64_t txg)
+{
+ /*
+ * Something happened (e.g. snapshot create/delete) that means
+ * we must restart any in-progress scrubs. The itinerary will
+ * fix this properly.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_restart_txg = txg;
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+int
+spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+{
+ space_seg_t *ss;
+ uint64_t mintxg, maxtxg;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ if ((uint_t)type >= POOL_SCRUB_TYPES)
+ return (ENOTSUP);
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * If there's a scrub or resilver already in progress, stop it.
+ */
+ while (spa->spa_scrub_thread != NULL) {
+ /*
+ * Don't stop a resilver unless forced.
+ */
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
+ mutex_exit(&spa->spa_scrub_lock);
+ return (EBUSY);
+ }
+ spa->spa_scrub_stop = 1;
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ }
+
+ /*
+ * Terminate the previous traverse.
+ */
+ if (spa->spa_scrub_th != NULL) {
+ traverse_fini(spa->spa_scrub_th);
+ spa->spa_scrub_th = NULL;
+ }
+
+ if (rvd == NULL) {
+ ASSERT(spa->spa_scrub_stop == 0);
+ ASSERT(spa->spa_scrub_type == type);
+ ASSERT(spa->spa_scrub_restart_txg == 0);
+ mutex_exit(&spa->spa_scrub_lock);
+ return (0);
+ }
+
+ mintxg = TXG_INITIAL - 1;
+ maxtxg = spa_last_synced_txg(spa) + 1;
+
+ mutex_enter(&rvd->vdev_dtl_lock);
+
+ if (rvd->vdev_dtl_map.sm_space == 0) {
+ /*
+ * The pool-wide DTL is empty.
+ * If this is a resilver, there's nothing to do except
+ * check whether any in-progress replacements have completed.
+ */
+ if (type == POOL_SCRUB_RESILVER) {
+ type = POOL_SCRUB_NONE;
+ spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
+ }
+ } else {
+ /*
+ * The pool-wide DTL is non-empty.
+ * If this is a normal scrub, upgrade to a resilver instead.
+ */
+ if (type == POOL_SCRUB_EVERYTHING)
+ type = POOL_SCRUB_RESILVER;
+ }
+
+ if (type == POOL_SCRUB_RESILVER) {
+ /*
+ * Determine the resilvering boundaries.
+ *
+ * Note: (mintxg, maxtxg) is an open interval,
+ * i.e. mintxg and maxtxg themselves are not included.
+ *
+ * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
+ * so we don't claim to resilver a txg that's still changing.
+ */
+ ss = avl_first(&rvd->vdev_dtl_map.sm_root);
+ mintxg = ss->ss_start - 1;
+ ss = avl_last(&rvd->vdev_dtl_map.sm_root);
+ maxtxg = MIN(ss->ss_end, maxtxg);
+ }
+
+ mutex_exit(&rvd->vdev_dtl_lock);
+
+ spa->spa_scrub_stop = 0;
+ spa->spa_scrub_type = type;
+ spa->spa_scrub_restart_txg = 0;
+
+ if (type != POOL_SCRUB_NONE) {
+ spa->spa_scrub_mintxg = mintxg;
+ spa->spa_scrub_maxtxg = maxtxg;
+ spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
+ ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
+ ZIO_FLAG_CANFAIL);
+ traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
+ spa->spa_scrub_thread = thread_create(NULL, 0,
+ spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
+ }
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ return (0);
+}
+
+/*
+ * ==========================================================================
+ * SPA async task processing
+ * ==========================================================================
+ */
+
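+/*
+ * Reopen any top-level vdevs that have requested it.
+ */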
+static void
+spa_async_reopen(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *tvd;
+ int c;
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ for (c = 0; c < rvd->vdev_children; c++) {
+ tvd = rvd->vdev_child[c];
+ if (tvd->vdev_reopen_wanted) {
+ tvd->vdev_reopen_wanted = 0;
+ vdev_reopen(tvd);
+ }
+ }
+
+ spa_config_exit(spa, FTAG);
+}
+
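+/*
+ * Worker thread that processes the pending async tasks: config updates,
+ * vdev reopens, replace-done handling, scrubs, and resilvers.
+ */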
+static void
+spa_async_thread(void *arg)
+{
+ spa_t *spa = arg;
+ int tasks;
+
+ ASSERT(spa->spa_sync_on);
+
+ mutex_enter(&spa->spa_async_lock);
+ tasks = spa->spa_async_tasks;
+ spa->spa_async_tasks = 0;
+ mutex_exit(&spa->spa_async_lock);
+
+ /*
+ * See if the config needs to be updated.
+ */
+ if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
+ * See if any devices need to be reopened.
+ */
+ if (tasks & SPA_ASYNC_REOPEN)
+ spa_async_reopen(spa);
+
+ /*
+ * If any devices are done replacing, detach them.
+ */
+ if (tasks & SPA_ASYNC_REPLACE_DONE)
+ spa_vdev_replace_done(spa);
+
+ /*
+ * Kick off a scrub.
+ */
+ if (tasks & SPA_ASYNC_SCRUB)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
+
+ /*
+ * Kick off a resilver.
+ */
+ if (tasks & SPA_ASYNC_RESILVER)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ /*
+ * Let the world know that we're done.
+ */
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_thread = NULL;
+ cv_broadcast(&spa->spa_async_cv);
+ mutex_exit(&spa->spa_async_lock);
+ thread_exit();
+}
+
+void
+spa_async_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_suspended++;
+ while (spa->spa_async_thread != NULL)
+ cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_resume(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ ASSERT(spa->spa_async_suspended != 0);
+ spa->spa_async_suspended--;
+ mutex_exit(&spa->spa_async_lock);
+}
+
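+/*
+ * Kick off the async worker thread if there are pending tasks, we are not
+ * suspended, no worker is already running, and the root filesystem is
+ * writable.
+ */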
+static void
+spa_async_dispatch(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ if (spa->spa_async_tasks && !spa->spa_async_suspended &&
+ spa->spa_async_thread == NULL &&
+ rootdir != NULL && !vn_is_readonly(rootdir))
+ spa->spa_async_thread = thread_create(NULL, 0,
+ spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_request(spa_t *spa, int task)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_tasks |= task;
+ mutex_exit(&spa->spa_async_lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA syncing routines
+ * ==========================================================================
+ */
+
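+/*
+ * Free every block on the deferred-free bplist for this txg, then vacate
+ * the list.
+ */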
+static void
+spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+{
+ bplist_t *bpl = &spa->spa_sync_bplist;
+ dmu_tx_t *tx;
+ blkptr_t blk;
+ uint64_t itor = 0;
+ zio_t *zio;
+ int error;
+ uint8_t c = 1;
+
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
+
+ while (bplist_iterate(bpl, &itor, &blk) == 0)
+ zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
+
+ error = zio_wait(zio);
+ ASSERT3U(error, ==, 0);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ bplist_vacate(bpl, tx);
+
+ /*
+ * Pre-dirty the first block so we sync to convergence faster.
+ * (Usually only the first block is needed.)
+ */
+ dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
+ dmu_tx_commit(tx);
+}
+
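+/*
+ * Pack the given nvlist and write it to the specified MOS object, recording
+ * the packed size in the object's bonus buffer.
+ */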
+static void
+spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
+{
+ char *packed = NULL;
+ size_t nvsize = 0;
+ dmu_buf_t *db;
+
+ VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
+
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+
+ VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
+
+ dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
+
+ kmem_free(packed, nvsize);
+
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ *(uint64_t *)db->db_data = nvsize;
+ dmu_buf_rele(db, FTAG);
+}
+
+static void
+spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
+{
+ nvlist_t *nvroot;
+ nvlist_t **spares;
+ int i;
+
+ if (!spa->spa_sync_spares)
+ return;
+
+ /*
+ * Update the MOS nvlist describing the list of available spares.
+ * spa_validate_spares() will have already made sure this nvlist is
+ * valid and the vdevs are labelled appropriately.
+ */
+ if (spa->spa_spares_object == 0) {
+ spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, 1 << 14,
+ DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
+ VERIFY(zap_update(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
+ sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
+ }
+
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if (spa->spa_nspares == 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ NULL, 0) == 0);
+ } else {
+ spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < spa->spa_nspares; i++)
+ spares[i] = vdev_config_generate(spa,
+ spa->spa_spares[i], B_FALSE, B_TRUE);
+ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ spares, spa->spa_nspares) == 0);
+ for (i = 0; i < spa->spa_nspares; i++)
+ nvlist_free(spares[i]);
+ kmem_free(spares, spa->spa_nspares * sizeof (void *));
+ }
+
+ spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
+ nvlist_free(nvroot);
+
+ spa->spa_sync_spares = B_FALSE;
+}
+
+static void
+spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
+{
+ nvlist_t *config;
+
+ if (list_is_empty(&spa->spa_dirty_list))
+ return;
+
+ config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
+
+ if (spa->spa_config_syncing)
+ nvlist_free(spa->spa_config_syncing);
+ spa->spa_config_syncing = config;
+
+ spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
+}
+
+static void
+spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ spa_t *spa = arg1;
+ nvlist_t *nvp = arg2;
+ nvpair_t *nvpair;
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t zapobj;
+
+ mutex_enter(&spa->spa_props_lock);
+ if (spa->spa_pool_props_object == 0) {
+ zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx);
+ VERIFY(zapobj > 0);
+
+ spa->spa_pool_props_object = zapobj;
+
+ VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_PROPS, 8, 1,
+ &spa->spa_pool_props_object, tx) == 0);
+ }
+ mutex_exit(&spa->spa_props_lock);
+
+ nvpair = NULL;
+ while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
+ switch (zpool_name_to_prop(nvpair_name(nvpair))) {
+ case ZFS_PROP_BOOTFS:
+ VERIFY(nvlist_lookup_uint64(nvp,
+ nvpair_name(nvpair), &spa->spa_bootfs) == 0);
+ VERIFY(zap_update(mos,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1,
+ &spa->spa_bootfs, tx) == 0);
+ break;
+ }
+ }
+}
+
+/*
+ * Sync the specified transaction group. New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ objset_t *mos = spa->spa_meta_objset;
+ bplist_t *bpl = &spa->spa_sync_bplist;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd;
+ dmu_tx_t *tx;
+ int dirty_vdevs;
+
+ /*
+ * Lock out configuration changes.
+ */
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ spa->spa_syncing_txg = txg;
+ spa->spa_sync_pass = 0;
+
+ VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
+ * set spa_deflate if we have no raid-z vdevs.
+ */
+ if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
+ spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
+ int i;
+
+ for (i = 0; i < rvd->vdev_children; i++) {
+ vd = rvd->vdev_child[i];
+ if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
+ break;
+ }
+ if (i == rvd->vdev_children) {
+ spa->spa_deflate = TRUE;
+ VERIFY(0 == zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx));
+ }
+ }
+
+ /*
+ * If anything has changed in this txg, push the deferred frees
+ * from the previous txg. If not, leave them alone so that we
+ * don't generate work on an otherwise idle system.
+ */
+ if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
+ !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
+ !txg_list_empty(&dp->dp_sync_tasks, txg))
+ spa_sync_deferred_frees(spa, txg);
+
+ /*
+ * Iterate to convergence.
+ */
+ do {
+ spa->spa_sync_pass++;
+
+ spa_sync_config_object(spa, tx);
+ spa_sync_spares(spa, tx);
+ spa_errlog_sync(spa, txg);
+ dsl_pool_sync(dp, txg);
+
+ dirty_vdevs = 0;
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
+ vdev_sync(vd, txg);
+ dirty_vdevs++;
+ }
+
+ bplist_sync(bpl, tx);
+ } while (dirty_vdevs);
+
+ bplist_close(bpl);
+
+ dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
+
+ /*
+ * Rewrite the vdev configuration (which includes the uberblock)
+ * to commit the transaction group.
+ *
+ * If there are any dirty vdevs, sync the uberblock to all vdevs.
+ * Otherwise, pick a random top-level vdev that's known to be
+ * visible in the config cache (see spa_vdev_add() for details).
+ * If the write fails, try the next vdev until we've tried them all.
+ */
+ if (!list_is_empty(&spa->spa_dirty_list)) {
+ VERIFY(vdev_config_sync(rvd, txg) == 0);
+ } else {
+ int children = rvd->vdev_children;
+ int c0 = spa_get_random(children);
+ int c;
+
+ for (c = 0; c < children; c++) {
+ vd = rvd->vdev_child[(c0 + c) % children];
+ if (vd->vdev_ms_array == 0)
+ continue;
+ if (vdev_config_sync(vd, txg) == 0)
+ break;
+ }
+ if (c == children)
+ VERIFY(vdev_config_sync(rvd, txg) == 0);
+ }
+
+ dmu_tx_commit(tx);
+
+ /*
+ * Clear the dirty config list.
+ */
+ while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
+ vdev_config_clean(vd);
+
+ /*
+ * Now that the new config has synced transactionally,
+ * let it become visible to the config cache.
+ */
+ if (spa->spa_config_syncing != NULL) {
+ spa_config_set(spa, spa->spa_config_syncing);
+ spa->spa_config_txg = txg;
+ spa->spa_config_syncing = NULL;
+ }
+
+ /*
+ * Make a stable copy of the fully synced uberblock.
+ * We use this as the root for pool traversals.
+ */
+ spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */
+
+ spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */
+
+ rw_enter(&spa->spa_traverse_lock, RW_WRITER);
+ spa->spa_traverse_wanted = 0;
+ spa->spa_ubsync = spa->spa_uberblock;
+ rw_exit(&spa->spa_traverse_lock);
+
+ spa_scrub_resume(spa); /* resume scrub with new ubsync */
+
+ /*
+ * Clean up the ZIL records for the synced txg.
+ */
+ dsl_pool_zil_clean(dp);
+
+ /*
+ * Update usable space statistics.
+ */
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ vdev_sync_done(vd, txg);
+
+ /*
+ * It had better be the case that we didn't dirty anything
+ * since vdev_config_sync().
+ */
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
+ ASSERT(bpl->bpl_queue == NULL);
+
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * If any async tasks have been requested, kick them off.
+ */
+ spa_async_dispatch(spa);
+}
+
+/*
+ * Sync all pools. We don't want to hold the namespace lock across these
+ * operations, so we take a reference on the spa_t and drop the lock during the
+ * sync.
+ */
+void
+spa_sync_allpools(void)
+{
+ spa_t *spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa_state(spa) != POOL_STATE_ACTIVE)
+ continue;
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous routines
+ * ==========================================================================
+ */
+
+/*
+ * Remove all pools in the system.
+ */
+void
+spa_evict_all(void)
+{
+ spa_t *spa;
+
+ /*
+ * Remove all cached state. All pools should be closed now,
+ * so every spa in the AVL tree should be unreferenced.
+ */
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(NULL)) != NULL) {
+ /*
+ * Stop async tasks. The async thread may need to detach
+ * a device that's been replaced, which requires grabbing
+ * spa_namespace_lock, so we must drop it here.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+ spa_remove(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+vdev_t *
+spa_lookup_by_guid(spa_t *spa, uint64_t guid)
+{
+ return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
+}
+
+void
+spa_upgrade(spa_t *spa)
+{
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ /*
+ * This should only be called for a non-faulted pool, and since a
+ * future version would result in an unopenable pool, this shouldn't be
+ * possible.
+ */
+ ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);
+
+ spa->spa_uberblock.ub_version = ZFS_VERSION;
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ spa_config_exit(spa, FTAG);
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+}
+
+boolean_t
+spa_has_spare(spa_t *spa, uint64_t guid)
+{
+ int i;
+ uint64_t spareguid;
+
+ for (i = 0; i < spa->spa_nspares; i++)
+ if (spa->spa_spares[i]->vdev_guid == guid)
+ return (B_TRUE);
+
+ for (i = 0; i < spa->spa_pending_nspares; i++) {
+ if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
+ ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
+ spareguid == guid)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+int
+spa_set_props(spa_t *spa, nvlist_t *nvp)
+{
+ return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
+ spa, nvp, 3));
+}
+
+int
+spa_get_props(spa_t *spa, nvlist_t **nvp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ objset_t *mos = spa->spa_meta_objset;
+ zfs_source_t src;
+ zfs_prop_t prop;
+ nvlist_t *propval;
+ uint64_t value;
+ int err;
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ mutex_enter(&spa->spa_props_lock);
+ /* If no props object, then just return empty nvlist */
+ if (spa->spa_pool_props_object == 0) {
+ mutex_exit(&spa->spa_props_lock);
+ return (0);
+ }
+
+ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+
+ if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL)
+ continue;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ switch (za.za_integer_length) {
+ case 8:
+ if (zfs_prop_default_numeric(prop) ==
+ za.za_first_integer)
+ src = ZFS_SRC_DEFAULT;
+ else
+ src = ZFS_SRC_LOCAL;
+ value = za.za_first_integer;
+
+ if (prop == ZFS_PROP_BOOTFS) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds = NULL;
+ char strval[MAXPATHLEN];
+
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ if ((err = dsl_dataset_open_obj(dp,
+ za.za_first_integer, NULL, DS_MODE_NONE,
+ FTAG, &ds)) != 0) {
+ rw_exit(&dp->dp_config_rwlock);
+ break;
+ }
+ dsl_dataset_name(ds, strval);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ rw_exit(&dp->dp_config_rwlock);
+
+ VERIFY(nvlist_add_uint64(propval,
+ ZFS_PROP_SOURCE, src) == 0);
+ VERIFY(nvlist_add_string(propval,
+ ZFS_PROP_VALUE, strval) == 0);
+ } else {
+ VERIFY(nvlist_add_uint64(propval,
+ ZFS_PROP_SOURCE, src) == 0);
+ VERIFY(nvlist_add_uint64(propval,
+ ZFS_PROP_VALUE, value) == 0);
+ }
+ VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
+ propval) == 0);
+ break;
+ }
+ nvlist_free(propval);
+ }
+ zap_cursor_fini(&zc);
+ mutex_exit(&spa->spa_props_lock);
+ if (err && err != ENOENT) {
+ nvlist_free(*nvp);
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * If the bootfs property value is dsobj, clear it.
+ */
+void
+spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
+{
+ if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
+ VERIFY(zap_remove(spa->spa_meta_objset,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0);
+ spa->spa_bootfs = 0;
+ }
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
new file mode 100644
index 000000000000..b5d8c38d701b
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
@@ -0,0 +1,361 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/nvpair.h>
+#include <sys/uio.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_ioctl.h>
+#ifdef _KERNEL
+#include <sys/kobj.h>
+#endif
+
+/*
+ * Pool configuration repository.
+ *
+ * The configuration for all pools, in addition to being stored on disk, is
+ * stored in /etc/zfs/zpool.cache as a packed nvlist. The kernel maintains
+ * this list as pools are created, destroyed, or modified.
+ *
+ * We have a single nvlist which holds all the configuration information. When
+ * the module loads, we read this information from the cache and populate the
+ * SPA namespace. This namespace is maintained independently in spa.c.
+ * Whenever the namespace is modified, or the configuration of a pool is
+ * changed, we call spa_config_sync(), which walks through all the active pools
+ * and writes the configuration to disk.
+ */
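+
+/*
+ * Purely illustrative userland sketch (not part of this file and not
+ * compiled): how the packed-nvlist cache described above could be read
+ * back with libnvpair, printing one line per pool.  The path and the
+ * minimal error handling are assumptions made for the example.
+ */
+#if 0
+#include <stdio.h>
+#include <stdlib.h>
+#include <libnvpair.h>
+
+int
+main(void)
+{
+ FILE *fp = fopen("/etc/zfs/zpool.cache", "r");
+ char *buf;
+ long size;
+ nvlist_t *nvl;
+ nvpair_t *pair = NULL;
+
+ if (fp == NULL)
+ return (1);
+ (void) fseek(fp, 0, SEEK_END);
+ size = ftell(fp);
+ rewind(fp);
+ buf = malloc(size);
+ (void) fread(buf, 1, size, fp);
+ (void) fclose(fp);
+
+ /* Unpack the XDR-encoded nvlist; each top-level pair names one pool. */
+ if (nvlist_unpack(buf, size, &nvl, 0) == 0) {
+ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL)
+ (void) printf("pool: %s\n", nvpair_name(pair));
+ nvlist_free(nvl);
+ }
+ free(buf);
+ return (0);
+}
+#endif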
+
+static uint64_t spa_config_generation = 1;
+
+/*
+ * This can be overridden in userland to preserve an alternate namespace for
+ * userland pools when doing testing.
+ */
+const char *spa_config_dir = ZPOOL_CACHE_DIR;
+
+/*
+ * Called when the module is first loaded, this routine loads the configuration
+ * file into the SPA namespace. It does not actually open or load the pools; it
+ * only populates the namespace.
+ */
+void
+spa_config_load(void)
+{
+ void *buf = NULL;
+ nvlist_t *nvlist, *child;
+ nvpair_t *nvpair;
+ spa_t *spa;
+ char pathname[128];
+ struct _buf *file;
+ uint64_t fsize;
+
+ root_mount_wait();
+
+ /*
+ * Open the configuration file.
+ */
+ (void) snprintf(pathname, sizeof (pathname), "%s/%s",
+ spa_config_dir, ZPOOL_CACHE_FILE);
+
+ file = kobj_open_file(pathname);
+ if (file == (struct _buf *)-1)
+ return;
+
+ if (kobj_get_filesize(file, &fsize) != 0)
+ goto out;
+
+ buf = kmem_alloc(fsize, KM_SLEEP);
+
+ /*
+ * Read the nvlist from the file.
+ */
+ if (kobj_read_file(file, buf, fsize, 0) < 0)
+ goto out;
+
+ /*
+ * Unpack the nvlist.
+ */
+ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
+ goto out;
+
+ /*
+ * Iterate over all elements in the nvlist, creating a new spa_t for
+ * each one with the specified configuration.
+ */
+ mutex_enter(&spa_namespace_lock);
+ nvpair = NULL;
+ while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
+
+ if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
+ continue;
+
+ VERIFY(nvpair_value_nvlist(nvpair, &child) == 0);
+
+ if (spa_lookup(nvpair_name(nvpair)) != NULL)
+ continue;
+ spa = spa_add(nvpair_name(nvpair), NULL);
+
+ /*
+ * We blindly duplicate the configuration here. If it's
+ * invalid, we will catch it when the pool is first opened.
+ */
+ VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(nvlist);
+
+out:
+ if (buf != NULL)
+ kmem_free(buf, fsize);
+
+ kobj_close_file(file);
+}
+
+/*
+ * Synchronize all pools to disk. This must be called with the namespace lock
+ * held.
+ */
+void
+spa_config_sync(void)
+{
+ spa_t *spa = NULL;
+ nvlist_t *config;
+ size_t buflen;
+ char *buf;
+ vnode_t *vp;
+ int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
+ char pathname[128];
+ char pathname2[128];
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ /*
+ * Add all known pools to the configuration list, ignoring those with
+ * alternate root paths.
+ */
+ spa = NULL;
+ while ((spa = spa_next(spa)) != NULL) {
+ mutex_enter(&spa->spa_config_cache_lock);
+ if (spa->spa_config && spa->spa_name && spa->spa_root == NULL)
+ VERIFY(nvlist_add_nvlist(config, spa->spa_name,
+ spa->spa_config) == 0);
+ mutex_exit(&spa->spa_config_cache_lock);
+ }
+
+ /*
+ * Pack the configuration into a buffer.
+ */
+ VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0);
+
+ buf = kmem_alloc(buflen, KM_SLEEP);
+
+ VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
+
+ /*
+ * Write the configuration to disk. We need to do the traditional
+ * 'write to temporary file, sync, move over original' to make sure we
+ * always have a consistent view of the data.
+ */
+ (void) snprintf(pathname, sizeof (pathname), "%s/%s", spa_config_dir,
+ ZPOOL_CACHE_TMP);
+
+ if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0)
+ goto out;
+
+ if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
+ 0, RLIM64_INFINITY, kcred, NULL) == 0 &&
+ VOP_FSYNC(vp, FSYNC, kcred) == 0) {
+ (void) snprintf(pathname2, sizeof (pathname2), "%s/%s",
+ spa_config_dir, ZPOOL_CACHE_FILE);
+ (void) vn_rename(pathname, pathname2, UIO_SYSSPACE);
+ }
+
+ (void) VOP_CLOSE(vp, oflags, 1, 0, kcred);
+ VN_RELE(vp);
+
+out:
+ (void) vn_remove(pathname, UIO_SYSSPACE, RMFILE);
+ spa_config_generation++;
+
+ kmem_free(buf, buflen);
+ nvlist_free(config);
+}
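+
+/*
+ * For reference only (not compiled here): the same "write to a temporary
+ * file, sync, rename over the original" pattern that spa_config_sync()
+ * uses above, sketched with plain POSIX calls.  The helper name, paths and
+ * error handling are illustrative assumptions, not part of the import.
+ */
+#if 0
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+static int
+write_file_atomically(const char *path, const char *tmppath,
+ const void *buf, size_t len)
+{
+ int fd = open(tmppath, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+
+ if (fd < 0)
+ return (-1);
+ /* Write the new contents and force them to stable storage. */
+ if (write(fd, buf, len) != (ssize_t)len || fsync(fd) != 0) {
+ (void) close(fd);
+ (void) unlink(tmppath);
+ return (-1);
+ }
+ (void) close(fd);
+ /* rename(2) replaces the old file in one atomic step. */
+ return (rename(tmppath, path));
+}
+#endif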
+
+/*
+ * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
+ * and we don't want to allow the local zone to see all the pools anyway.
+ * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
+ * information for all pools visible within the zone.
+ */
+nvlist_t *
+spa_all_configs(uint64_t *generation)
+{
+ nvlist_t *pools;
+ spa_t *spa;
+
+ if (*generation == spa_config_generation)
+ return (NULL);
+
+ VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (INGLOBALZONE(curproc) ||
+ zone_dataset_visible(spa_name(spa), NULL)) {
+ mutex_enter(&spa->spa_config_cache_lock);
+ VERIFY(nvlist_add_nvlist(pools, spa_name(spa),
+ spa->spa_config) == 0);
+ mutex_exit(&spa->spa_config_cache_lock);
+ }
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ *generation = spa_config_generation;
+
+ return (pools);
+}
+
+void
+spa_config_set(spa_t *spa, nvlist_t *config)
+{
+ mutex_enter(&spa->spa_config_cache_lock);
+ if (spa->spa_config != NULL)
+ nvlist_free(spa->spa_config);
+ spa->spa_config = config;
+ mutex_exit(&spa->spa_config_cache_lock);
+}
+
+/*
+ * Generate the pool's configuration based on the current in-core state.
+ * We infer whether to generate a complete config or just one top-level config
+ * based on whether vd is the root vdev.
+ */
+nvlist_t *
+spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
+{
+ nvlist_t *config, *nvroot;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, RW_READER));
+
+ if (vd == NULL)
+ vd = rvd;
+
+ /*
+ * If txg is -1, report the current value of spa->spa_config_txg.
+ */
+ if (txg == -1ULL)
+ txg = spa->spa_config_txg;
+
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ spa_name(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ spa_state(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ txg) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ spa_guid(spa)) == 0);
+
+ if (vd != rvd) {
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+ vd->vdev_top->vdev_guid) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ if (vd->vdev_isspare)
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE,
+ 1ULL) == 0);
+ vd = vd->vdev_top; /* label contains top config */
+ }
+
+ nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE);
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+ nvlist_free(nvroot);
+
+ return (config);
+}
+
+/*
+ * Update all disk labels, generate a fresh config based on the current
+ * in-core state, and sync the global config cache.
+ */
+void
+spa_config_update(spa_t *spa, int what)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t txg;
+ int c;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ txg = spa_last_synced_txg(spa) + 1;
+ if (what == SPA_CONFIG_UPDATE_POOL) {
+ vdev_config_dirty(rvd);
+ } else {
+ /*
+ * If we have top-level vdevs that were added but have
+ * not yet been prepared for allocation, do that now.
+ * (It's safe now because the config cache is up to date,
+ * so it will be able to translate the new DVAs.)
+ * See comments in spa_vdev_add() for full details.
+ */
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ if (tvd->vdev_ms_array == 0) {
+ vdev_init(tvd, txg);
+ vdev_config_dirty(tvd);
+ }
+ }
+ }
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * Wait for the mosconfig to be regenerated and synced.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ /*
+ * Update the global config cache to reflect the new mosconfig.
+ */
+ spa_config_sync();
+
+ if (what == SPA_CONFIG_UPDATE_POOL)
+ spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
new file mode 100644
index 000000000000..c52acaf30801
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
@@ -0,0 +1,440 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Routines to manage the on-disk persistent error log.
+ *
+ * Each pool stores a log of all logical data errors seen during normal
+ * operation. This is actually the union of two distinct logs: the last log,
+ * and the current log. All errors seen are logged to the current log. When a
+ * scrub completes, the current log becomes the last log, the last log is thrown
+ * out, and the current log is reinitialized. This way, if an error is somehow
+ * corrected, a new scrub will show that that it no longer exists, and will be
+ * deleted from the log when the scrub completes.
+ *
+ * The log is stored using a ZAP object whose key is a string form of the
+ * zbookmark tuple (objset, object, level, blkid), and whose contents is an
+ * optional 'objset:object' human-readable string describing the data. When an
+ * error is first logged, this string will be empty, indicating that no name is
+ * known. This prevents us from having to issue a potentially large amount of
+ * I/O to discover the object name during an error path. Instead, we do the
+ * calculation when the data is requested, storing the result so future queries
+ * will be faster.
+ *
+ * This log is then shipped into an nvlist where the key is the dataset name and
+ * the value is the object name. Userland is then responsible for uniquifying
+ * this list and displaying it to the user.
+ */
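+
+/*
+ * As a concrete illustration of the key format described above (the numbers
+ * are invented for the example): a bookmark with objset 21, object 421,
+ * level 0 and blkid 48 is stored under the lowercase-hex ZAP key
+ * "15:1a5:0:30".  The fragment below, mirroring what bookmark_to_name()
+ * does further down, would produce exactly that string.
+ */
+#if 0
+static void
+example_errlog_key(char *key, size_t len)
+{
+ /* Same formatting as bookmark_to_name(); yields "15:1a5:0:30". */
+ (void) snprintf(key, len, "%llx:%llx:%llx:%llx",
+ 21ULL, 421ULL, 0ULL, 48ULL);
+}
+#endif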
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+
+/*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
+ */
+#ifdef _KERNEL
+static uint64_t
+_strtonum(char *str, char **nptr)
+{
+ uint64_t val = 0;
+ char c;
+ int digit;
+
+ while ((c = *str) != '\0') {
+ if (c >= '0' && c <= '9')
+ digit = c - '0';
+ else if (c >= 'a' && c <= 'f')
+ digit = 10 + c - 'a';
+ else
+ break;
+
+ val *= 16;
+ val += digit;
+
+ str++;
+ }
+
+ *nptr = str;
+
+ return (val);
+}
+#endif
+
+/*
+ * Convert a bookmark to a string.
+ */
+static void
+bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
+{
+ (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+ (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
+}
+
+/*
+ * Convert a string to a bookmark
+ */
+#ifdef _KERNEL
+static void
+name_to_bookmark(char *buf, zbookmark_t *zb)
+{
+ zb->zb_objset = _strtonum(buf, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_object = _strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_level = (int)_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_blkid = _strtonum(buf + 1, &buf);
+ ASSERT(*buf == '\0');
+}
+#endif
+
+/*
+ * Log an uncorrectable error to the persistent error log. We add it to the
+ * spa's list of pending errors. The changes are actually synced out to disk
+ * during spa_errlog_sync().
+ */
+void
+spa_log_error(spa_t *spa, zio_t *zio)
+{
+ zbookmark_t *zb = &zio->io_logical->io_bookmark;
+ spa_error_entry_t search;
+ spa_error_entry_t *new;
+ avl_tree_t *tree;
+ avl_index_t where;
+
+ /*
+ * If we are trying to import a pool, ignore any errors, as we won't be
+ * writing to the pool any time soon.
+ */
+ if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ return;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * If we have had a request to rotate the log, log it to the next list
+ * instead of the current one.
+ */
+ if (spa->spa_scrub_active || spa->spa_scrub_finished)
+ tree = &spa->spa_errlist_scrub;
+ else
+ tree = &spa->spa_errlist_last;
+
+ search.se_bookmark = *zb;
+ if (avl_find(tree, &search, &where) != NULL) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+ new->se_bookmark = *zb;
+ avl_insert(tree, new, where);
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Return the number of errors currently in the error log. This is actually the
+ * sum of both the last log and the current log, since we don't know the union
+ * of these logs until we reach userland.
+ */
+uint64_t
+spa_get_errlog_size(spa_t *spa)
+{
+ uint64_t total = 0, count;
+
+ mutex_enter(&spa->spa_errlog_lock);
+ if (spa->spa_errlog_scrub != 0 &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+ &count) == 0)
+ total += count;
+
+ if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+ &count) == 0)
+ total += count;
+ mutex_exit(&spa->spa_errlog_lock);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ total += avl_numnodes(&spa->spa_errlist_last);
+ total += avl_numnodes(&spa->spa_errlist_scrub);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ return (total);
+}
+
+#ifdef _KERNEL
+static int
+process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zbookmark_t zb;
+
+ if (obj == 0)
+ return (0);
+
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+
+ if (*count == 0) {
+ zap_cursor_fini(&zc);
+ return (ENOMEM);
+ }
+
+ name_to_bookmark(za.za_name, &zb);
+
+ if (copyout(&zb, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_t),
+ sizeof (zbookmark_t)) != 0)
+ return (EFAULT);
+
+ *count -= 1;
+ }
+
+ zap_cursor_fini(&zc);
+
+ return (0);
+}
+
+static int
+process_error_list(avl_tree_t *list, void *addr, size_t *count)
+{
+ spa_error_entry_t *se;
+
+ for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+
+ if (*count == 0)
+ return (ENOMEM);
+
+ if (copyout(&se->se_bookmark, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_t),
+ sizeof (zbookmark_t)) != 0)
+ return (EFAULT);
+
+ *count -= 1;
+ }
+
+ return (0);
+}
+#endif
+
+/*
+ * Copy all known errors to userland as an array of bookmarks. This is
+ * actually a union of the on-disk last log and current log, as well as any
+ * pending error requests.
+ *
+ * Because the act of reading the on-disk log could cause errors to be
+ * generated, we have two separate locks: one for the error log and one for the
+ * in-core error lists. We only need the error list lock to log an error, so
+ * we grab the error log lock while we read the on-disk logs, and only pick up
+ * the error list lock when we are finished.
+ */
+int
+spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+{
+ int ret = 0;
+
+#ifdef _KERNEL
+ mutex_enter(&spa->spa_errlog_lock);
+
+ ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
+
+ if (!ret && !spa->spa_scrub_finished)
+ ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
+ count);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+ count);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_last, uaddr,
+ count);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ mutex_exit(&spa->spa_errlog_lock);
+#endif
+
+ return (ret);
+}
+
+/*
+ * Called when a scrub completes. This simply sets a bit that tells us which
+ * AVL tree to add new errors to. spa_errlog_sync() is responsible for
+ * actually syncing the changes to the underlying objects.
+ */
+void
+spa_errlog_rotate(spa_t *spa)
+{
+ mutex_enter(&spa->spa_errlist_lock);
+
+ ASSERT(!spa->spa_scrub_finished);
+ spa->spa_scrub_finished = B_TRUE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Discard any pending errors from the spa_t. Called when unloading a faulted
+ * pool, as the errors encountered during the open cannot be synced to disk.
+ */
+void
+spa_errlog_drain(spa_t *spa)
+{
+ spa_error_entry_t *se;
+ void *cookie;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Process a list of errors into the current on-disk log.
+ */
+static void
+sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
+{
+ spa_error_entry_t *se;
+ char buf[64];
+ void *cookie;
+
+ if (avl_numnodes(t) != 0) {
+ /* create log if necessary */
+ if (*obj == 0)
+ *obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE,
+ 0, tx);
+
+ /* add errors to the current log */
+ for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
+ char *name = se->se_name ? se->se_name : "";
+
+ bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
+
+ (void) zap_update(spa->spa_meta_objset,
+ *obj, buf, 1, strlen(name) + 1, name, tx);
+ }
+
+ /* purge the error list */
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ }
+}
+
+/*
+ * Sync the error log out to disk. This is a little tricky because the act of
+ * writing the error log requires the spa_errlist_lock. So, we need to lock the
+ * error lists, take a copy of the lists, and then reinitialize them. Then, we
+ * drop the error list lock and take the error log lock, at which point we
+ * do the errlog processing. Then, if we encounter an I/O error during this
+ * process, we can successfully add the error to the list. Note that this will
+ * result in the perpetual recycling of errors, but it is an unlikely situation
+ * and not a performance critical operation.
+ */
+void
+spa_errlog_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ avl_tree_t scrub, last;
+ int scrub_finished;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Bail out early under normal circumstances.
+ */
+ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
+ avl_numnodes(&spa->spa_errlist_last) == 0 &&
+ !spa->spa_scrub_finished) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ spa_get_errlists(spa, &last, &scrub);
+ scrub_finished = spa->spa_scrub_finished;
+ spa->spa_scrub_finished = B_FALSE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+ mutex_enter(&spa->spa_errlog_lock);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ /*
+ * Sync out the current list of errors.
+ */
+ sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
+
+ /*
+ * Rotate the log if necessary.
+ */
+ if (scrub_finished) {
+ if (spa->spa_errlog_last != 0)
+ VERIFY(dmu_object_free(spa->spa_meta_objset,
+ spa->spa_errlog_last, tx) == 0);
+ spa->spa_errlog_last = spa->spa_errlog_scrub;
+ spa->spa_errlog_scrub = 0;
+
+ sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
+ }
+
+ /*
+ * Sync out any pending scrub errors.
+ */
+ sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
+
+ /*
+ * Update the MOS to reflect the new values.
+ */
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
+ &spa->spa_errlog_last, tx);
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
+ &spa->spa_errlog_scrub, tx);
+
+ dmu_tx_commit(tx);
+
+ mutex_exit(&spa->spa_errlog_lock);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
new file mode 100644
index 000000000000..66428013a784
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
@@ -0,0 +1,354 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/dsl_synctask.h>
+
+/*
+ * Routines to manage the on-disk history log.
+ *
+ * The history log is stored as a dmu object containing
+ * <packed record length, record nvlist> tuples.
+ *
+ * Where "record nvlist" is an nvlist containing uint64_ts and strings, and
+ * "packed record length" is the packed length of the "record nvlist" stored
+ * as a little endian uint64_t.
+ *
+ * The log is implemented as a ring buffer, though the original creation
+ * of the pool ('zpool create') is never overwritten.
+ *
+ * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer
+ * of 'spa_history' stores the offsets for logging/retrieving history as
+ * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of
+ * where the 'zpool create' record is stored. This allows us to never
+ * overwrite the original creation of the pool. 'sh_phys_max_off' is the
+ * physical ending offset in bytes of the log. This tells you the length of
+ * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record
+ * is added, 'sh_eof' is incremented by the size of the record.
+ * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes).
+ * This is where the consumer should start reading from after reading in
+ * the 'zpool create' portion of the log.
+ *
+ * 'sh_records_lost' keeps track of how many records have been overwritten
+ * and permanently lost.
+ */
+
+typedef enum history_log_type {
+ LOG_CMD_CREATE,
+ LOG_CMD_NO_CREATE
+} history_log_type_t;
+
+typedef struct history_arg {
+ const char *ha_history_str;
+ history_log_type_t ha_log_type;
+} history_arg_t;
+
+/* convert a logical offset to physical */
+static uint64_t
+spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
+{
+ uint64_t phys_len;
+
+ phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
+ return ((log_off - shpp->sh_pool_create_len) % phys_len
+ + shpp->sh_pool_create_len);
+}
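+
+/*
+ * A worked example of the mapping above, with invented numbers: if
+ * sh_pool_create_len is 512 and sh_phys_max_off is 131072 (128KB), then
+ * phys_len is 130560 and a logical offset of 200000 maps to
+ * ((200000 - 512) % 130560) + 512 = 68928 + 512 = 69440; records wrap
+ * around the physical end of the buffer but never back into the
+ * protected 'zpool create' region at the start.
+ */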
+
+void
+spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
+{
+ dmu_buf_t *dbp;
+ spa_history_phys_t *shpp;
+ objset_t *mos = spa->spa_meta_objset;
+
+ ASSERT(spa->spa_history == 0);
+ spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
+ SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+ sizeof (spa_history_phys_t), tx);
+
+ VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_HISTORY, sizeof (uint64_t), 1,
+ &spa->spa_history, tx) == 0);
+
+ VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+ ASSERT(dbp->db_size >= sizeof (spa_history_phys_t));
+
+ shpp = dbp->db_data;
+ dmu_buf_will_dirty(dbp, tx);
+
+ /*
+ * Figure out maximum size of history log. We set it at
+ * 1% of pool size, with a max of 32MB and min of 128KB.
+ */
+ shpp->sh_phys_max_off = spa_get_dspace(spa) / 100;
+ shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20);
+ shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
+
+ dmu_buf_rele(dbp, FTAG);
+}
+
+/*
+ * Change 'sh_bof' to the beginning of the next record.
+ */
+static int
+spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t firstread, reclen, phys_bof;
+ char buf[sizeof (reclen)];
+ int err;
+
+ phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
+ firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
+
+ if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
+ buf)) != 0)
+ return (err);
+ if (firstread != sizeof (reclen)) {
+ if ((err = dmu_read(mos, spa->spa_history,
+ shpp->sh_pool_create_len, sizeof (reclen) - firstread,
+ buf + firstread)) != 0)
+ return (err);
+ }
+
+ reclen = LE_64(*((uint64_t *)buf));
+ shpp->sh_bof += reclen + sizeof (reclen);
+ shpp->sh_records_lost++;
+ return (0);
+}
+
+static int
+spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
+ dmu_tx_t *tx)
+{
+ uint64_t firstwrite, phys_eof;
+ objset_t *mos = spa->spa_meta_objset;
+ int err;
+
+ ASSERT(MUTEX_HELD(&spa->spa_history_lock));
+
+ /* see if we need to reset logical BOF */
+ while (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
+ (shpp->sh_eof - shpp->sh_bof) <= len) {
+ if ((err = spa_history_advance_bof(spa, shpp)) != 0)
+ return (err);
+ }
+
+ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+ firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof);
+ shpp->sh_eof += len;
+ dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx);
+
+ len -= firstwrite;
+ if (len > 0) {
+ /* write out the rest at the beginning of physical file */
+ dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
+ len, (char *)buf + firstwrite, tx);
+ }
+
+ return (0);
+}
+
+/*
+ * Write out a history event.
+ */
+void
+spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ spa_t *spa = arg1;
+ history_arg_t *hap = arg2;
+ const char *history_str = hap->ha_history_str;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *dbp;
+ spa_history_phys_t *shpp;
+ size_t reclen;
+ uint64_t le_len;
+ nvlist_t *nvrecord;
+ char *record_packed = NULL;
+ int ret;
+
+ if (history_str == NULL)
+ return;
+
+ /*
+ * If we have an older pool that doesn't have a command
+ * history object, create it now.
+ */
+ mutex_enter(&spa->spa_history_lock);
+ if (!spa->spa_history)
+ spa_history_create_obj(spa, tx);
+ mutex_exit(&spa->spa_history_lock);
+
+ /*
+ * Get the offset of where we need to write via the bonus buffer.
+ * Update the offset when the write completes.
+ */
+ VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+ shpp = dbp->db_data;
+
+ dmu_buf_will_dirty(dbp, tx);
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbp, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
+ }
+#endif
+
+ /* construct a nvlist of the current time and cmd string */
+ VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME,
+ gethrestime_sec()) == 0);
+ VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, history_str) == 0);
+ VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen,
+ NV_ENCODE_XDR, KM_SLEEP) == 0);
+
+ mutex_enter(&spa->spa_history_lock);
+ if (hap->ha_log_type == LOG_CMD_CREATE)
+ VERIFY(shpp->sh_eof == shpp->sh_pool_create_len);
+
+ /* write out the packed length as little endian */
+ le_len = LE_64((uint64_t)reclen);
+ ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
+ if (!ret)
+ ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
+
+ if (!ret && hap->ha_log_type == LOG_CMD_CREATE) {
+ shpp->sh_pool_create_len += sizeof (le_len) + reclen;
+ shpp->sh_bof = shpp->sh_pool_create_len;
+ }
+
+ mutex_exit(&spa->spa_history_lock);
+ nvlist_free(nvrecord);
+ kmem_free(record_packed, reclen);
+ dmu_buf_rele(dbp, FTAG);
+}
+
+/*
+ * Write out a history event.
+ */
+int
+spa_history_log(spa_t *spa, const char *history_str, uint64_t pool_create)
+{
+ history_arg_t ha;
+
+ ha.ha_history_str = history_str;
+ ha.ha_log_type = pool_create ? LOG_CMD_CREATE : LOG_CMD_NO_CREATE;
+ return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync,
+ spa, &ha, 0));
+}
+
+/*
+ * Read out the command history.
+ */
+int
+spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *dbp;
+ uint64_t read_len, phys_read_off, phys_eof;
+ uint64_t leftover = 0;
+ spa_history_phys_t *shpp;
+ int err;
+
+ /*
+ * If the command history doesn't exist (older pool),
+ * that's ok, just return ENOENT.
+ */
+ if (!spa->spa_history)
+ return (ENOENT);
+
+ if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
+ return (err);
+ shpp = dbp->db_data;
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbp, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
+ }
+#endif
+
+ mutex_enter(&spa->spa_history_lock);
+ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+
+ if (*offp < shpp->sh_pool_create_len) {
+ /* read in just the zpool create history */
+ phys_read_off = *offp;
+ read_len = MIN(*len, shpp->sh_pool_create_len -
+ phys_read_off);
+ } else {
+ /*
+ * Need to reset passed in offset to BOF if the passed in
+ * offset has since been overwritten.
+ */
+ *offp = MAX(*offp, shpp->sh_bof);
+ phys_read_off = spa_history_log_to_phys(*offp, shpp);
+
+ /*
+ * Read up to the minimum of what the user passed down or
+ * the EOF (physical or logical). If we hit physical EOF,
+ * use 'leftover' to read from the physical BOF.
+ */
+ if (phys_read_off <= phys_eof) {
+ read_len = MIN(*len, phys_eof - phys_read_off);
+ } else {
+ read_len = MIN(*len,
+ shpp->sh_phys_max_off - phys_read_off);
+ if (phys_read_off + *len > shpp->sh_phys_max_off) {
+ leftover = MIN(*len - read_len,
+ phys_eof - shpp->sh_pool_create_len);
+ }
+ }
+ }
+
+ /* offset for consumer to use next */
+ *offp += read_len + leftover;
+
+ /* tell the consumer how much you actually read */
+ *len = read_len + leftover;
+
+ if (read_len == 0) {
+ mutex_exit(&spa->spa_history_lock);
+ dmu_buf_rele(dbp, FTAG);
+ return (0);
+ }
+
+ err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf);
+ if (leftover && err == 0) {
+ err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
+ leftover, buf + read_len);
+ }
+ mutex_exit(&spa->spa_history_lock);
+
+ dmu_buf_rele(dbp, FTAG);
+ return (err);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
new file mode 100644
index 000000000000..1de1e5a20e78
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -0,0 +1,1126 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * SPA locking
+ *
+ * There are four basic locks for managing spa_t structures:
+ *
+ * spa_namespace_lock (global mutex)
+ *
+ * This lock must be acquired to do any of the following:
+ *
+ * - Lookup a spa_t by name
+ * - Add or remove a spa_t from the namespace
+ * - Increase spa_refcount from non-zero
+ * - Check if spa_refcount is zero
+ * - Rename a spa_t
+ * - add/remove/attach/detach devices
+ * - Held for the duration of create/destroy/import/export
+ *
+ * It does not need to handle recursion. A create or destroy may
+ * reference objects (files or zvols) in other pools, but by
+ * definition they must have an existing reference, and will never need
+ * to lookup a spa_t by name.
+ *
+ * spa_refcount (per-spa refcount_t protected by mutex)
+ *
+ * This reference count keeps track of any active users of the spa_t. The
+ * spa_t cannot be destroyed or freed while this is non-zero. Internally,
+ * the refcount is never really 'zero' - opening a pool implicitly keeps
+ * some references in the DMU. Internally we check against SPA_MINREF, but
+ * present the image of a zero/non-zero value to consumers.
+ *
+ * spa_config_lock (per-spa crazy rwlock)
+ *
+ * This SPA special is a recursive rwlock, capable of being acquired from
+ * asynchronous threads. It protects the spa_t from config changes,
+ * and must be held in the following circumstances:
+ *
+ * - RW_READER to perform I/O to the spa
+ * - RW_WRITER to change the vdev config
+ *
+ * spa_config_cache_lock (per-spa mutex)
+ *
+ * This mutex prevents the spa_config nvlist from being updated. No
+ * other locks are required to obtain this lock, although implicitly you
+ * must have the namespace lock or non-zero refcount to have any kind
+ * of spa_t pointer at all.
+ *
+ * The locking order is fairly straightforward:
+ *
+ * spa_namespace_lock -> spa_refcount
+ *
+ * The namespace lock must be acquired to increase the refcount from 0
+ * or to check if it is zero.
+ *
+ * spa_refcount -> spa_config_lock
+ *
+ * There must be at least one valid reference on the spa_t to acquire
+ * the config lock.
+ *
+ * spa_namespace_lock -> spa_config_lock
+ *
+ * The namespace lock must always be taken before the config lock.
+ *
+ *
+ * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
+ * are globally visible.
+ *
+ * The namespace is manipulated using the following functions, all of which require
+ * the spa_namespace_lock to be held.
+ *
+ * spa_lookup() Lookup a spa_t by name.
+ *
+ * spa_add() Create a new spa_t in the namespace.
+ *
+ * spa_remove() Remove a spa_t from the namespace. This also
+ * frees up any memory associated with the spa_t.
+ *
+ * spa_next() Returns the next spa_t in the system, or the
+ * first if NULL is passed.
+ *
+ * spa_evict_all() Shutdown and remove all spa_t structures in
+ * the system.
+ *
+ * spa_guid_exists() Determine whether a pool/device guid exists.
+ *
+ * The spa_refcount is manipulated using the following functions:
+ *
+ * spa_open_ref() Adds a reference to the given spa_t. Must be
+ * called with spa_namespace_lock held if the
+ * refcount is currently zero.
+ *
+ * spa_close() Remove a reference from the spa_t. This will
+ * not free the spa_t or remove it from the
+ * namespace. No locking is required.
+ *
+ * spa_refcount_zero() Returns true if the refcount is currently
+ * zero. Must be called with spa_namespace_lock
+ * held.
+ *
+ * The spa_config_lock is manipulated using the following functions:
+ *
+ * spa_config_enter() Acquire the config lock as RW_READER or
+ * RW_WRITER. At least one reference on the spa_t
+ * must exist.
+ *
+ * spa_config_exit() Release the config lock.
+ *
+ * spa_config_held() Returns true if the config lock is currently
+ * held in the given state.
+ *
+ * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
+ *
+ * spa_vdev_enter() Acquire the namespace lock and the config lock
+ * for writing.
+ *
+ * spa_vdev_exit() Release the config lock, wait for all I/O
+ * to complete, sync the updated configs to the
+ * cache, and release the namespace lock.
+ *
+ * The spa_name() function also requires either the spa_namespace_lock
+ * or the spa_config_lock, as both are needed to do a rename. spa_rename() is
+ * also implemented within this file since it requires manipulation of the
+ * namespace.
+ */
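+
+/*
+ * Illustrative sketch only (not compiled): the canonical ordering described
+ * above (namespace lock first, then the config lock) as a hypothetical
+ * caller would follow it while changing the vdev configuration.
+ */
+#if 0
+static void
+example_lock_order(spa_t *spa)
+{
+ mutex_enter(&spa_namespace_lock); /* 1: namespace lock */
+ spa_config_enter(spa, RW_WRITER, FTAG); /* 2: config lock, as writer */
+
+ vdev_config_dirty(spa->spa_root_vdev); /* ... change the config ... */
+
+ spa_config_exit(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+}
+#endif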
+
+static avl_tree_t spa_namespace_avl;
+kmutex_t spa_namespace_lock;
+static kcondvar_t spa_namespace_cv;
+static int spa_active_count;
+int spa_max_replication_override = SPA_DVAS_PER_BP;
+
+static kmutex_t spa_spare_lock;
+static avl_tree_t spa_spare_avl;
+
+kmem_cache_t *spa_buffer_pool;
+int spa_mode;
+
+#ifdef ZFS_DEBUG
+int zfs_flags = ~0;
+#else
+int zfs_flags = 0;
+#endif
+
+/*
+ * zfs_recover can be set to nonzero to attempt to recover from
+ * otherwise-fatal errors, typically caused by on-disk corruption. When
+ * set, calls to zfs_panic_recover() will turn into warning messages.
+ */
+int zfs_recover = 0;
+
+#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */
+
+/*
+ * ==========================================================================
+ * SPA namespace functions
+ * ==========================================================================
+ */
+
+/*
+ * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
+ * Returns NULL if no matching spa_t is found.
+ */
+spa_t *
+spa_lookup(const char *name)
+{
+ spa_t search, *spa;
+ avl_index_t where;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ search.spa_name = (char *)name;
+ spa = avl_find(&spa_namespace_avl, &search, &where);
+
+ return (spa);
+}
+
+/*
+ * Create an uninitialized spa_t with the given name. Requires
+ * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
+ * exist by calling spa_lookup() first.
+ */
+spa_t *
+spa_add(const char *name, const char *altroot)
+{
+ spa_t *spa;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
+
+ spa->spa_name = spa_strdup(name);
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+ spa->spa_freeze_txg = UINT64_MAX;
+ spa->spa_final_txg = UINT64_MAX;
+
+ mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ cv_init(&spa->spa_scrub_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+
+ refcount_create(&spa->spa_refcount);
+ refcount_create(&spa->spa_config_lock.scl_count);
+
+ avl_add(&spa_namespace_avl, spa);
+
+ /*
+ * Set the alternate root, if there is one.
+ */
+ if (altroot) {
+ spa->spa_root = spa_strdup(altroot);
+ spa_active_count++;
+ }
+
+ return (spa);
+}
+
+/*
+ * Removes a spa_t from the namespace, freeing up any memory used. Requires
+ * spa_namespace_lock. This is called only after the spa_t has been closed and
+ * deactivated.
+ */
+void
+spa_remove(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ ASSERT(spa->spa_scrub_thread == NULL);
+
+ avl_remove(&spa_namespace_avl, spa);
+ cv_broadcast(&spa_namespace_cv);
+
+ if (spa->spa_root) {
+ spa_strfree(spa->spa_root);
+ spa_active_count--;
+ }
+
+ if (spa->spa_name)
+ spa_strfree(spa->spa_name);
+
+ spa_config_set(spa, NULL);
+
+ refcount_destroy(&spa->spa_refcount);
+ refcount_destroy(&spa->spa_config_lock.scl_count);
+
+ cv_destroy(&spa->spa_async_cv);
+ cv_destroy(&spa->spa_scrub_io_cv);
+ cv_destroy(&spa->spa_scrub_cv);
+
+ mutex_destroy(&spa->spa_scrub_lock);
+ mutex_destroy(&spa->spa_async_lock);
+ mutex_destroy(&spa->spa_config_cache_lock);
+
+ kmem_free(spa, sizeof (spa_t));
+}
+
+/*
+ * Given a pool, return the next pool in the namespace, or NULL if there is
+ * none. If 'prev' is NULL, return the first pool.
+ */
+spa_t *
+spa_next(spa_t *prev)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (prev)
+ return (AVL_NEXT(&spa_namespace_avl, prev));
+ else
+ return (avl_first(&spa_namespace_avl));
+}
+
+/*
+ * ==========================================================================
+ * SPA refcount functions
+ * ==========================================================================
+ */
+
+/*
+ * Add a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_open_ref(spa_t *spa, void *tag)
+{
+ ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
+ MUTEX_HELD(&spa_namespace_lock));
+
+ (void) refcount_add(&spa->spa_refcount, tag);
+}
+
+/*
+ * Remove a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_close(spa_t *spa, void *tag)
+{
+ ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
+ MUTEX_HELD(&spa_namespace_lock));
+
+ (void) refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
+ * Check to see if the spa refcount is zero. Must be called with
+ * spa_namespace_lock held. We really compare against SPA_MINREF, which is the
+ * number of references acquired when opening a pool.
+ */
+boolean_t
+spa_refcount_zero(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ return (refcount_count(&spa->spa_refcount) == SPA_MINREF);
+}
+
+/*
+ * ==========================================================================
+ * SPA spare tracking
+ * ==========================================================================
+ */
+
+/*
+ * Spares are tracked globally due to the following constraints:
+ *
+ * - A spare may be part of multiple pools.
+ * - A spare may be added to a pool even if it's actively in use within
+ * another pool.
+ * - A spare in use in any pool can only be the source of a replacement if
+ * the target is a spare in the same pool.
+ *
+ * We keep track of all spares on the system through the use of a reference
+ * counted AVL tree. When a vdev is added as a spare, or used as a replacement
+ * spare, then we bump the reference count in the AVL tree. In addition, we set
+ * the 'vdev_isspare' member to indicate that the device is a spare (active or
+ * inactive). When a spare is made active (used to replace a device in the
+ * pool), we also keep track of which pool it's been made a part of.
+ *
+ * The 'spa_spare_lock' protects the AVL tree. These functions are normally
+ * called under the spa_namespace lock as part of vdev reconfiguration. The
+ * separate spare lock exists for the status query path, which does not need to
+ * be completely consistent with respect to other vdev configuration changes.
+ */
+
+typedef struct spa_spare {
+ uint64_t spare_guid;
+ uint64_t spare_pool;
+ avl_node_t spare_avl;
+ int spare_count;
+} spa_spare_t;
+
+static int
+spa_spare_compare(const void *a, const void *b)
+{
+ const spa_spare_t *sa = a;
+ const spa_spare_t *sb = b;
+
+ if (sa->spare_guid < sb->spare_guid)
+ return (-1);
+ else if (sa->spare_guid > sb->spare_guid)
+ return (1);
+ else
+ return (0);
+}
+
+void
+spa_spare_add(vdev_t *vd)
+{
+ avl_index_t where;
+ spa_spare_t search;
+ spa_spare_t *spare;
+
+ mutex_enter(&spa_spare_lock);
+ ASSERT(!vd->vdev_isspare);
+
+ search.spare_guid = vd->vdev_guid;
+ if ((spare = avl_find(&spa_spare_avl, &search, &where)) != NULL) {
+ spare->spare_count++;
+ } else {
+ spare = kmem_zalloc(sizeof (spa_spare_t), KM_SLEEP);
+ spare->spare_guid = vd->vdev_guid;
+ spare->spare_count = 1;
+ avl_insert(&spa_spare_avl, spare, where);
+ }
+ vd->vdev_isspare = B_TRUE;
+
+ mutex_exit(&spa_spare_lock);
+}
+
+void
+spa_spare_remove(vdev_t *vd)
+{
+ spa_spare_t search;
+ spa_spare_t *spare;
+ avl_index_t where;
+
+ mutex_enter(&spa_spare_lock);
+
+ search.spare_guid = vd->vdev_guid;
+ spare = avl_find(&spa_spare_avl, &search, &where);
+
+ ASSERT(vd->vdev_isspare);
+ ASSERT(spare != NULL);
+
+ if (--spare->spare_count == 0) {
+ avl_remove(&spa_spare_avl, spare);
+ kmem_free(spare, sizeof (spa_spare_t));
+ } else if (spare->spare_pool == spa_guid(vd->vdev_spa)) {
+ spare->spare_pool = 0ULL;
+ }
+
+ vd->vdev_isspare = B_FALSE;
+ mutex_exit(&spa_spare_lock);
+}
+
+boolean_t
+spa_spare_exists(uint64_t guid, uint64_t *pool)
+{
+ spa_spare_t search, *found;
+ avl_index_t where;
+
+ mutex_enter(&spa_spare_lock);
+
+ search.spare_guid = guid;
+ found = avl_find(&spa_spare_avl, &search, &where);
+
+ if (pool) {
+ if (found)
+ *pool = found->spare_pool;
+ else
+ *pool = 0ULL;
+ }
+
+ mutex_exit(&spa_spare_lock);
+
+ return (found != NULL);
+}
+
+void
+spa_spare_activate(vdev_t *vd)
+{
+ spa_spare_t search, *found;
+ avl_index_t where;
+
+ mutex_enter(&spa_spare_lock);
+ ASSERT(vd->vdev_isspare);
+
+ search.spare_guid = vd->vdev_guid;
+ found = avl_find(&spa_spare_avl, &search, &where);
+ ASSERT(found != NULL);
+ ASSERT(found->spare_pool == 0ULL);
+
+ found->spare_pool = spa_guid(vd->vdev_spa);
+ mutex_exit(&spa_spare_lock);
+}
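
Taken together, the routines above give a spare vdev a simple lifecycle: add it to the global AVL tree, mark it active when it is used to replace a device, query it from the status path, and drop the reference when it is detached. A minimal sketch of that call pattern, assuming a vdev_t *vd that is already part of some pool's configuration (the wrapper function is purely illustrative):

	static void
	spare_lifecycle_sketch(vdev_t *vd)
	{
		uint64_t pool;

		spa_spare_add(vd);		/* vd->vdev_isspare becomes B_TRUE */
		spa_spare_activate(vd);		/* record which pool is using it */

		if (spa_spare_exists(vd->vdev_guid, &pool))
			ASSERT(pool == spa_guid(vd->vdev_spa));

		spa_spare_remove(vd);		/* drop the reference again */
	}
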
+
+/*
+ * ==========================================================================
+ * SPA config locking
+ * ==========================================================================
+ */
+
+/*
+ * Acquire the config lock. The config lock is a special rwlock that allows for
+ * recursive enters. Because these enters come from the same thread as well as
+ * asynchronous threads working on behalf of the owner, we must unilaterally
+ * allow all read access as long as at least one reader is held (even if a write
+ * is requested). This has the side effect of write starvation, but write locks
+ * are extremely rare, and a solution to this problem would be significantly
+ * more complex (if even possible).
+ *
+ * We would like to assert that the namespace lock isn't held, but this is a
+ * valid use during create.
+ */
+void
+spa_config_enter(spa_t *spa, krw_t rw, void *tag)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ mutex_enter(&scl->scl_lock);
+
+ if (scl->scl_writer != curthread) {
+ if (rw == RW_READER) {
+ while (scl->scl_writer != NULL)
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ } else {
+ while (scl->scl_writer != NULL ||
+ !refcount_is_zero(&scl->scl_count))
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ scl->scl_writer = curthread;
+ }
+ }
+
+ (void) refcount_add(&scl->scl_count, tag);
+
+ mutex_exit(&scl->scl_lock);
+}
+
+/*
+ * Release the spa config lock, notifying any waiters in the process.
+ */
+void
+spa_config_exit(spa_t *spa, void *tag)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ mutex_enter(&scl->scl_lock);
+
+ ASSERT(!refcount_is_zero(&scl->scl_count));
+ if (refcount_remove(&scl->scl_count, tag) == 0) {
+ cv_broadcast(&scl->scl_cv);
+ scl->scl_writer = NULL; /* OK in either case */
+ }
+
+ mutex_exit(&scl->scl_lock);
+}
+
+/*
+ * Returns true if the config lock is held in the given manner.
+ */
+boolean_t
+spa_config_held(spa_t *spa, krw_t rw)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+ boolean_t held;
+
+ mutex_enter(&scl->scl_lock);
+ if (rw == RW_WRITER)
+ held = (scl->scl_writer == curthread);
+ else
+ held = !refcount_is_zero(&scl->scl_count);
+ mutex_exit(&scl->scl_lock);
+
+ return (held);
+}
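
In practice the lock is entered and exited with a matching tag, typically FTAG, and spa_config_held() backs the ASSERTs sprinkled through the rest of the code. A minimal sketch of the usual reader-side pairing, assuming a valid spa_t *spa:

	spa_config_enter(spa, RW_READER, FTAG);
	ASSERT(spa_config_held(spa, RW_READER));
	/* ... walk the vdev tree or read other config state ... */
	spa_config_exit(spa, FTAG);
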
+
+/*
+ * ==========================================================================
+ * SPA vdev locking
+ * ==========================================================================
+ */
+
+/*
+ * Lock the given spa_t for the purpose of adding or removing a vdev.
+ * Grabs the global spa_namespace_lock plus the spa config lock for writing.
+ * It returns the next transaction group for the spa_t.
+ */
+uint64_t
+spa_vdev_enter(spa_t *spa)
+{
+ /*
+ * Suspend scrub activity while we mess with the config.
+ */
+ spa_scrub_suspend(spa);
+
+ mutex_enter(&spa_namespace_lock);
+
+ spa_config_enter(spa, RW_WRITER, spa);
+
+ return (spa_last_synced_txg(spa) + 1);
+}
+
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ int config_changed = B_FALSE;
+
+ ASSERT(txg > spa_last_synced_txg(spa));
+
+ /*
+ * Reassess the DTLs.
+ */
+ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+
+ /*
+ * If the config changed, notify the scrub thread that it must restart.
+ */
+ if (error == 0 && !list_is_empty(&spa->spa_dirty_list)) {
+ config_changed = B_TRUE;
+ spa_scrub_restart(spa, txg);
+ }
+
+ spa_config_exit(spa, spa);
+
+ /*
+ * Allow scrubbing to resume.
+ */
+ spa_scrub_resume(spa);
+
+ /*
+ * Note: this txg_wait_synced() is important because it ensures
+ * that there won't be more than one config change per txg.
+ * This allows us to use the txg as the generation number.
+ */
+ if (error == 0)
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ if (vd != NULL) {
+ ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
+ vdev_free(vd);
+ }
+
+ /*
+ * If the config changed, update the config cache.
+ */
+ if (config_changed)
+ spa_config_sync();
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (error);
+}
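
Every configuration-changing path is expected to bracket its work with this pair and funnel its status through spa_vdev_exit(). A condensed sketch of the pattern, with the actual vdev manipulation elided (the wrapper function is illustrative):

	static int
	vdev_change_sketch(spa_t *spa)
	{
		vdev_t *vd = NULL;	/* vdev to free on exit, if any */
		uint64_t txg;
		int error = 0;

		txg = spa_vdev_enter(spa);
		/* ... add, remove, or replace vdevs here, setting vd/error ... */
		return (spa_vdev_exit(spa, vd, txg, error));
	}
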
+
+/*
+ * ==========================================================================
+ * Miscellaneous functions
+ * ==========================================================================
+ */
+
+/*
+ * Rename a spa_t.
+ */
+int
+spa_rename(const char *name, const char *newname)
+{
+ spa_t *spa;
+ int err;
+
+ /*
+ * Lookup the spa_t and grab the config lock for writing. We need to
+ * actually open the pool so that we can sync out the necessary labels.
+ * It's OK to call spa_open() with the namespace lock held because we
+ * allow recursive calls for other reasons.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((err = spa_open(name, &spa, FTAG)) != 0) {
+ mutex_exit(&spa_namespace_lock);
+ return (err);
+ }
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ avl_remove(&spa_namespace_avl, spa);
+ spa_strfree(spa->spa_name);
+ spa->spa_name = spa_strdup(newname);
+ avl_add(&spa_namespace_avl, spa);
+
+ /*
+ * Sync all labels to disk with the new names by marking the root vdev
+ * dirty and waiting for it to sync. It will pick up the new pool name
+ * during the sync.
+ */
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ spa_config_exit(spa, FTAG);
+
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * Sync the updated config cache.
+ */
+ spa_config_sync();
+
+ spa_close(spa, FTAG);
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+
+/*
+ * Determine whether a pool with given pool_guid exists. If device_guid is
+ * non-zero, determine whether the pool exists *and* contains a device with the
+ * specified device_guid.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+ spa_t *spa;
+ avl_tree_t *t = &spa_namespace_avl;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ continue;
+ if (spa->spa_root_vdev == NULL)
+ continue;
+ if (spa_guid(spa) == pool_guid) {
+ if (device_guid == 0)
+ break;
+
+ if (vdev_lookup_by_guid(spa->spa_root_vdev,
+ device_guid) != NULL)
+ break;
+
+ /*
+			 * Check any devices we may be in the process of adding.
+ */
+ if (spa->spa_pending_vdev) {
+ if (vdev_lookup_by_guid(spa->spa_pending_vdev,
+ device_guid) != NULL)
+ break;
+ }
+ }
+ }
+
+ return (spa != NULL);
+}
+
+char *
+spa_strdup(const char *s)
+{
+ size_t len;
+ char *new;
+
+ len = strlen(s);
+ new = kmem_alloc(len + 1, KM_SLEEP);
+ bcopy(s, new, len);
+ new[len] = '\0';
+
+ return (new);
+}
+
+void
+spa_strfree(char *s)
+{
+ kmem_free(s, strlen(s) + 1);
+}
+
+uint64_t
+spa_get_random(uint64_t range)
+{
+ uint64_t r;
+
+ ASSERT(range != 0);
+
+ (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
+
+ return (r % range);
+}
+
+void
+sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
+{
+ int d;
+
+ if (bp == NULL) {
+ (void) snprintf(buf, len, "<NULL>");
+ return;
+ }
+
+ if (BP_IS_HOLE(bp)) {
+ (void) snprintf(buf, len, "<hole>");
+ return;
+ }
+
+ (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
+ (u_longlong_t)BP_GET_LEVEL(bp),
+ dmu_ot[BP_GET_TYPE(bp)].ot_name,
+ (u_longlong_t)BP_GET_LSIZE(bp),
+ (u_longlong_t)BP_GET_PSIZE(bp));
+
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ const dva_t *dva = &bp->blk_dva[d];
+ (void) snprintf(buf + strlen(buf), len - strlen(buf),
+ "DVA[%d]=<%llu:%llx:%llx> ", d,
+ (u_longlong_t)DVA_GET_VDEV(dva),
+ (u_longlong_t)DVA_GET_OFFSET(dva),
+ (u_longlong_t)DVA_GET_ASIZE(dva));
+ }
+
+ (void) snprintf(buf + strlen(buf), len - strlen(buf),
+ "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
+ zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
+ zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
+ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
+ BP_IS_GANG(bp) ? "gang" : "contiguous",
+ (u_longlong_t)bp->blk_birth,
+ (u_longlong_t)bp->blk_fill,
+ (u_longlong_t)bp->blk_cksum.zc_word[0],
+ (u_longlong_t)bp->blk_cksum.zc_word[1],
+ (u_longlong_t)bp->blk_cksum.zc_word[2],
+ (u_longlong_t)bp->blk_cksum.zc_word[3]);
+}
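
Callers format into a buffer of BP_SPRINTF_LEN bytes (defined in spa.h); the dprintf_dbuf_bp() macro in dbuf.h follows the same pattern. A minimal sketch:

	char *blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);

	sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
	dprintf("block: %s\n", blkbuf);
	kmem_free(blkbuf, BP_SPRINTF_LEN);
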
+
+void
+spa_freeze(spa_t *spa)
+{
+ uint64_t freeze_txg = 0;
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ if (spa->spa_freeze_txg == UINT64_MAX) {
+ freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
+ spa->spa_freeze_txg = freeze_txg;
+ }
+ spa_config_exit(spa, FTAG);
+ if (freeze_txg != 0)
+ txg_wait_synced(spa_get_dsl(spa), freeze_txg);
+}
+
+void
+zfs_panic_recover(const char *fmt, ...)
+{
+ va_list adx;
+
+ va_start(adx, fmt);
+ vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
+ va_end(adx);
+}
+
+/*
+ * ==========================================================================
+ * Accessor functions
+ * ==========================================================================
+ */
+
+krwlock_t *
+spa_traverse_rwlock(spa_t *spa)
+{
+ return (&spa->spa_traverse_lock);
+}
+
+int
+spa_traverse_wanted(spa_t *spa)
+{
+ return (spa->spa_traverse_wanted);
+}
+
+dsl_pool_t *
+spa_get_dsl(spa_t *spa)
+{
+ return (spa->spa_dsl_pool);
+}
+
+blkptr_t *
+spa_get_rootblkptr(spa_t *spa)
+{
+ return (&spa->spa_ubsync.ub_rootbp);
+}
+
+void
+spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
+{
+ spa->spa_uberblock.ub_rootbp = *bp;
+}
+
+void
+spa_altroot(spa_t *spa, char *buf, size_t buflen)
+{
+ if (spa->spa_root == NULL)
+ buf[0] = '\0';
+ else
+ (void) strncpy(buf, spa->spa_root, buflen);
+}
+
+int
+spa_sync_pass(spa_t *spa)
+{
+ return (spa->spa_sync_pass);
+}
+
+char *
+spa_name(spa_t *spa)
+{
+ /*
+ * Accessing the name requires holding either the namespace lock or the
+ * config lock, both of which are required to do a rename.
+ */
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER));
+
+ return (spa->spa_name);
+}
+
+uint64_t
+spa_guid(spa_t *spa)
+{
+ /*
+ * If we fail to parse the config during spa_load(), we can go through
+ * the error path (which posts an ereport) and end up here with no root
+ * vdev. We stash the original pool guid in 'spa_load_guid' to handle
+ * this case.
+ */
+ if (spa->spa_root_vdev != NULL)
+ return (spa->spa_root_vdev->vdev_guid);
+ else
+ return (spa->spa_load_guid);
+}
+
+uint64_t
+spa_last_synced_txg(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_txg);
+}
+
+uint64_t
+spa_first_txg(spa_t *spa)
+{
+ return (spa->spa_first_txg);
+}
+
+int
+spa_state(spa_t *spa)
+{
+ return (spa->spa_state);
+}
+
+uint64_t
+spa_freeze_txg(spa_t *spa)
+{
+ return (spa->spa_freeze_txg);
+}
+
+/*
+ * In the future, this may select among different metaslab classes
+ * depending on the zdp. For now, there's no such distinction.
+ */
+metaslab_class_t *
+spa_metaslab_class_select(spa_t *spa)
+{
+ return (spa->spa_normal_class);
+}
+
+/*
+ * Return how much space is allocated in the pool (ie. sum of all asize)
+ */
+uint64_t
+spa_get_alloc(spa_t *spa)
+{
+ return (spa->spa_root_vdev->vdev_stat.vs_alloc);
+}
+
+/*
+ * Return how much (raid-z inflated) space there is in the pool.
+ */
+uint64_t
+spa_get_space(spa_t *spa)
+{
+ return (spa->spa_root_vdev->vdev_stat.vs_space);
+}
+
+/*
+ * Return the amount of raid-z-deflated space in the pool.
+ */
+uint64_t
+spa_get_dspace(spa_t *spa)
+{
+ if (spa->spa_deflate)
+ return (spa->spa_root_vdev->vdev_stat.vs_dspace);
+ else
+ return (spa->spa_root_vdev->vdev_stat.vs_space);
+}
+
+/* ARGSUSED */
+uint64_t
+spa_get_asize(spa_t *spa, uint64_t lsize)
+{
+ /*
+ * For now, the worst case is 512-byte RAID-Z blocks, in which
+ * case the space requirement is exactly 2x; so just assume that.
+ * Add to this the fact that we can have up to 3 DVAs per bp, and
+ * we have to multiply by a total of 6x.
+ */
+ return (lsize * 6);
+}
+
+uint64_t
+spa_version(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_version);
+}
+
+int
+spa_max_replication(spa_t *spa)
+{
+ /*
+ * As of ZFS_VERSION == ZFS_VERSION_DITTO_BLOCKS, we are able to
+ * handle BPs with more than one DVA allocated. Set our max
+ * replication level accordingly.
+ */
+ if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS)
+ return (1);
+ return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
+}
+
+uint64_t
+bp_get_dasize(spa_t *spa, const blkptr_t *bp)
+{
+ int sz = 0, i;
+
+ if (!spa->spa_deflate)
+ return (BP_GET_ASIZE(bp));
+
+ for (i = 0; i < SPA_DVAS_PER_BP; i++) {
+ vdev_t *vd =
+ vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
+ sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> SPA_MINBLOCKSHIFT) *
+ vd->vdev_deflate_ratio;
+ }
+ return (sz);
+}
+
+/*
+ * ==========================================================================
+ * Initialization and Termination
+ * ==========================================================================
+ */
+
+static int
+spa_name_compare(const void *a1, const void *a2)
+{
+ const spa_t *s1 = a1;
+ const spa_t *s2 = a2;
+ int s;
+
+ s = strcmp(s1->spa_name, s2->spa_name);
+ if (s > 0)
+ return (1);
+ if (s < 0)
+ return (-1);
+ return (0);
+}
+
+int
+spa_busy(void)
+{
+ return (spa_active_count);
+}
+
+void
+spa_init(int mode)
+{
+ mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
+
+ avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
+ offsetof(spa_t, spa_avl));
+
+ mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_spare_t),
+ offsetof(spa_spare_t, spare_avl));
+
+ spa_mode = mode;
+
+ refcount_init();
+ unique_init();
+ zio_init();
+ dmu_init();
+ zil_init();
+ spa_config_load();
+}
+
+void
+spa_fini(void)
+{
+ spa_evict_all();
+
+ zil_fini();
+ dmu_fini();
+ zio_fini();
+ refcount_fini();
+
+ avl_destroy(&spa_namespace_avl);
+ avl_destroy(&spa_spare_avl);
+
+ cv_destroy(&spa_namespace_cv);
+ mutex_destroy(&spa_namespace_lock);
+ mutex_destroy(&spa_spare_lock);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c
new file mode 100644
index 000000000000..23313a908ab4
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -0,0 +1,501 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+#include <sys/space_map.h>
+
+/*
+ * Space map routines.
+ * NOTE: caller is responsible for all locking.
+ */
+static int
+space_map_seg_compare(const void *x1, const void *x2)
+{
+ const space_seg_t *s1 = x1;
+ const space_seg_t *s2 = x2;
+
+ if (s1->ss_start < s2->ss_start) {
+ if (s1->ss_end > s2->ss_start)
+ return (0);
+ return (-1);
+ }
+ if (s1->ss_start > s2->ss_start) {
+ if (s1->ss_start < s2->ss_end)
+ return (0);
+ return (1);
+ }
+ return (0);
+}
+
+void
+space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
+ kmutex_t *lp)
+{
+ bzero(sm, sizeof (*sm));
+
+ cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
+ avl_create(&sm->sm_root, space_map_seg_compare,
+ sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
+
+ sm->sm_start = start;
+ sm->sm_size = size;
+ sm->sm_shift = shift;
+ sm->sm_lock = lp;
+}
+
+void
+space_map_destroy(space_map_t *sm)
+{
+ ASSERT(!sm->sm_loaded && !sm->sm_loading);
+ VERIFY3U(sm->sm_space, ==, 0);
+ avl_destroy(&sm->sm_root);
+ cv_destroy(&sm->sm_load_cv);
+}
+
+void
+space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss_before, *ss_after, *ss;
+ uint64_t end = start + size;
+ int merge_before, merge_after;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY3U(start, >=, sm->sm_start);
+ VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
+ VERIFY(sm->sm_space + size <= sm->sm_size);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) {
+		zfs_panic_recover("zfs: allocating allocated segment "
+ "(offset=%llu size=%llu)\n",
+ (longlong_t)start, (longlong_t)size);
+ return;
+ }
+
+ /* Make sure we don't overlap with either of our neighbors */
+ VERIFY(ss == NULL);
+
+ ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
+ ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
+
+ merge_before = (ss_before != NULL && ss_before->ss_end == start);
+ merge_after = (ss_after != NULL && ss_after->ss_start == end);
+
+ if (merge_before && merge_after) {
+ avl_remove(&sm->sm_root, ss_before);
+ ss_after->ss_start = ss_before->ss_start;
+ kmem_free(ss_before, sizeof (*ss_before));
+ } else if (merge_before) {
+ ss_before->ss_end = end;
+ } else if (merge_after) {
+ ss_after->ss_start = start;
+ } else {
+ ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
+ ss->ss_start = start;
+ ss->ss_end = end;
+ avl_insert(&sm->sm_root, ss, where);
+ }
+
+ sm->sm_space += size;
+}
+
+void
+space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss, *newseg;
+ uint64_t end = start + size;
+ int left_over, right_over;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ /* Make sure we completely overlap with someone */
+ if (ss == NULL) {
+ zfs_panic_recover("zfs: freeing free segment "
+ "(offset=%llu size=%llu)",
+ (longlong_t)start, (longlong_t)size);
+ return;
+ }
+ VERIFY3U(ss->ss_start, <=, start);
+ VERIFY3U(ss->ss_end, >=, end);
+ VERIFY(sm->sm_space - size <= sm->sm_size);
+
+ left_over = (ss->ss_start != start);
+ right_over = (ss->ss_end != end);
+
+ if (left_over && right_over) {
+ newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
+ newseg->ss_start = end;
+ newseg->ss_end = ss->ss_end;
+ ss->ss_end = start;
+ avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
+ } else if (left_over) {
+ ss->ss_end = start;
+ } else if (right_over) {
+ ss->ss_start = end;
+ } else {
+ avl_remove(&sm->sm_root, ss);
+ kmem_free(ss, sizeof (*ss));
+ }
+
+ sm->sm_space -= size;
+}
+
+int
+space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss;
+ uint64_t end = start + size;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
+}
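
An in-core space map is simply an AVL tree of disjoint segments manipulated under the caller's lock, so basic use is create, add and remove ranges, then query. A minimal sketch with illustrative sizes (a 1GB map with a 512-byte shift):

	space_map_t sm;
	kmutex_t lock;

	mutex_init(&lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&lock);

	space_map_create(&sm, 0, 1ULL << 30, SPA_MINBLOCKSHIFT, &lock);
	space_map_add(&sm, 0, 1ULL << 20);		/* one 1MB segment */
	space_map_remove(&sm, 0, 1ULL << 19);		/* carve off 512KB */
	ASSERT(space_map_contains(&sm, 1ULL << 19, 1ULL << 19));

	space_map_vacate(&sm, NULL, NULL);
	space_map_destroy(&sm);

	mutex_exit(&lock);
	mutex_destroy(&lock);
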
+
+void
+space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+{
+ space_seg_t *ss;
+ void *cookie = NULL;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+ if (func != NULL)
+ func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
+ kmem_free(ss, sizeof (*ss));
+ }
+ sm->sm_space = 0;
+}
+
+void
+space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+{
+ space_seg_t *ss;
+
+ for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+ func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
+}
+
+void
+space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ avl_index_t where;
+ space_seg_t *ss, search;
+ uint64_t end = start + size;
+ uint64_t rm_start, rm_end;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ search.ss_start = start;
+ search.ss_end = start;
+
+ for (;;) {
+ ss = avl_find(t, &search, &where);
+
+ if (ss == NULL)
+ ss = avl_nearest(t, where, AVL_AFTER);
+
+ if (ss == NULL || ss->ss_start >= end)
+ break;
+
+ rm_start = MAX(ss->ss_start, start);
+ rm_end = MIN(ss->ss_end, end);
+
+ space_map_remove(sm, rm_start, rm_end - rm_start);
+ }
+}
+
+/*
+ * Replace smd with the union of smd and sms.
+ */
+void
+space_map_union(space_map_t *smd, space_map_t *sms)
+{
+ avl_tree_t *t = &sms->sm_root;
+ space_seg_t *ss;
+
+ ASSERT(MUTEX_HELD(smd->sm_lock));
+
+ /*
+ * For each source segment, remove any intersections with the
+ * destination, then add the source segment to the destination.
+ */
+ for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
+ space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
+ space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
+ }
+}
+
+/*
+ * Wait for any in-progress space_map_load() to complete.
+ */
+void
+space_map_load_wait(space_map_t *sm)
+{
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ while (sm->sm_loading)
+ cv_wait(&sm->sm_load_cv, sm->sm_lock);
+}
+
+/*
+ * Note: space_map_load() will drop sm_lock across dmu_read() calls.
+ * The caller must be OK with this.
+ */
+int
+space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
+ space_map_obj_t *smo, objset_t *os)
+{
+ uint64_t *entry, *entry_map, *entry_map_end;
+ uint64_t bufsize, size, offset, end, space;
+ uint64_t mapstart = sm->sm_start;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ space_map_load_wait(sm);
+
+ if (sm->sm_loaded)
+ return (0);
+
+ sm->sm_loading = B_TRUE;
+ end = smo->smo_objsize;
+ space = smo->smo_alloc;
+
+ ASSERT(sm->sm_ops == NULL);
+ VERIFY3U(sm->sm_space, ==, 0);
+
+ if (maptype == SM_FREE) {
+ space_map_add(sm, sm->sm_start, sm->sm_size);
+ space = sm->sm_size - space;
+ }
+
+ bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
+ entry_map = zio_buf_alloc(bufsize);
+
+ mutex_exit(sm->sm_lock);
+ if (end > bufsize)
+ dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
+ mutex_enter(sm->sm_lock);
+
+ for (offset = 0; offset < end; offset += bufsize) {
+ size = MIN(end - offset, bufsize);
+ VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
+ VERIFY(size != 0);
+
+ dprintf("object=%llu offset=%llx size=%llx\n",
+ smo->smo_object, offset, size);
+
+ mutex_exit(sm->sm_lock);
+ VERIFY3U(dmu_read(os, smo->smo_object, offset, size,
+ entry_map), ==, 0);
+ mutex_enter(sm->sm_lock);
+
+ entry_map_end = entry_map + (size / sizeof (uint64_t));
+ for (entry = entry_map; entry < entry_map_end; entry++) {
+ uint64_t e = *entry;
+
+ if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
+ continue;
+
+ (SM_TYPE_DECODE(e) == maptype ?
+ space_map_add : space_map_remove)(sm,
+ (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
+ SM_RUN_DECODE(e) << sm->sm_shift);
+ }
+ }
+ VERIFY3U(sm->sm_space, ==, space);
+
+ zio_buf_free(entry_map, bufsize);
+
+ sm->sm_loading = B_FALSE;
+ sm->sm_loaded = B_TRUE;
+ sm->sm_ops = ops;
+
+ cv_broadcast(&sm->sm_load_cv);
+
+ if (ops != NULL)
+ ops->smop_load(sm);
+
+ return (0);
+}
+
+void
+space_map_unload(space_map_t *sm)
+{
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ if (sm->sm_loaded && sm->sm_ops != NULL)
+ sm->sm_ops->smop_unload(sm);
+
+ sm->sm_loaded = B_FALSE;
+ sm->sm_ops = NULL;
+
+ space_map_vacate(sm, NULL, NULL);
+}
+
+uint64_t
+space_map_alloc(space_map_t *sm, uint64_t size)
+{
+ uint64_t start;
+
+ start = sm->sm_ops->smop_alloc(sm, size);
+ if (start != -1ULL)
+ space_map_remove(sm, start, size);
+ return (start);
+}
+
+void
+space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ sm->sm_ops->smop_claim(sm, start, size);
+ space_map_remove(sm, start, size);
+}
+
+void
+space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ space_map_add(sm, start, size);
+ sm->sm_ops->smop_free(sm, start, size);
+}
+
+/*
+ * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
+ */
+void
+space_map_sync(space_map_t *sm, uint8_t maptype,
+ space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ void *cookie = NULL;
+ space_seg_t *ss;
+ uint64_t bufsize, start, size, run_len;
+ uint64_t *entry, *entry_map, *entry_map_end;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ if (sm->sm_space == 0)
+ return;
+
+ dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
+ smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
+ maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
+ sm->sm_space);
+
+ if (maptype == SM_ALLOC)
+ smo->smo_alloc += sm->sm_space;
+ else
+ smo->smo_alloc -= sm->sm_space;
+
+ bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
+ bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
+ entry_map = zio_buf_alloc(bufsize);
+ entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
+ entry = entry_map;
+
+ *entry++ = SM_DEBUG_ENCODE(1) |
+ SM_DEBUG_ACTION_ENCODE(maptype) |
+ SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
+ SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
+
+ while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+ size = ss->ss_end - ss->ss_start;
+ start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
+
+ sm->sm_space -= size;
+ size >>= sm->sm_shift;
+
+ while (size) {
+ run_len = MIN(size, SM_RUN_MAX);
+
+ if (entry == entry_map_end) {
+ mutex_exit(sm->sm_lock);
+ dmu_write(os, smo->smo_object, smo->smo_objsize,
+ bufsize, entry_map, tx);
+ mutex_enter(sm->sm_lock);
+ smo->smo_objsize += bufsize;
+ entry = entry_map;
+ }
+
+ *entry++ = SM_OFFSET_ENCODE(start) |
+ SM_TYPE_ENCODE(maptype) |
+ SM_RUN_ENCODE(run_len);
+
+ start += run_len;
+ size -= run_len;
+ }
+ kmem_free(ss, sizeof (*ss));
+ }
+
+ if (entry != entry_map) {
+ size = (entry - entry_map) * sizeof (uint64_t);
+ mutex_exit(sm->sm_lock);
+ dmu_write(os, smo->smo_object, smo->smo_objsize,
+ size, entry_map, tx);
+ mutex_enter(sm->sm_lock);
+ smo->smo_objsize += size;
+ }
+
+ zio_buf_free(entry_map, bufsize);
+
+ VERIFY3U(sm->sm_space, ==, 0);
+}
+
+void
+space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
+{
+ VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);
+
+ smo->smo_objsize = 0;
+ smo->smo_alloc = 0;
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
new file mode 100644
index 000000000000..f58ffc059f91
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
@@ -0,0 +1,109 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ARC_H
+#define _SYS_ARC_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zio.h>
+
+typedef struct arc_buf_hdr arc_buf_hdr_t;
+typedef struct arc_buf arc_buf_t;
+typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
+typedef void arc_byteswap_func_t(void *buf, size_t size);
+typedef int arc_evict_func_t(void *private);
+
+/* generic arc_done_func_t's which you can use */
+arc_done_func_t arc_bcopy_func;
+arc_done_func_t arc_getbuf_func;
+
+struct arc_buf {
+ arc_buf_hdr_t *b_hdr;
+ arc_buf_t *b_next;
+ void *b_data;
+ arc_evict_func_t *b_efunc;
+ void *b_private;
+};
+
+typedef enum arc_buf_contents {
+ ARC_BUFC_UNDEF, /* buffer contents undefined */
+ ARC_BUFC_DATA, /* buffer contains data */
+ ARC_BUFC_METADATA /* buffer contains metadata */
+} arc_buf_contents_t;
+/*
+ * These are the flags we pass into calls to the arc
+ */
+#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
+#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
+#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
+#define ARC_CACHED (1 << 4) /* I/O was already in cache */
+
+arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
+ arc_buf_contents_t type);
+void arc_buf_add_ref(arc_buf_t *buf, void *tag);
+int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
+int arc_buf_size(arc_buf_t *buf);
+void arc_release(arc_buf_t *buf, void *tag);
+int arc_released(arc_buf_t *buf);
+int arc_has_callback(arc_buf_t *buf);
+void arc_buf_freeze(arc_buf_t *buf);
+void arc_buf_thaw(arc_buf_t *buf);
+#ifdef ZFS_DEBUG
+int arc_referenced(arc_buf_t *buf);
+#endif
+
+int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+ arc_done_func_t *done, void *private, int priority, int flags,
+ uint32_t *arc_flags, zbookmark_t *zb);
+zio_t *arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+ int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
+ int flags, zbookmark_t *zb);
+int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private, uint32_t arc_flags);
+int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
+
+void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
+int arc_buf_evict(arc_buf_t *buf);
+
+void arc_flush(void);
+void arc_tempreserve_clear(uint64_t tempreserve);
+int arc_tempreserve_space(uint64_t tempreserve);
+
+void arc_init(void);
+void arc_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ARC_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
new file mode 100644
index 000000000000..b4c83765c873
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_BPLIST_H
+#define _SYS_BPLIST_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bplist_phys {
+ /*
+ * This is the bonus buffer for the dead lists. The object's
+	 * contents are an array of bpl_entries blkptr_t's, representing
+ * a total of bpl_bytes physical space.
+ */
+ uint64_t bpl_entries;
+ uint64_t bpl_bytes;
+ uint64_t bpl_comp;
+ uint64_t bpl_uncomp;
+} bplist_phys_t;
+
+#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t))
+
+typedef struct bplist_q {
+ blkptr_t bpq_blk;
+ void *bpq_next;
+} bplist_q_t;
+
+typedef struct bplist {
+ kmutex_t bpl_lock;
+ objset_t *bpl_mos;
+ uint64_t bpl_object;
+ uint8_t bpl_blockshift;
+ uint8_t bpl_bpshift;
+ uint8_t bpl_havecomp;
+ bplist_q_t *bpl_queue;
+ bplist_phys_t *bpl_phys;
+ dmu_buf_t *bpl_dbuf;
+ dmu_buf_t *bpl_cached_dbuf;
+} bplist_t;
+
+extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
+extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
+extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
+extern void bplist_close(bplist_t *bpl);
+extern boolean_t bplist_empty(bplist_t *bpl);
+extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
+extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx);
+extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp);
+extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
+extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
+extern int bplist_space(bplist_t *bpl,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPLIST_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
new file mode 100644
index 000000000000..d33657b9e67c
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
@@ -0,0 +1,334 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DBUF_H
+#define _SYS_DBUF_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DB_BONUS_BLKID (-1ULL)
+#define IN_DMU_SYNC 2
+
+/*
+ * define flags for dbuf_read
+ */
+
+#define DB_RF_MUST_SUCCEED (1 << 0)
+#define DB_RF_CANFAIL (1 << 1)
+#define DB_RF_HAVESTRUCT (1 << 2)
+#define DB_RF_NOPREFETCH (1 << 3)
+#define DB_RF_NEVERWAIT (1 << 4)
+#define DB_RF_CACHED (1 << 5)
+
+/*
+ * The state transition diagram for dbufs looks like:
+ *
+ * +----> READ ----+
+ * | |
+ * | V
+ * (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
+ * | ^
+ * | |
+ * +----> FILL ----+
+ */
+typedef enum dbuf_states {
+ DB_UNCACHED,
+ DB_FILL,
+ DB_READ,
+ DB_CACHED,
+ DB_EVICTING
+} dbuf_states_t;
+
+struct objset_impl;
+struct dnode;
+struct dmu_tx;
+
+/*
+ * level = 0 means the user data
+ * level = 1 means the single indirect block
+ * etc.
+ */
+
+#define LIST_LINK_INACTIVE(link) \
+ ((link)->list_next == NULL && (link)->list_prev == NULL)
+
+struct dmu_buf_impl;
+
+typedef enum override_states {
+ DR_NOT_OVERRIDDEN,
+ DR_IN_DMU_SYNC,
+ DR_OVERRIDDEN
+} override_states_t;
+
+typedef struct dbuf_dirty_record {
+ /* link on our parents dirty list */
+ list_node_t dr_dirty_node;
+
+ /* transaction group this data will sync in */
+ uint64_t dr_txg;
+
+ /* zio of outstanding write IO */
+ zio_t *dr_zio;
+
+ /* pointer back to our dbuf */
+ struct dmu_buf_impl *dr_dbuf;
+
+ /* pointer to next dirty record */
+ struct dbuf_dirty_record *dr_next;
+
+ /* pointer to parent dirty record */
+ struct dbuf_dirty_record *dr_parent;
+
+ union dirty_types {
+ struct dirty_indirect {
+
+ /* protect access to list */
+ kmutex_t dr_mtx;
+
+ /* Our list of dirty children */
+ list_t dr_children;
+ } di;
+ struct dirty_leaf {
+
+ /*
+ * dr_data is set when we dirty the buffer
+ * so that we can retain the pointer even if it
+ * gets COW'd in a subsequent transaction group.
+ */
+ arc_buf_t *dr_data;
+ blkptr_t dr_overridden_by;
+ override_states_t dr_override_state;
+ } dl;
+ } dt;
+} dbuf_dirty_record_t;
+
+typedef struct dmu_buf_impl {
+ /*
+ * The following members are immutable, with the exception of
+ * db.db_data, which is protected by db_mtx.
+ */
+
+ /* the publicly visible structure */
+ dmu_buf_t db;
+
+ /* the objset we belong to */
+ struct objset_impl *db_objset;
+
+ /*
+ * the dnode we belong to (NULL when evicted)
+ */
+ struct dnode *db_dnode;
+
+ /*
+ * our parent buffer; if the dnode points to us directly,
+ * db_parent == db_dnode->dn_dbuf
+ * only accessed by sync thread ???
+ * (NULL when evicted)
+ */
+ struct dmu_buf_impl *db_parent;
+
+ /*
+ * link for hash table of all dmu_buf_impl_t's
+ */
+ struct dmu_buf_impl *db_hash_next;
+
+ /* our block number */
+ uint64_t db_blkid;
+
+ /*
+ * Pointer to the blkptr_t which points to us. May be NULL if we
+ * don't have one yet. (NULL when evicted)
+ */
+ blkptr_t *db_blkptr;
+
+ /*
+ * Our indirection level. Data buffers have db_level==0.
+ * Indirect buffers which point to data buffers have
+ * db_level==1. etc. Buffers which contain dnodes have
+ * db_level==0, since the dnodes are stored in a file.
+ */
+ uint8_t db_level;
+
+ /* db_mtx protects the members below */
+ kmutex_t db_mtx;
+
+ /*
+ * Current state of the buffer
+ */
+ dbuf_states_t db_state;
+
+ /*
+ * Refcount accessed by dmu_buf_{hold,rele}.
+ * If nonzero, the buffer can't be destroyed.
+ * Protected by db_mtx.
+ */
+ refcount_t db_holds;
+
+ /* buffer holding our data */
+ arc_buf_t *db_buf;
+
+ kcondvar_t db_changed;
+ dbuf_dirty_record_t *db_data_pending;
+
+ /* pointer to most recent dirty record for this buffer */
+ dbuf_dirty_record_t *db_last_dirty;
+
+ /*
+	 * Our link on the owning dnode's dn_dbufs list.
+ * Protected by its dn_dbufs_mtx.
+ */
+ list_node_t db_link;
+
+ /* Data which is unique to data (leaf) blocks: */
+
+ /* stuff we store for the user (see dmu_buf_set_user) */
+ void *db_user_ptr;
+ void **db_user_data_ptr_ptr;
+ dmu_buf_evict_func_t *db_evict_func;
+
+ uint8_t db_immediate_evict;
+ uint8_t db_freed_in_flight;
+
+ uint8_t db_dirtycnt;
+} dmu_buf_impl_t;
+
+/* Note: the dbuf hash table is exposed only for the mdb module */
+#define DBUF_MUTEXES 256
+#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
+typedef struct dbuf_hash_table {
+ uint64_t hash_table_mask;
+ dmu_buf_impl_t **hash_table;
+ kmutex_t hash_mutexes[DBUF_MUTEXES];
+} dbuf_hash_table_t;
+
+
+uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
+
+dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
+dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn);
+
+dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
+dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
+ void *tag);
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+ void *tag, dmu_buf_impl_t **dbp);
+
+void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
+
+void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
+uint64_t dbuf_refcount(dmu_buf_impl_t *db);
+
+void dbuf_rele(dmu_buf_impl_t *db, void *tag);
+
+dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
+
+int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
+void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
+void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
+dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+
+void dbuf_clear(dmu_buf_impl_t *db);
+void dbuf_evict(dmu_buf_impl_t *db);
+
+void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_unoverride(dbuf_dirty_record_t *dr);
+void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+
+void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
+ struct dmu_tx *);
+
+void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+
+void dbuf_init(void);
+void dbuf_fini(void);
+
+#define DBUF_GET_BUFC_TYPE(db) \
+ ((((db)->db_level > 0) || \
+ (dmu_ot[(db)->db_dnode->dn_type].ot_metadata)) ? \
+ ARC_BUFC_METADATA : ARC_BUFC_DATA);
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but gcc does not
+ * support that preprocessor token.
+ */
+#define dprintf_dbuf(dbuf, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __db_buf[32]; \
+ uint64_t __db_obj = (dbuf)->db.db_object; \
+ if (__db_obj == DMU_META_DNODE_OBJECT) \
+ (void) strcpy(__db_buf, "mdn"); \
+ else \
+ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+ (u_longlong_t)__db_obj); \
+ dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
+ "obj=%s lvl=%u blkid=%lld " fmt, \
+ __db_buf, (dbuf)->db_level, \
+ (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
+ sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
+ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
+ kmem_free(__blkbuf, BP_SPRINTF_LEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define DBUF_VERIFY(db) dbuf_verify(db)
+
+#else
+
+#define dprintf_dbuf(db, fmt, ...)
+#define dprintf_dbuf_bp(db, bp, fmt, ...)
+#define DBUF_VERIFY(db)
+
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DBUF_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
new file mode 100644
index 000000000000..f534015ff074
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -0,0 +1,586 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_H
+#define _SYS_DMU_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file describes the interface that the DMU provides for its
+ * consumers.
+ *
+ * The DMU also interacts with the SPA. That interface is described in
+ * dmu_spa.h.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct uio;
+struct page;
+struct vnode;
+struct spa;
+struct zilog;
+struct zio;
+struct blkptr;
+struct zap_cursor;
+struct dsl_dataset;
+struct dsl_pool;
+struct dnode;
+struct drr_begin;
+struct drr_end;
+struct zbookmark;
+struct spa;
+struct nvlist;
+struct objset_impl;
+struct file;
+
+typedef struct objset objset_t;
+typedef struct dmu_tx dmu_tx_t;
+typedef struct dsl_dir dsl_dir_t;
+
+typedef enum dmu_object_type {
+ DMU_OT_NONE,
+ /* general: */
+ DMU_OT_OBJECT_DIRECTORY, /* ZAP */
+ DMU_OT_OBJECT_ARRAY, /* UINT64 */
+ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
+ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
+ DMU_OT_BPLIST, /* UINT64 */
+ DMU_OT_BPLIST_HDR, /* UINT64 */
+ /* spa: */
+ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
+ DMU_OT_SPACE_MAP, /* UINT64 */
+ /* zil: */
+ DMU_OT_INTENT_LOG, /* UINT64 */
+ /* dmu: */
+ DMU_OT_DNODE, /* DNODE */
+ DMU_OT_OBJSET, /* OBJSET */
+ /* dsl: */
+ DMU_OT_DSL_DIR, /* UINT64 */
+ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
+ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
+ DMU_OT_DSL_PROPS, /* ZAP */
+ DMU_OT_DSL_DATASET, /* UINT64 */
+ /* zpl: */
+ DMU_OT_ZNODE, /* ZNODE */
+ DMU_OT_ACL, /* ACL */
+ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
+ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
+ DMU_OT_MASTER_NODE, /* ZAP */
+ DMU_OT_UNLINKED_SET, /* ZAP */
+ /* zvol: */
+ DMU_OT_ZVOL, /* UINT8 */
+ DMU_OT_ZVOL_PROP, /* ZAP */
+ /* other; for testing only! */
+ DMU_OT_PLAIN_OTHER, /* UINT8 */
+ DMU_OT_UINT64_OTHER, /* UINT64 */
+ DMU_OT_ZAP_OTHER, /* ZAP */
+ /* new object types: */
+ DMU_OT_ERROR_LOG, /* ZAP */
+ DMU_OT_SPA_HISTORY, /* UINT8 */
+ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
+ DMU_OT_POOL_PROPS, /* ZAP */
+
+ DMU_OT_NUMTYPES
+} dmu_object_type_t;
+
+typedef enum dmu_objset_type {
+ DMU_OST_NONE,
+ DMU_OST_META,
+ DMU_OST_ZFS,
+ DMU_OST_ZVOL,
+ DMU_OST_OTHER, /* For testing only! */
+ DMU_OST_ANY, /* Be careful! */
+ DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+void byteswap_uint64_array(void *buf, size_t size);
+void byteswap_uint32_array(void *buf, size_t size);
+void byteswap_uint16_array(void *buf, size_t size);
+void byteswap_uint8_array(void *buf, size_t size);
+void zap_byteswap(void *buf, size_t size);
+void zfs_acl_byteswap(void *buf, size_t size);
+void zfs_znode_byteswap(void *buf, size_t size);
+
+#define DS_MODE_NONE 0 /* invalid, to aid debugging */
+#define DS_MODE_STANDARD 1 /* normal access, no special needs */
+#define DS_MODE_PRIMARY 2 /* the "main" access, e.g. a mount */
+#define DS_MODE_EXCLUSIVE 3 /* exclusive access, e.g. to destroy */
+#define DS_MODE_LEVELS 4
+#define DS_MODE_LEVEL(x) ((x) & (DS_MODE_LEVELS - 1))
+#define DS_MODE_READONLY 0x8
+#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY)
+#define DS_MODE_INCONSISTENT 0x10
+#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT)
+
+#define DS_FIND_SNAPSHOTS (1<<0)
+#define DS_FIND_CHILDREN (1<<1)
+
+/*
+ * The maximum number of bytes that can be accessed as part of one
+ * operation, including metadata.
+ */
+#define DMU_MAX_ACCESS (10<<20) /* 10MB */
+
+/*
+ * Public routines to create, destroy, open, and close objsets.
+ */
+int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp);
+void dmu_objset_close(objset_t *os);
+int dmu_objset_evict_dbufs(objset_t *os, int try);
+int dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent,
+ void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
+int dmu_objset_destroy(const char *name);
+int dmu_snapshots_destroy(char *fsname, char *snapname);
+int dmu_objset_rollback(const char *name);
+int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
+int dmu_objset_rename(const char *name, const char *newname);
+int dmu_objset_find(char *name, int func(char *, void *), void *arg,
+ int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
+
+typedef struct dmu_buf {
+ uint64_t db_object; /* object that this buffer is part of */
+ uint64_t db_offset; /* byte offset in this object */
+ uint64_t db_size; /* size of buffer in bytes */
+ void *db_data; /* data in buffer */
+} dmu_buf_t;
+
+typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
+
+/*
+ * Callback function to perform byte swapping on a block.
+ */
+typedef void dmu_byteswap_func_t(void *buf, size_t size);
+
+/*
+ * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
+ */
+#define DMU_POOL_DIRECTORY_OBJECT 1
+#define DMU_POOL_CONFIG "config"
+#define DMU_POOL_ROOT_DATASET "root_dataset"
+#define DMU_POOL_SYNC_BPLIST "sync_bplist"
+#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
+#define DMU_POOL_ERRLOG_LAST "errlog_last"
+#define DMU_POOL_SPARES "spares"
+#define DMU_POOL_DEFLATE "deflate"
+#define DMU_POOL_HISTORY "history"
+#define DMU_POOL_PROPS "pool_props"
+
+/*
+ * Allocate an object from this objset. The range of object numbers
+ * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
+ *
+ * The transaction must be assigned to a txg. The newly allocated
+ * object will be "held" in the transaction (ie. you can modify the
+ * newly allocated object in this transaction).
+ *
+ * dmu_object_alloc() chooses an object and returns it in *objectp.
+ *
+ * dmu_object_claim() allocates a specific object number. If that
+ * number is already allocated, it fails and returns EEXIST.
+ *
+ * Return 0 on success, or ENOSPC or EEXIST as specified above.
+ */
+uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
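
A condensed sketch of the allocation path described above, assuming a transaction that has already been assigned to a txg (the object type and sizes here are illustrative):

	uint64_t object;

	/* let the DMU pick the object number; blocksize 0 means the default */
	object = dmu_object_alloc(os, DMU_OT_PLAIN_OTHER, 0,
	    DMU_OT_NONE, 0, tx);
	/* 'object' is held in 'tx' and may be modified in this transaction */
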
+
+/*
+ * Free an object from this objset.
+ *
+ * The object's data will be freed as well (ie. you don't need to call
+ * dmu_free(object, 0, -1, tx)).
+ *
+ * The object need not be held in the transaction.
+ *
+ * If there are any holds on this object's buffers (via dmu_buf_hold()),
+ * or tx holds on the object (via dmu_tx_hold_object()), you can not
+ * free it; it fails and returns EBUSY.
+ *
+ * If the object is not allocated, it fails and returns ENOENT.
+ *
+ * Return 0 on success, or EBUSY or ENOENT as specified above.
+ */
+int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Find the next allocated or free object.
+ *
+ * The objectp parameter is in-out. It will be updated to be the next
+ * object which is allocated. Ignore objects which have not been
+ * modified since txg.
+ *
+ * XXX Can only be called on an objset with no dirty data.
+ *
+ * Returns 0 on success, or ENOENT if there are no more objects.
+ */
+int dmu_object_next(objset_t *os, uint64_t *objectp,
+ boolean_t hole, uint64_t txg);
+
+/*
+ * Set the data blocksize for an object.
+ *
+ * The object cannot have any blocks allocated beyond the first.  If
+ * the first block is allocated already, the new size must be greater
+ * than the current block size. If these conditions are not met,
+ * ENOTSUP will be returned.
+ *
+ * Returns 0 on success, or EBUSY if there are any holds on the object
+ * contents, or ENOTSUP as described above.
+ */
+int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
+ int ibs, dmu_tx_t *tx);
+
+/*
+ * Set the checksum property on a dnode. The new checksum algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx);
+
+/*
+ * Set the compress property on a dnode. The new compression algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx);
+
+/*
+ * Decide how many copies of a given block we should make. Can be from
+ * 1 to SPA_DVAS_PER_BP.
+ */
+int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
+ dmu_object_type_t ot);
+/*
+ * The bonus data is accessed more or less like a regular buffer.
+ * You must dmu_bonus_hold() to get the buffer, which will give you a
+ * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
+ * data. As with any normal buffer, you must call dmu_buf_read() to
+ * read db_data, dmu_buf_will_dirty() before modifying it, and the
+ * object must be held in an assigned transaction before calling
+ * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
+ * buffer as well. You must release your hold with dmu_buf_rele().
+ */
+int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
+int dmu_bonus_max(void);
+
+/*
+ * Obtain the DMU buffer from the specified object which contains the
+ * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
+ * that it will remain in memory. You must release the hold with
+ * dmu_buf_rele().  You mustn't access the dmu_buf_t after releasing your
+ * hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
+ *
+ * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
+ * on the returned buffer before reading or writing the buffer's
+ * db_data. The comments for those routines describe what particular
+ * operations are valid after calling them.
+ *
+ * The object number must be a valid, allocated object number.
+ */
+int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **);
+void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
+void dmu_buf_rele(dmu_buf_t *db, void *tag);
+uint64_t dmu_buf_refcount(dmu_buf_t *db);
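
A minimal sketch of the hold/release discipline described above (object and offset are illustrative; the buffer must still be read or dirtied, as the comments require, before db_data is touched):

	dmu_buf_t *db;
	int error;

	error = dmu_buf_hold(os, object, offset, FTAG, &db);
	if (error)
		return (error);
	/* ... read or dirty the buffer, then use db->db_data ... */
	dmu_buf_rele(db, FTAG);
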
+
+/*
+ * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
+ * range of an object. A pointer to an array of dmu_buf_t*'s is
+ * returned (in *dbpp).
+ *
+ * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
+ * frees the array. The hold on the array of buffers MUST be released
+ * with dmu_buf_rele_array. You can NOT release the hold on each buffer
+ * individually with dmu_buf_rele.
+ */
+int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
+void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
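+
+/*
+ * A minimal sketch, assuming a bonus buffer "bonus_db" obtained with
+ * dmu_bonus_hold() and a byte range [offset, offset + length) that is
+ * to be read (the fourth argument is the read flag):
+ *
+ *	dmu_buf_t **dbp;
+ *	int numbufs, i;
+ *	int err = dmu_buf_hold_array_by_bonus(bonus_db, offset, length,
+ *	    1, FTAG, &numbufs, &dbp);
+ *
+ *	if (err == 0) {
+ *		for (i = 0; i < numbufs; i++)
+ *			... consume dbp[i]->db_data ...
+ *		dmu_buf_rele_array(dbp, numbufs, FTAG);
+ *	}
+ */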
+
+/*
+ * Returns NULL on success, or the existing user ptr if it's already
+ * been set.
+ *
+ * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
+ *
+ * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
+ * will be set to db->db_data when you are allowed to access it. Note
+ * that db->db_data (the pointer) can change when you do dmu_buf_read(),
+ * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
+ * *user_data_ptr_ptr will be set to the new value when it changes.
+ *
+ * If non-NULL, pageout func will be called when this buffer is being
+ * excised from the cache, so that you can clean up the data structure
+ * pointed to by user_ptr.
+ *
+ * dmu_evict_user() will call the pageout func for all buffers in an
+ * objset with a given pageout func.
+ */
+void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *pageout_func);
+/*
+ * set_user_ie is the same as set_user, but request immediate eviction
+ * when hold count goes to zero.
+ */
+void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
+ void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
+void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
+ void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *pageout_func);
+void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
+
+/*
+ * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
+ */
+void *dmu_buf_get_user(dmu_buf_t *db);
+
+/*
+ * Indicate that you are going to modify the buffer's data (db_data).
+ *
+ * The transaction (tx) must be assigned to a txg (ie. you've called
+ * dmu_tx_assign()). The buffer's object must be held in the tx
+ * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
+ */
+void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+
+/*
+ * You must create a transaction, then hold the objects which you will
+ * (or might) modify as part of this transaction. Then you must assign
+ * the transaction to a transaction group. Once the transaction has
+ * been assigned, you can modify buffers which belong to held objects as
+ * part of this transaction. You can't modify buffers before the
+ * transaction has been assigned; you can't modify buffers which don't
+ * belong to objects which this transaction holds; you can't hold
+ * objects once the transaction has been assigned. You may hold an
+ * object which you are going to free (with dmu_object_free()), but you
+ * don't have to.
+ *
+ * You can abort the transaction before it has been assigned.
+ *
+ * Note that you may hold buffers (with dmu_buf_hold) at any time,
+ * regardless of transaction state.
+ */
+
+#define DMU_NEW_OBJECT (-1ULL)
+#define DMU_OBJECT_END (-1ULL)
+
+dmu_tx_t *dmu_tx_create(objset_t *os);
+void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
+void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
+ uint64_t len);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name);
+void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_abort(dmu_tx_t *tx);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_wait(dmu_tx_t *tx);
+void dmu_tx_commit(dmu_tx_t *tx);
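+
+/*
+ * A minimal write-path sketch tying the above together; TXG_WAIT is
+ * the txg_how value from sys/txg.h that waits until the tx can be
+ * assigned, and dmu_write() is the convenience routine declared below:
+ *
+ *	dmu_tx_t *tx = dmu_tx_create(os);
+ *	int err;
+ *
+ *	dmu_tx_hold_write(tx, object, offset, length);
+ *	err = dmu_tx_assign(tx, TXG_WAIT);
+ *	if (err != 0) {
+ *		dmu_tx_abort(tx);
+ *		return (err);
+ *	}
+ *	dmu_write(os, object, offset, length, data, tx);
+ *	dmu_tx_commit(tx);
+ */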
+
+/*
+ * Free up the data blocks for a defined range of a file. If size is
+ * zero, the range from offset to end-of-file is freed.
+ */
+int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx);
+
+/*
+ * Convenience functions.
+ *
+ * Routines that can fail will return 0 on success, or an errno if
+ * there is a nonrecoverable I/O error.
+ */
+int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf);
+void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx);
+int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
+int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
+ dmu_tx_t *tx);
+int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, struct page *pp, dmu_tx_t *tx);
+
+extern int zfs_prefetch_disable;
+
+/*
+ * Asynchronously try to read in the data.
+ */
+void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t len);
+
+typedef struct dmu_object_info {
+ /* All sizes are in bytes. */
+ uint32_t doi_data_block_size;
+ uint32_t doi_metadata_block_size;
+ uint64_t doi_bonus_size;
+ dmu_object_type_t doi_type;
+ dmu_object_type_t doi_bonus_type;
+ uint8_t doi_indirection; /* 2 = dnode->indirect->data */
+ uint8_t doi_checksum;
+ uint8_t doi_compress;
+ uint8_t doi_pad[5];
+ /* Values below are number of 512-byte blocks. */
+ uint64_t doi_physical_blks; /* data + metadata */
+ uint64_t doi_max_block_offset;
+} dmu_object_info_t;
+
+typedef struct dmu_object_type_info {
+ dmu_byteswap_func_t *ot_byteswap;
+ boolean_t ot_metadata;
+ char *ot_name;
+} dmu_object_type_info_t;
+
+extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
+
+/*
+ * Get information on a DMU object.
+ *
+ * Return 0 on success or ENOENT if object is not allocated.
+ *
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
+void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
+ u_longlong_t *nblk512);
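+
+/*
+ * For example, to look up an object's data block size:
+ *
+ *	dmu_object_info_t doi;
+ *	int err = dmu_object_info(os, object, &doi);
+ *
+ *	if (err == 0)
+ *		blksz = doi.doi_data_block_size;
+ */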
+
+typedef struct dmu_objset_stats {
+ uint64_t dds_num_clones; /* number of clones of this */
+ uint64_t dds_creation_txg;
+ dmu_objset_type_t dds_type;
+ uint8_t dds_is_snapshot;
+ uint8_t dds_inconsistent;
+ char dds_clone_of[MAXNAMELEN];
+} dmu_objset_stats_t;
+
+/*
+ * Get stats on a dataset.
+ */
+void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
+
+/*
+ * Add entries to the nvlist for all the objset's properties. See
+ * zfs_prop_table[] and zfs(1m) for details on the properties.
+ */
+void dmu_objset_stats(objset_t *os, struct nvlist *nv);
+
+/*
+ * Get the space usage statistics for statvfs().
+ *
+ * refdbytes is the amount of space "referenced" by this objset.
+ * availbytes is the amount of space available to this objset, taking
+ * into account quotas & reservations, assuming that no other objsets
+ * use the space first. These values correspond to the 'referenced' and
+ * 'available' properties, described in the zfs(1m) manpage.
+ *
+ * usedobjs and availobjs are the number of objects currently allocated,
+ * and available.
+ */
+void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp);
+
+/*
+ * The fsid_guid is a 56-bit ID that can change to avoid collisions.
+ * (Contrast with the ds_guid which is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.)
+ */
+uint64_t dmu_objset_fsid_guid(objset_t *os);
+
+int dmu_objset_is_snapshot(objset_t *os);
+
+extern struct spa *dmu_objset_spa(objset_t *os);
+extern struct zilog *dmu_objset_zil(objset_t *os);
+extern struct dsl_pool *dmu_objset_pool(objset_t *os);
+extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
+extern void dmu_objset_name(objset_t *os, char *buf);
+extern dmu_objset_type_t dmu_objset_type(objset_t *os);
+extern uint64_t dmu_objset_id(objset_t *os);
+extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *id, uint64_t *offp);
+extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp);
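+
+/*
+ * dmu_snapshot_list_next() and dmu_dir_list_next() are cursor-style
+ * iterators: *offp carries the cursor between calls (it is assumed to
+ * start at 0), and iteration ends when a non-zero error is returned.
+ * A sketch:
+ *
+ *	char name[MAXNAMELEN];
+ *	uint64_t id, off = 0;
+ *
+ *	while (dmu_snapshot_list_next(os, sizeof (name), name,
+ *	    &id, &off) == 0) {
+ *		... "name"/"id" describe one snapshot of "os" ...
+ *	}
+ */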
+
+/*
+ * Return the txg number for the given assigned transaction.
+ */
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+
+/*
+ * Synchronous write.
+ * If a parent zio is provided this function initiates a write on the
+ * provided buffer as a child of the parent zio.
+ * In the absence of a parent zio, the write is completed synchronously.
+ * At write completion, blk is filled with the bp of the written block.
+ * Note that while the data covered by this function will be on stable
+ * storage when the write completes, this new data does not become a
+ * permanent part of the file until the associated transaction commits.
+ */
+typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
+int dmu_sync(struct zio *zio, dmu_buf_t *db,
+ struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
+
+/*
+ * Find the next hole or data block in the file, starting at *off.
+ * Return found offset in *off. Return ESRCH for end of file.
+ */
+int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
+ uint64_t *off);
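+
+/*
+ * For example, a SEEK_HOLE-style lookup starting at byte "start":
+ *
+ *	uint64_t off = start;
+ *	int err = dmu_offset_next(os, object, B_TRUE, &off);
+ *
+ *	(err == 0 means "off" now points at the next hole; err == ESRCH
+ *	means the search ran off the end of the file)
+ */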
+
+/*
+ * Initial setup and final teardown.
+ */
+extern void dmu_init(void);
+extern void dmu_fini(void);
+
+typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
+ uint64_t object, uint64_t offset, int len);
+void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
+ dmu_traverse_cb_t cb, void *arg);
+
+int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp);
+int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
+ boolean_t force, struct file *fp, uint64_t voffset);
+
+/* CRC64 table */
+#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
+extern uint64_t zfs_crc64_table[256];
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
new file mode 100644
index 000000000000..807011e94ffc
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
@@ -0,0 +1,237 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_IMPL_H
+#define _SYS_DMU_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/txg_impl.h>
+#include <sys/zio.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the locking strategy for the DMU. Numbers in parentheses are
+ * cases that use that lock order, referenced below:
+ *
+ * ARC is self-contained
+ * bplist is self-contained
+ * refcount is self-contained
+ * txg is self-contained (hopefully!)
+ * zst_lock
+ * zf_rwlock
+ *
+ * XXX try to improve evicting path?
+ *
+ * dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
+ * dn_dbufs_mtx > hash_mutexes > db_mtx > leafs
+ *
+ * dp_config_rwlock
+ * must be held before: everything
+ * protects dd namespace changes
+ * protects property changes globally
+ * held from:
+ * dsl_dir_open/r:
+ * dsl_dir_create_sync/w:
+ * dsl_dir_sync_destroy/w:
+ * dsl_dir_rename_sync/w:
+ * dsl_prop_changed_notify/r:
+ *
+ * os_obj_lock
+ * must be held before:
+ * everything except dp_config_rwlock
+ * protects os_obj_next
+ * held from:
+ * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock
+ *
+ * dn_struct_rwlock
+ * must be held before:
+ * everything except dp_config_rwlock and os_obj_lock
+ * protects structure of dnode (eg. nlevels)
+ * db_blkptr can change when syncing out change to nlevels
+ * dn_maxblkid
+ * dn_nlevels
+ * dn_*blksz*
+ * phys nlevels, maxblkid, physical blkptr_t's (?)
+ * held from:
+ * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
+ * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
+ * dmu_tx_count_free:
+ * dbuf_read_impl: db_mtx, dmu_zfetch()
+ * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
+ * dbuf_new_size: db_mtx
+ * dbuf_dirty: db_mtx
+ * dbuf_findbp: (callers, phys? - the real need)
+ * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?)
+ * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx
+ * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp()
+ * dnode_sync/w (increase_indirection): db_mtx (phys)
+ * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*)
+ * dnode_new_blkid/w: (dn_maxblkid)
+ * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid)
+ * dnode_next_offset: (phys)
+ *
+ * dn_dbufs_mtx
+ * must be held before:
+ * db_mtx, hash_mutexes
+ * protects:
+ * dn_dbufs
+ * dn_evicted
+ * held from:
+ * dmu_evict_user: db_mtx (dn_dbufs)
+ * dbuf_free_range: db_mtx (dn_dbufs)
+ * dbuf_remove_ref: db_mtx, callees:
+ * dbuf_hash_remove: hash_mutexes, db_mtx
+ * dbuf_create: hash_mutexes, db_mtx (dn_dbufs)
+ * dnode_set_blksz: (dn_dbufs)
+ *
+ * hash_mutexes (global)
+ * must be held before:
+ * db_mtx
+ * protects dbuf_hash_table (global) and db_hash_next
+ * held from:
+ * dbuf_find: db_mtx
+ * dbuf_hash_insert: db_mtx
+ * dbuf_hash_remove: db_mtx
+ *
+ * db_mtx (meta-leaf)
+ * must be held before:
+ * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes)
+ * protects:
+ * db_state
+ * db_holds
+ * db_buf
+ * db_changed
+ * db_data_pending
+ * db_dirtied
+ * db_link
+ * db_dirty_node (??)
+ * db_dirtycnt
+ * db_d.*
+ * db.*
+ * held from:
+ * dbuf_dirty: dn_mtx, dn_dirty_mtx
+ * dbuf_dirty->dsl_dir_willuse_space: dd_lock
+ * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock
+ * dbuf_undirty: dn_dirty_mtx (db_d)
+ * dbuf_write_done: dn_dirty_mtx (db_state)
+ * dbuf_*
+ * dmu_buf_update_user: none (db_d)
+ * dmu_evict_user: none (db_d) (maybe can eliminate)
+ * dbuf_find: none (db_holds)
+ * dbuf_hash_insert: none (db_holds)
+ * dmu_buf_read_array_impl: none (db_state, db_changed)
+ * dmu_sync: none (db_dirty_node, db_d)
+ * dnode_reallocate: none (db)
+ *
+ * dn_mtx (leaf)
+ * protects:
+ * dn_dirty_dbufs
+ * dn_ranges
+ * phys accounting
+ * dn_allocated_txg
+ * dn_free_txg
+ * dn_assigned_txg
+ * dd_assigned_tx
+ * dn_notxholds
+ * dn_dirtyctx
+ * dn_dirtyctx_firstset
+ * (dn_phys copy fields?)
+ * (dn_phys contents?)
+ * held from:
+ * dnode_*
+ * dbuf_dirty: none
+ * dbuf_sync: none (phys accounting)
+ * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs)
+ * dbuf_write_done: none (phys accounting)
+ * dmu_object_info_from_dnode: none (accounting)
+ * dmu_tx_commit: none
+ * dmu_tx_hold_object_impl: none
+ * dmu_tx_try_assign: dn_notxholds(cv)
+ * dmu_tx_unassign: none
+ *
+ * dd_lock (leaf)
+ * protects:
+ * dd_prop_cbs
+ * dd_sync_*
+ * dd_used_bytes
+ * dd_tempreserved
+ * dd_space_towrite
+ * dd_myname
+ * dd_phys accounting?
+ * held from:
+ * dsl_dir_*
+ * dsl_prop_changed_notify: none (dd_prop_cbs)
+ * dsl_prop_register: none (dd_prop_cbs)
+ * dsl_prop_unregister: none (dd_prop_cbs)
+ * dsl_dataset_block_freeable: none (dd_sync_*)
+ *
+ * os_lock (leaf)
+ * protects:
+ * os_dirty_dnodes
+ * os_free_dnodes
+ * os_dnodes
+ * os_downgraded_dbufs
+ * dn_dirtyblksz
+ * dn_dirty_link
+ * held from:
+ * dnode_create: none (os_dnodes)
+ * dnode_destroy: none (os_dnodes)
+ * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
+ * dnode_free: none (dn_dirtyblksz, os_*_dnodes)
+ *
+ * ds_lock (leaf)
+ * protects:
+ * ds_user_ptr
+ * ds_user_evict_func
+ * ds_open_refcount
+ * ds_snapname
+ * ds_phys accounting
+ * held from:
+ * dsl_dataset_*
+ *
+ * dr_mtx (leaf)
+ * protects:
+ * dr_children
+ * held from:
+ * dbuf_dirty
+ * dbuf_undirty
+ * dbuf_sync_indirect
+ * dnode_new_blkid
+ */
+
+struct objset;
+struct dmu_pool;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
new file mode 100644
index 000000000000..8293a3b4076a
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_OBJSET_H
+#define _SYS_DMU_OBJSET_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/arc.h>
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dmu_tx;
+struct objset_impl;
+
+typedef struct objset_phys {
+ dnode_phys_t os_meta_dnode;
+ zil_header_t os_zil_header;
+ uint64_t os_type;
+ char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) -
+ sizeof (uint64_t)];
+} objset_phys_t;
+
+struct objset {
+ struct objset_impl *os;
+ int os_mode;
+};
+
+typedef struct objset_impl {
+ /* Immutable: */
+ struct dsl_dataset *os_dsl_dataset;
+ spa_t *os_spa;
+ arc_buf_t *os_phys_buf;
+ objset_phys_t *os_phys;
+ dnode_t *os_meta_dnode;
+ zilog_t *os_zil;
+ objset_t os;
+ uint8_t os_checksum; /* can change, under dsl_dir's locks */
+ uint8_t os_compress; /* can change, under dsl_dir's locks */
+ uint8_t os_copies; /* can change, under dsl_dir's locks */
+ uint8_t os_md_checksum;
+ uint8_t os_md_compress;
+
+ /* no lock needed: */
+ struct dmu_tx *os_synctx; /* XXX sketchy */
+ blkptr_t *os_rootbp;
+
+ /* Protected by os_obj_lock */
+ kmutex_t os_obj_lock;
+ uint64_t os_obj_next;
+
+ /* Protected by os_lock */
+ kmutex_t os_lock;
+ list_t os_dirty_dnodes[TXG_SIZE];
+ list_t os_free_dnodes[TXG_SIZE];
+ list_t os_dnodes;
+ list_t os_downgraded_dbufs;
+} objset_impl_t;
+
+#define DMU_META_DNODE_OBJECT 0
+
+/* called from zpl */
+int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp);
+void dmu_objset_close(objset_t *os);
+int dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent,
+ void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
+int dmu_objset_destroy(const char *name);
+int dmu_objset_rollback(const char *name);
+int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
+void dmu_objset_stats(objset_t *os, nvlist_t *nv);
+void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
+void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp);
+uint64_t dmu_objset_fsid_guid(objset_t *os);
+int dmu_objset_find(char *name, int func(char *, void *), void *arg,
+ int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
+int dmu_objset_evict_dbufs(objset_t *os, int try);
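+
+/*
+ * A minimal open/close sketch for the zpl entry points above.  The
+ * DMU_OST_* objset type and DS_MODE_* open mode (e.g. DS_MODE_STANDARD)
+ * are assumed to come from dmu.h:
+ *
+ *	objset_t *os;
+ *	int err = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_STANDARD, &os);
+ *
+ *	if (err == 0) {
+ *		... use "os" ...
+ *		dmu_objset_close(os);
+ *	}
+ */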
+
+/* called from dsl */
+void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
+objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
+ blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
+int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
+ objset_impl_t **osip);
+void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_OBJSET_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
new file mode 100644
index 000000000000..ea9fa6c1e36c
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_TRAVERSE_H
+#define _SYS_DMU_TRAVERSE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/arc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ADVANCE_POST 0 /* post-order traversal */
+#define ADVANCE_PRE 0x01 /* pre-order traversal */
+#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */
+#define ADVANCE_DATA 0x04 /* read user data blocks */
+#define ADVANCE_HOLES 0x08 /* visit holes */
+#define ADVANCE_ZIL 0x10 /* visit intent log blocks */
+#define ADVANCE_NOLOCK 0x20 /* Don't grab SPA sync lock */
+
+#define ZB_NO_LEVEL -2
+#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */
+#define ZB_MAXBLKID (1ULL << 62)
+#define ZB_MAXOBJSET (1ULL << 62)
+#define ZB_MAXOBJECT (1ULL << 62)
+
+#define ZB_MOS_CACHE 0
+#define ZB_MDN_CACHE 1
+#define ZB_DN_CACHE 2
+#define ZB_DEPTH 3
+
+typedef struct zseg {
+ uint64_t seg_mintxg;
+ uint64_t seg_maxtxg;
+ zbookmark_t seg_start;
+ zbookmark_t seg_end;
+ list_node_t seg_node;
+} zseg_t;
+
+typedef struct traverse_blk_cache {
+ zbookmark_t bc_bookmark;
+ blkptr_t bc_blkptr;
+ void *bc_data;
+ dnode_phys_t *bc_dnode;
+ int bc_errno;
+ int bc_pad1;
+ uint64_t bc_pad2;
+} traverse_blk_cache_t;
+
+typedef int (blkptr_cb_t)(traverse_blk_cache_t *bc, spa_t *spa, void *arg);
+
+struct traverse_handle {
+ spa_t *th_spa;
+ blkptr_cb_t *th_func;
+ void *th_arg;
+ uint16_t th_advance;
+ uint16_t th_locked;
+ int th_zio_flags;
+ list_t th_seglist;
+ traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL];
+ traverse_blk_cache_t th_zil_cache;
+ uint64_t th_hits;
+ uint64_t th_arc_hits;
+ uint64_t th_reads;
+ uint64_t th_callbacks;
+ uint64_t th_syncs;
+ uint64_t th_restarts;
+ zbookmark_t th_noread;
+ zbookmark_t th_lastcb;
+};
+
+int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start,
+ int advance, blkptr_cb_t func, void *arg);
+
+traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg,
+ int advance, int zio_flags);
+void traverse_fini(traverse_handle_t *th);
+
+void traverse_add_dnode(traverse_handle_t *th,
+ uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object);
+void traverse_add_objset(traverse_handle_t *th,
+ uint64_t mintxg, uint64_t maxtxg, uint64_t objset);
+void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg);
+
+int traverse_more(traverse_handle_t *th);
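+
+/*
+ * A typical consumer sketch; "my_blkptr_cb", "my_arg", "mintxg" and
+ * "maxtxg" are assumed names, and the EAGAIN return from
+ * traverse_more() meaning "more work remains" is an assumption about
+ * the implementation in dmu_traverse.c:
+ *
+ *	traverse_handle_t *th;
+ *	int err;
+ *
+ *	th = traverse_init(spa, my_blkptr_cb, my_arg, ADVANCE_PRE, 0);
+ *	traverse_add_pool(th, mintxg, maxtxg);
+ *	while ((err = traverse_more(th)) == EAGAIN)
+ *		continue;
+ *	traverse_fini(th);
+ */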
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TRAVERSE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
new file mode 100644
index 000000000000..89f4799b57fe
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
@@ -0,0 +1,134 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_TX_H
+#define _SYS_DMU_TX_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/txg.h>
+#include <sys/refcount.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf_impl;
+struct dmu_tx_hold;
+struct dnode_link;
+struct dsl_pool;
+struct dnode;
+struct dsl_dir;
+
+struct dmu_tx {
+ /*
+ * No synchronization is needed because a tx can only be handled
+ * by one thread.
+ */
+ list_t tx_holds; /* list of dmu_tx_hold_t */
+ objset_t *tx_objset;
+ struct dsl_dir *tx_dir;
+ struct dsl_pool *tx_pool;
+ uint64_t tx_txg;
+ uint64_t tx_lastsnap_txg;
+ uint64_t tx_lasttried_txg;
+ txg_handle_t tx_txgh;
+ void *tx_tempreserve_cookie;
+ struct dmu_tx_hold *tx_needassign_txh;
+ uint8_t tx_anyobj;
+ int tx_err;
+#ifdef ZFS_DEBUG
+ uint64_t tx_space_towrite;
+ uint64_t tx_space_tofree;
+ uint64_t tx_space_tooverwrite;
+ refcount_t tx_space_written;
+ refcount_t tx_space_freed;
+#endif
+};
+
+enum dmu_tx_hold_type {
+ THT_NEWOBJECT,
+ THT_WRITE,
+ THT_BONUS,
+ THT_FREE,
+ THT_ZAP,
+ THT_SPACE,
+ THT_NUMTYPES
+};
+
+typedef struct dmu_tx_hold {
+ dmu_tx_t *txh_tx;
+ list_node_t txh_node;
+ struct dnode *txh_dnode;
+ uint64_t txh_space_towrite;
+ uint64_t txh_space_tofree;
+ uint64_t txh_space_tooverwrite;
+#ifdef ZFS_DEBUG
+ enum dmu_tx_hold_type txh_type;
+ uint64_t txh_arg1;
+ uint64_t txh_arg2;
+#endif
+} dmu_tx_hold_t;
+
+
+/*
+ * These routines are defined in dmu.h, and are called by the user.
+ */
+dmu_tx_t *dmu_tx_create(objset_t *dd);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_commit(dmu_tx_t *tx);
+void dmu_tx_abort(dmu_tx_t *tx);
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+void dmu_tx_wait(dmu_tx_t *tx);
+
+/*
+ * These routines are defined in dmu_spa.h, and are called by the SPA.
+ */
+extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * These routines are only called by the DMU.
+ */
+dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd);
+int dmu_tx_is_syncing(dmu_tx_t *tx);
+int dmu_tx_private_ok(dmu_tx_t *tx);
+void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object);
+void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta);
+void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
+int dmu_tx_holds(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
+
+#ifdef ZFS_DEBUG
+#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db)
+#else
+#define DMU_TX_DIRTY_BUF(tx, db)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TX_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
new file mode 100644
index 000000000000..c94bced933af
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DFETCH_H
+#define _DFETCH_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint64_t zfetch_array_rd_sz;
+
+struct dnode; /* so we can reference dnode */
+
+typedef enum zfetch_dirn {
+ ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */
+ ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */
+} zfetch_dirn_t;
+
+typedef struct zstream {
+ uint64_t zst_offset; /* offset of starting block in range */
+ uint64_t zst_len; /* length of range, in blocks */
+ zfetch_dirn_t zst_direction; /* direction of prefetch */
+ uint64_t zst_stride; /* length of stride, in blocks */
+ uint64_t zst_ph_offset; /* prefetch offset, in blocks */
+ uint64_t zst_cap; /* prefetch limit (cap), in blocks */
+ kmutex_t zst_lock; /* protects stream */
+ clock_t zst_last; /* lbolt of last prefetch */
+ avl_node_t zst_node; /* embed avl node here */
+} zstream_t;
+
+typedef struct zfetch {
+ krwlock_t zf_rwlock; /* protects zfetch structure */
+ list_t zf_stream; /* list of zstream_t's */
+ struct dnode *zf_dnode; /* dnode that owns this zfetch */
+ uint32_t zf_stream_cnt; /* # of active streams */
+ uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */
+} zfetch_t;
+
+void dmu_zfetch_init(zfetch_t *, struct dnode *);
+void dmu_zfetch_rele(zfetch_t *);
+void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DFETCH_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
new file mode 100644
index 000000000000..327e538cf809
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
@@ -0,0 +1,267 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DNODE_H
+#define _SYS_DNODE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/refcount.h>
+#include <sys/dmu_zfetch.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Flags.
+ */
+#define DNODE_MUST_BE_ALLOCATED 1
+#define DNODE_MUST_BE_FREE 2
+
+/*
+ * Fixed constants.
+ */
+#define DNODE_SHIFT 9 /* 512 bytes */
+#define DN_MIN_INDBLKSHIFT 10 /* 1k */
+#define DN_MAX_INDBLKSHIFT 14 /* 16k */
+#define DNODE_BLOCK_SHIFT 14 /* 16k */
+#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
+#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
+#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
+
+/*
+ * Derived constants.
+ */
+#define DNODE_SIZE (1 << DNODE_SHIFT)
+#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
+#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
+#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
+
+#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
+#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
+#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+
+/* The +2 here is a cheesy way to round up */
+#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
+ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
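+
+/*
+ * For example, assuming the usual 128-byte blkptr_t (SPA_BLKPTRSHIFT == 7)
+ * and 512-byte minimum block size (SPA_MINBLOCKSHIFT == 9) from spa.h:
+ *
+ *	DNODE_SIZE       = 1 << 9           = 512 bytes
+ *	DN_MAX_NBLKPTR   = (512 - 64) >> 7  = 3 block pointers
+ *	DN_MAX_BONUSLEN  = 512 - 64 - 128   = 320 bytes
+ *	DNODES_PER_BLOCK = 1 << (14 - 9)    = 32 dnodes per 16k block
+ */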
+
+#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
+ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
+
+#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
+ (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
+
+#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
+
+struct dmu_buf_impl;
+struct objset_impl;
+struct zio;
+
+enum dnode_dirtycontext {
+ DN_UNDIRTIED,
+ DN_DIRTY_OPEN,
+ DN_DIRTY_SYNC
+};
+
+/* Is dn_used in bytes? If not, it's in multiples of SPA_MINBLOCKSIZE */
+#define DNODE_FLAG_USED_BYTES (1<<0)
+
+typedef struct dnode_phys {
+ uint8_t dn_type; /* dmu_object_type_t */
+ uint8_t dn_indblkshift; /* ln2(indirect block size) */
+ uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
+ uint8_t dn_nblkptr; /* length of dn_blkptr */
+ uint8_t dn_bonustype; /* type of data in bonus buffer */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_flags; /* DNODE_FLAG_* */
+ uint16_t dn_datablkszsec; /* data block size in 512b sectors */
+ uint16_t dn_bonuslen; /* length of dn_bonus */
+ uint8_t dn_pad2[4];
+
+ /* accounting is protected by dn_dirty_mtx */
+ uint64_t dn_maxblkid; /* largest allocated block ID */
+ uint64_t dn_used; /* bytes (or sectors) of disk space */
+
+ uint64_t dn_pad3[4];
+
+ blkptr_t dn_blkptr[1];
+ uint8_t dn_bonus[DN_MAX_BONUSLEN];
+} dnode_phys_t;
+
+typedef struct dnode {
+ /*
+ * dn_struct_rwlock protects the structure of the dnode,
+ * including the number of levels of indirection (dn_nlevels),
+ * dn_maxblkid, and dn_next_*
+ */
+ krwlock_t dn_struct_rwlock;
+
+ /*
+ * Our link on dataset's dd_dnodes list.
+ * Protected by dd_accounting_mtx.
+ */
+ list_node_t dn_link;
+
+ /* immutable: */
+ struct objset_impl *dn_objset;
+ uint64_t dn_object;
+ struct dmu_buf_impl *dn_dbuf;
+ dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
+
+ /*
+ * Copies of stuff in dn_phys. They're valid in the open
+ * context (eg. even before the dnode is first synced).
+ * Where necessary, these are protected by dn_struct_rwlock.
+ */
+ dmu_object_type_t dn_type; /* object type */
+ uint16_t dn_bonuslen; /* bonus length */
+ uint8_t dn_bonustype; /* bonus type */
+ uint8_t dn_nblkptr; /* number of blkptrs (immutable) */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_nlevels;
+ uint8_t dn_indblkshift;
+ uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
+ uint16_t dn_datablkszsec; /* in 512b sectors */
+ uint32_t dn_datablksz; /* in bytes */
+ uint64_t dn_maxblkid;
+ uint8_t dn_next_nlevels[TXG_SIZE];
+ uint8_t dn_next_indblkshift[TXG_SIZE];
+ uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
+
+ /* protected by os_lock: */
+ list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
+
+ /* protected by dn_mtx: */
+ kmutex_t dn_mtx;
+ list_t dn_dirty_records[TXG_SIZE];
+ avl_tree_t dn_ranges[TXG_SIZE];
+ uint64_t dn_allocated_txg;
+ uint64_t dn_free_txg;
+ uint64_t dn_assigned_txg;
+ kcondvar_t dn_notxholds;
+ enum dnode_dirtycontext dn_dirtyctx;
+ uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
+
+ /* protected by own devices */
+ refcount_t dn_tx_holds;
+ refcount_t dn_holds;
+
+ kmutex_t dn_dbufs_mtx;
+ list_t dn_dbufs; /* linked list of descendent dbuf_t's */
+ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
+
+ /* parent IO for current sync write */
+ zio_t *dn_zio;
+
+ /* holds prefetch structure */
+ struct zfetch dn_zfetch;
+} dnode_t;
+
+typedef struct free_range {
+ avl_node_t fr_node;
+ uint64_t fr_blkid;
+ uint64_t fr_nblks;
+} free_range_t;
+
+dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
+ uint64_t object);
+void dnode_special_close(dnode_t *dn);
+
+int dnode_hold(struct objset_impl *dd, uint64_t object,
+ void *ref, dnode_t **dnp);
+int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
+ void *ref, dnode_t **dnp);
+void dnode_add_ref(dnode_t *dn, void *ref);
+void dnode_rele(dnode_t *dn, void *ref);
+void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
+void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
+void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+void dnode_free(dnode_t *dn, dmu_tx_t *tx);
+void dnode_byteswap(dnode_phys_t *dnp);
+void dnode_buf_byteswap(void *buf, size_t size);
+void dnode_verify(dnode_t *dn);
+int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
+uint64_t dnode_current_max_length(dnode_t *dn);
+void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
+void dnode_clear_range(dnode_t *dn, uint64_t blkid,
+ uint64_t nblks, dmu_tx_t *tx);
+void dnode_diduse_space(dnode_t *dn, int64_t space);
+void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
+void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx);
+uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
+void dnode_init(void);
+void dnode_fini(void);
+int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
+ uint64_t blkfill, uint64_t txg);
+int dnode_evict_dbufs(dnode_t *dn, int try);
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but that piece of shit
+ * gcc doesn't support that preprocessor token.
+ */
+#define dprintf_dnode(dn, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __db_buf[32]; \
+ uint64_t __db_obj = (dn)->dn_object; \
+ if (__db_obj == DMU_META_DNODE_OBJECT) \
+ (void) strcpy(__db_buf, "mdn"); \
+ else \
+ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+ (u_longlong_t)__db_obj);\
+ dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
+ __db_buf, __VA_ARGS__); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define DNODE_VERIFY(dn) dnode_verify(dn)
+#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx)
+
+#else
+
+#define dprintf_dnode(db, fmt, ...)
+#define DNODE_VERIFY(dn)
+#define FREE_VERIFY(db, start, end, tx)
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DNODE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
new file mode 100644
index 000000000000..8929dbc87523
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -0,0 +1,185 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_DATASET_H
+#define _SYS_DSL_DATASET_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/bplist.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dsl_dir;
+struct dsl_pool;
+
+typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
+
+#define DS_FLAG_INCONSISTENT (1ULL<<0)
+/*
+ * NB: nopromote can not yet be set, but we want support for it in this
+ * on-disk version, so that we don't need to upgrade for it later. It
+ * will be needed when we implement 'zfs split' (where the split off
+ * clone should not be promoted).
+ */
+#define DS_FLAG_NOPROMOTE (1ULL<<1)
+
+typedef struct dsl_dataset_phys {
+ uint64_t ds_dir_obj;
+ uint64_t ds_prev_snap_obj;
+ uint64_t ds_prev_snap_txg;
+ uint64_t ds_next_snap_obj;
+ uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */
+ uint64_t ds_num_children; /* clone/snap children; ==0 for head */
+ uint64_t ds_creation_time; /* seconds since 1970 */
+ uint64_t ds_creation_txg;
+ uint64_t ds_deadlist_obj;
+ uint64_t ds_used_bytes;
+ uint64_t ds_compressed_bytes;
+ uint64_t ds_uncompressed_bytes;
+ uint64_t ds_unique_bytes; /* only relevant to snapshots */
+ /*
+ * The ds_fsid_guid is a 56-bit ID that can change to avoid
+ * collisions. The ds_guid is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.
+ */
+ uint64_t ds_fsid_guid;
+ uint64_t ds_guid;
+ uint64_t ds_flags;
+ blkptr_t ds_bp;
+ uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */
+} dsl_dataset_phys_t;
+
+typedef struct dsl_dataset {
+ /* Immutable: */
+ struct dsl_dir *ds_dir;
+ dsl_dataset_phys_t *ds_phys;
+ dmu_buf_t *ds_dbuf;
+ uint64_t ds_object;
+
+ /* only used in syncing context: */
+ struct dsl_dataset *ds_prev; /* only valid for non-snapshots */
+
+ /* has internal locking: */
+ bplist_t ds_deadlist;
+
+ /* protected by lock on pool's dp_dirty_datasets list */
+ txg_node_t ds_dirty_link;
+ list_node_t ds_synced_link;
+
+ /*
+ * ds_phys->ds_<accounting> is also protected by ds_lock.
+ * Protected by ds_lock:
+ */
+ kmutex_t ds_lock;
+ void *ds_user_ptr;
+ dsl_dataset_evict_func_t *ds_user_evict_func;
+ uint64_t ds_open_refcount;
+
+ /* no locking; only for making guesses */
+ uint64_t ds_trysnap_txg;
+
+ /* Protected by ds_lock; keep at end of struct for better locality */
+ char ds_snapname[MAXNAMELEN];
+} dsl_dataset_t;
+
+#define dsl_dataset_is_snapshot(ds) \
+ ((ds)->ds_phys->ds_num_children != 0)
+
+int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
+ void *tag, dsl_dataset_t **dsp);
+int dsl_dataset_open(const char *name, int mode, void *tag,
+ dsl_dataset_t **dsp);
+int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
+ const char *tail, int mode, void *tag, dsl_dataset_t **);
+void dsl_dataset_name(dsl_dataset_t *ds, char *name);
+void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag);
+uint64_t dsl_dataset_create_sync(dsl_dir_t *pds,
+ const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx);
+int dsl_dataset_destroy(const char *name);
+int dsl_snapshots_destroy(char *fsname, char *snapname);
+dsl_checkfunc_t dsl_dataset_snapshot_check;
+dsl_syncfunc_t dsl_dataset_snapshot_sync;
+int dsl_dataset_rollback(dsl_dataset_t *ds);
+int dsl_dataset_rename(const char *name, const char *newname);
+int dsl_dataset_promote(const char *name);
+
+void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
+ void *p, dsl_dataset_evict_func_t func);
+void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
+
+blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
+void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+
+spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
+
+void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
+
+void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+ dmu_tx_t *tx);
+int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
+uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
+
+void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
+void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv);
+void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat);
+void dsl_dataset_space(dsl_dataset_t *ds,
+ uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp);
+uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
+
+void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp,
+ dmu_tx_t *tx);
+
+int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
+
+#ifdef ZFS_DEBUG
+#define dprintf_ds(ds, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
+ dsl_dataset_name(ds, __ds_name); \
+ dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
+ kmem_free(__ds_name, MAXNAMELEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_ds(dd, fmt, ...)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DATASET_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
new file mode 100644
index 000000000000..f33776ab594f
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
@@ -0,0 +1,142 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_DIR_H
+#define _SYS_DSL_DIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/refcount.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+
+typedef struct dsl_dir_phys {
+ uint64_t dd_creation_time; /* not actually used */
+ uint64_t dd_head_dataset_obj;
+ uint64_t dd_parent_obj;
+ uint64_t dd_clone_parent_obj;
+ uint64_t dd_child_dir_zapobj;
+ /*
+ * how much space our children are accounting for; for leaf
+ * datasets, == physical space used by fs + snaps
+ */
+ uint64_t dd_used_bytes;
+ uint64_t dd_compressed_bytes;
+ uint64_t dd_uncompressed_bytes;
+ /* Administrative quota setting */
+ uint64_t dd_quota;
+ /* Administrative reservation setting */
+ uint64_t dd_reserved;
+ uint64_t dd_props_zapobj;
+ uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */
+} dsl_dir_phys_t;
+
+struct dsl_dir {
+ /* These are immutable; no lock needed: */
+ uint64_t dd_object;
+ dsl_dir_phys_t *dd_phys;
+ dmu_buf_t *dd_dbuf;
+ dsl_pool_t *dd_pool;
+
+ /* protected by lock on pool's dp_dirty_dirs list */
+ txg_node_t dd_dirty_link;
+
+ /* protected by dp_config_rwlock */
+ dsl_dir_t *dd_parent;
+
+ /* Protected by dd_lock */
+ kmutex_t dd_lock;
+ list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+
+ /* Accounting */
+ /* reflects any changes to dd_phys->dd_used_bytes made this syncing */
+ int64_t dd_used_bytes;
+ /* gross estimate of space used by in-flight tx's */
+ uint64_t dd_tempreserved[TXG_SIZE];
+ /* amount of space we expect to write; == amount of dirty data */
+ int64_t dd_space_towrite[TXG_SIZE];
+
+ /* protected by dd_lock; keep at end of struct for better locality */
+ char dd_myname[MAXNAMELEN];
+};
+
+void dsl_dir_close(dsl_dir_t *dd, void *tag);
+int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail);
+int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **,
+ const char **tailp);
+int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag, dsl_dir_t **);
+void dsl_dir_name(dsl_dir_t *dd, char *buf);
+int dsl_dir_is_private(dsl_dir_t *dd);
+uint64_t dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx);
+void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx);
+dsl_checkfunc_t dsl_dir_destroy_check;
+dsl_syncfunc_t dsl_dir_destroy_sync;
+void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
+uint64_t dsl_dir_space_available(dsl_dir_t *dd,
+ dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
+void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
+void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
+int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
+ uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx);
+void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
+void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
+void dsl_dir_diduse_space(dsl_dir_t *dd,
+ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
+int dsl_dir_set_quota(const char *ddname, uint64_t quota);
+int dsl_dir_set_reservation(const char *ddname, uint64_t reservation);
+int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
+int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
+
+/* internal reserved dir name */
+#define MOS_DIR_NAME "$MOS"
+
+#ifdef ZFS_DEBUG
+#define dprintf_dd(dd, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \
+ KM_SLEEP); \
+ dsl_dir_name(dd, __ds_name); \
+ dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
+ kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_dd(dd, fmt, ...)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DIR_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
new file mode 100644
index 000000000000..f7ec67a0e062
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
@@ -0,0 +1,82 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_POOL_H
+#define _SYS_DSL_POOL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/txg_impl.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+
+typedef struct dsl_pool {
+ /* Immutable */
+ spa_t *dp_spa;
+ struct objset *dp_meta_objset;
+ struct dsl_dir *dp_root_dir;
+ struct dsl_dir *dp_mos_dir;
+ uint64_t dp_root_dir_obj;
+
+ /* No lock needed - sync context only */
+ blkptr_t dp_meta_rootbp;
+ list_t dp_synced_objsets;
+
+ /* Has its own locking */
+ tx_state_t dp_tx;
+ txg_list_t dp_dirty_datasets;
+ txg_list_t dp_dirty_dirs;
+ txg_list_t dp_sync_tasks;
+
+ /*
+ * Protects administrative changes (properties, namespace)
+ * It is only held for write in syncing context. Therefore
+ * syncing context does not need to ever have it for read, since
+ * nobody else could possibly have it for write.
+ */
+ krwlock_t dp_config_rwlock;
+} dsl_pool_t;
+
+int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
+void dsl_pool_close(dsl_pool_t *dp);
+dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg);
+void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
+void dsl_pool_zil_clean(dsl_pool_t *dp);
+int dsl_pool_sync_context(dsl_pool_t *dp);
+uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_POOL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
new file mode 100644
index 000000000000..d2debff8b8c0
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_PROP_H
+#define _SYS_DSL_PROP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+
+/* The callback func may not call into the DMU or DSL! */
+typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
+
+typedef struct dsl_prop_cb_record {
+ list_node_t cbr_node; /* link on dd_prop_cbs */
+ struct dsl_dataset *cbr_ds;
+ const char *cbr_propname;
+ dsl_prop_changed_cb_t *cbr_func;
+ void *cbr_arg;
+} dsl_prop_cb_record_t;
+
+int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg);
+int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg);
+int dsl_prop_numcb(struct dsl_dataset *ds);
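+
+/*
+ * A minimal registration sketch; the callback name and its argument
+ * are hypothetical, and the property string follows zfs_prop_table[]
+ * (for example "recordsize"):
+ *
+ *	static void
+ *	my_recordsize_changed(void *arg, uint64_t newval)
+ *	{
+ *		... cache newval; must not call back into the DMU/DSL ...
+ *	}
+ *
+ *	err = dsl_prop_register(ds, "recordsize", my_recordsize_changed, arg);
+ *	...
+ *	err = dsl_prop_unregister(ds, "recordsize", my_recordsize_changed, arg);
+ */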
+
+int dsl_prop_get(const char *ddname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint);
+int dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint);
+int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
+
+int dsl_prop_set(const char *ddname, const char *propname,
+ int intsz, int numints, const void *buf);
+int dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
+ int intsz, int numints, const void *buf);
+
+void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
+void dsl_prop_nvlist_add_string(nvlist_t *nv,
+ zfs_prop_t prop, const char *value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_PROP_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
new file mode 100644
index 000000000000..e695b182f74b
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_SYNCTASK_H
+#define _SYS_DSL_SYNCTASK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_pool;
+
+typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *);
+typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *);
+
+typedef struct dsl_sync_task {
+ list_node_t dst_node;
+ dsl_checkfunc_t *dst_checkfunc;
+ dsl_syncfunc_t *dst_syncfunc;
+ void *dst_arg1;
+ void *dst_arg2;
+ int dst_err;
+} dsl_sync_task_t;
+
+typedef struct dsl_sync_task_group {
+ txg_node_t dstg_node;
+ list_t dstg_tasks;
+ struct dsl_pool *dstg_pool;
+ uint64_t dstg_txg;
+ int dstg_err;
+ int dstg_space;
+} dsl_sync_task_group_t;
+
+dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp);
+void dsl_sync_task_create(dsl_sync_task_group_t *dstg,
+ dsl_checkfunc_t *, dsl_syncfunc_t *,
+ void *arg1, void *arg2, int blocks_modified);
+int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg);
+void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg);
+void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx);
+
+int dsl_sync_task_do(struct dsl_pool *dp,
+ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
+ void *arg1, void *arg2, int blocks_modified);
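
A minimal sketch (the task itself is hypothetical, not from this import) of the check/sync pairing this interface expects: the checkfunc must fail cleanly if the operation is no longer valid when re-evaluated in syncing context, and the syncfunc then applies the change inside the given transaction.

/* Hypothetical task that copies a 64-bit value under the sync task machinery. */
static int
example_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	return (arg1 == NULL || arg2 == NULL ? EINVAL : 0);
}

static void
example_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	*(uint64_t *)arg1 = *(uint64_t *)arg2;	/* apply the change in syncing context */
}

/* err = dsl_sync_task_do(dp, example_check, example_sync, &dst, &src, 0); */
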
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SYNCTASK_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
new file mode 100644
index 000000000000..095dd3ce2464
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_METASLAB_H
+#define _SYS_METASLAB_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/space_map.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct metaslab_class metaslab_class_t;
+typedef struct metaslab_group metaslab_group_t;
+
+extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
+ uint64_t start, uint64_t size, uint64_t txg);
+extern void metaslab_fini(metaslab_t *msp);
+extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+
+extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp,
+ int ncopies, uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid);
+extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
+ boolean_t now);
+extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
+
+extern metaslab_class_t *metaslab_class_create(void);
+extern void metaslab_class_destroy(metaslab_class_t *mc);
+extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
+extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
+
+extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
+ vdev_t *vd);
+extern void metaslab_group_destroy(metaslab_group_t *mg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
new file mode 100644
index 000000000000..5980cbc843ac
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -0,0 +1,81 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_METASLAB_IMPL_H
+#define _SYS_METASLAB_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/metaslab.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct metaslab_class {
+ metaslab_group_t *mc_rotor;
+ uint64_t mc_allocated;
+};
+
+struct metaslab_group {
+ kmutex_t mg_lock;
+ avl_tree_t mg_metaslab_tree;
+ uint64_t mg_aliquot;
+ int64_t mg_bias;
+ metaslab_class_t *mg_class;
+ vdev_t *mg_vd;
+ metaslab_group_t *mg_prev;
+ metaslab_group_t *mg_next;
+};
+
+/*
+ * Each metaslab's free space is tracked in a space map object in the MOS,
+ * which is only updated in syncing context. Each time we sync a txg,
+ * we append the allocs and frees from that txg to the space map object.
+ * When the txg is done syncing, metaslab_sync_done() updates ms_smo
+ * to ms_smo_syncing. Everything in ms_smo is always safe to allocate.
+ */
+struct metaslab {
+ kmutex_t ms_lock; /* metaslab lock */
+ space_map_obj_t ms_smo; /* synced space map object */
+ space_map_obj_t ms_smo_syncing; /* syncing space map object */
+ space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */
+ space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */
+ space_map_t ms_map; /* in-core free space map */
+ uint64_t ms_weight; /* weight vs. others in group */
+ metaslab_group_t *ms_group; /* metaslab group */
+ avl_node_t ms_group_node; /* node in metaslab group tree */
+ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
new file mode 100644
index 000000000000..4de1cae12fe7
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
@@ -0,0 +1,103 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_REFCOUNT_H
+#define _SYS_REFCOUNT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/list.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * If the reference is held only by the calling function and not any
+ * particular object, use FTAG (which is a string) for the holder_tag.
+ * Otherwise, use the object that holds the reference.
+ */
+#define FTAG ((char *)__func__)
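
A short sketch of the convention above (the refcount and owner pointer are hypothetical; the functions are declared just below):

static void
example_holds(refcount_t *rc, void *owner)
{
	(void) refcount_add(rc, FTAG);		/* hold owned only by this function */
	(void) refcount_add(rc, owner);		/* hold owned by a long-lived object */
	(void) refcount_remove(rc, FTAG);	/* drop the function-scoped hold */
}
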
+
+#if defined(DEBUG) || !defined(_KERNEL)
+typedef struct reference {
+ list_node_t ref_link;
+ void *ref_holder;
+ uint64_t ref_number;
+ uint8_t *ref_removed;
+} reference_t;
+
+typedef struct refcount {
+ kmutex_t rc_mtx;
+ list_t rc_list;
+ list_t rc_removed;
+ int64_t rc_count;
+ int64_t rc_removed_count;
+} refcount_t;
+
+/* Note: refcount_t should be initialized to zero before use. */
+
+void refcount_create(refcount_t *rc);
+void refcount_destroy(refcount_t *rc);
+void refcount_destroy_many(refcount_t *rc, uint64_t number);
+int refcount_is_zero(refcount_t *rc);
+int64_t refcount_count(refcount_t *rc);
+int64_t refcount_add(refcount_t *rc, void *holder_tag);
+int64_t refcount_remove(refcount_t *rc, void *holder_tag);
+int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
+int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+
+void refcount_init(void);
+void refcount_fini(void);
+
+#else /* DEBUG */
+
+typedef struct refcount {
+ uint64_t rc_count;
+} refcount_t;
+
+#define refcount_create(rc) ((rc)->rc_count = 0)
+#define refcount_destroy(rc) ((rc)->rc_count = 0)
+#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
+#define refcount_is_zero(rc) ((rc)->rc_count == 0)
+#define refcount_count(rc) ((rc)->rc_count)
+#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1)
+#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1)
+#define refcount_add_many(rc, number, holder) \
+ atomic_add_64_nv(&(rc)->rc_count, number)
+#define refcount_remove_many(rc, number, holder) \
+ atomic_add_64_nv(&(rc)->rc_count, -number)
+
+#define refcount_init()
+#define refcount_fini()
+
+#endif /* DEBUG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_REFCOUNT_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
new file mode 100644
index 000000000000..2bcf4c8a3299
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
@@ -0,0 +1,491 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPA_H
+#define _SYS_SPA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/avl.h>
+#include <sys/zfs_context.h>
+#include <sys/nvpair.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Forward references that lots of things need.
+ */
+typedef struct spa spa_t;
+typedef struct vdev vdev_t;
+typedef struct metaslab metaslab_t;
+typedef struct zilog zilog_t;
+typedef struct traverse_handle traverse_handle_t;
+struct dsl_pool;
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
+#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
+#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
+#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
+
+#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
+#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
+
+#define BF32_SET(x, low, len, val) \
+ ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len))
+#define BF64_SET(x, low, len, val) \
+ ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len))
+
+#define BF32_GET_SB(x, low, len, shift, bias) \
+ ((BF32_GET(x, low, len) + (bias)) << (shift))
+#define BF64_GET_SB(x, low, len, shift, bias) \
+ ((BF64_GET(x, low, len) + (bias)) << (shift))
+
+#define BF32_SET_SB(x, low, len, shift, bias, val) \
+ BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define BF64_SET_SB(x, low, len, shift, bias, val) \
+ BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
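
An illustrative round trip through these macros (the values are arbitrary; the shift of 9 and bias of 1 mirror how block sizes are encoded later in this header):

uint64_t w = 0;

BF64_SET(w, 32, 8, 0x2a);		/* store 0x2a in bits 32..39 */
/* BF64_GET(w, 32, 8) now yields 0x2a */

BF64_SET_SB(w, 0, 16, 9, 1, 4096);	/* stores (4096 >> 9) - 1 == 7 in bits 0..15 */
/* BF64_GET_SB(w, 0, 16, 9, 1) yields (7 + 1) << 9 == 4096 */
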
+
+/*
+ * We currently support nine block sizes, from 512 bytes to 128K.
+ * We could go higher, but the benefits are near-zero and the cost
+ * of COWing a giant block to modify one byte would become excessive.
+ */
+#define SPA_MINBLOCKSHIFT 9
+#define SPA_MAXBLOCKSHIFT 17
+#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
+
+#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
+
+/*
+ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
+ * The ASIZE encoding should be at least 64 times larger (6 more bits)
+ * to support up to 4-way RAID-Z mirror mode with worst-case gang block
+ * overhead, three DVAs per bp, plus one more bit in case we do anything
+ * else that expands the ASIZE.
+ */
+#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
+#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
+#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
+
+/*
+ * All SPA data is represented by 128-bit data virtual addresses (DVAs).
+ * The members of the dva_t should be considered opaque outside the SPA.
+ */
+typedef struct dva {
+ uint64_t dva_word[2];
+} dva_t;
+
+/*
+ * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
+ */
+typedef struct zio_cksum {
+ uint64_t zc_word[4];
+} zio_cksum_t;
+
+/*
+ * Each block is described by its DVAs, time of birth, checksum, etc.
+ * The word-by-word, bit-by-bit layout of the blkptr is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | vdev1 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 1 |G| offset1 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 2 | vdev2 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 3 |G| offset2 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 4 | vdev3 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 5 |G| offset3 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 8 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 9 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | fill count |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * c | checksum[0] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * d | checksum[1] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * e | checksum[2] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * f | checksum[3] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * vdev virtual device ID
+ * offset offset into virtual device
+ * LSIZE logical size
+ * PSIZE physical size (after compression)
+ * ASIZE allocated size (including RAID-Z parity and gang block headers)
+ * GRID RAID-Z layout information (reserved for future use)
+ * cksum checksum function
+ * comp compression function
+ * G gang block indicator
+ * E endianness
+ * type DMU object type
+ * lvl level of indirection
+ * birth txg transaction group in which the block was born
+ * fill count number of non-zero blocks under this bp
+ * checksum[4] 256-bit checksum of the data this bp describes
+ */
+typedef struct blkptr {
+ dva_t blk_dva[3]; /* 128-bit Data Virtual Address */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[3]; /* Extra space for the future */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
+#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
+#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+
+/*
+ * Macros to get and set fields in a bp or DVA.
+ */
+#define DVA_GET_ASIZE(dva) \
+ BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_ASIZE(dva, x) \
+ BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
+#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
+
+#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
+#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
+
+#define DVA_GET_OFFSET(dva) \
+ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_OFFSET(dva, x) \
+ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
+#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
+
+#define BP_GET_LSIZE(bp) \
+ (BP_IS_HOLE(bp) ? 0 : \
+ BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_LSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_PSIZE(bp) \
+ BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define BP_SET_PSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
+#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+
+#define BP_GET_ASIZE(bp) \
+ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define BP_GET_UCSIZE(bp) \
+ ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
+	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
+
+#define BP_GET_NDVAS(bp) \
+ (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define BP_COUNT_GANG(bp) \
+ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \
+ DVA_GET_GANG(&(bp)->blk_dva[1]) + \
+ DVA_GET_GANG(&(bp)->blk_dva[2]))
+
+#define DVA_EQUAL(dva1, dva2) \
+ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
+ (dva1)->dva_word[0] == (dva2)->dva_word[0])
+
+#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
+ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
+ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \
+ ((zc1).zc_word[2] - (zc2).zc_word[2]) | \
+ ((zc1).zc_word[3] - (zc2).zc_word[3])))
+
+
+#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
+
+#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
+{ \
+ (zcp)->zc_word[0] = w0; \
+ (zcp)->zc_word[1] = w1; \
+ (zcp)->zc_word[2] = w2; \
+ (zcp)->zc_word[3] = w3; \
+}
+
+#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
+#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
+#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
+#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
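
A brief sketch of reading the layout above back through these accessors (bp is a hypothetical pointer to a valid blkptr_t):

uint64_t lsize = BP_GET_LSIZE(bp);	/* word 6, bits 0..15: 512-byte units, bias 1; 0 for a hole */
uint64_t psize = BP_GET_PSIZE(bp);	/* word 6, bits 16..31: size after compression */
int ndvas = BP_GET_NDVAS(bp);		/* how many of the three DVAs are actually in use */
uint64_t vd = DVA_GET_VDEV(&bp->blk_dva[0]);	/* vdev id of the first copy */
uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);	/* byte offset into that vdev */
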
+
+#define BP_ZERO(bp) \
+{ \
+ (bp)->blk_dva[0].dva_word[0] = 0; \
+ (bp)->blk_dva[0].dva_word[1] = 0; \
+ (bp)->blk_dva[1].dva_word[0] = 0; \
+ (bp)->blk_dva[1].dva_word[1] = 0; \
+ (bp)->blk_dva[2].dva_word[0] = 0; \
+ (bp)->blk_dva[2].dva_word[1] = 0; \
+ (bp)->blk_prop = 0; \
+ (bp)->blk_pad[0] = 0; \
+ (bp)->blk_pad[1] = 0; \
+ (bp)->blk_pad[2] = 0; \
+ (bp)->blk_birth = 0; \
+ (bp)->blk_fill = 0; \
+ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
+}
+
+/*
+ * Note: the byteorder is either 0 or -1, both of which are palindromes.
+ * This simplifies the endianness handling a bit.
+ */
+#ifdef _BIG_ENDIAN
+#define ZFS_HOST_BYTEORDER (0ULL)
+#else
+#define ZFS_HOST_BYTEORDER (-1ULL)
+#endif
+
+#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
+
+#define BP_SPRINTF_LEN 320
+
+#include <sys/dmu.h>
+
+#define BP_GET_BUFC_TYPE(bp) \
+ (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
+	ARC_BUFC_METADATA : ARC_BUFC_DATA)
+/*
+ * Routines found in spa.c
+ */
+
+/* state manipulation functions */
+extern int spa_open(const char *pool, spa_t **, void *tag);
+extern int spa_get_stats(const char *pool, nvlist_t **config,
+ char *altroot, size_t buflen);
+extern int spa_create(const char *pool, nvlist_t *config, const char *altroot);
+extern int spa_import(const char *pool, nvlist_t *config, const char *altroot);
+extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
+extern int spa_destroy(char *pool);
+extern int spa_export(char *pool, nvlist_t **oldconfig);
+extern int spa_reset(char *pool);
+extern void spa_async_request(spa_t *spa, int flag);
+extern void spa_async_suspend(spa_t *spa);
+extern void spa_async_resume(spa_t *spa);
+extern spa_t *spa_inject_addref(char *pool);
+extern void spa_inject_delref(spa_t *spa);
+
+#define SPA_ASYNC_REOPEN 0x01
+#define SPA_ASYNC_REPLACE_DONE 0x02
+#define SPA_ASYNC_SCRUB 0x04
+#define SPA_ASYNC_RESILVER 0x08
+#define SPA_ASYNC_CONFIG_UPDATE 0x10
+
+/* device manipulation */
+extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
+extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
+ int replacing);
+extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done);
+extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
+extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
+
+/* spare state (which is global across all pools) */
+extern void spa_spare_add(vdev_t *vd);
+extern void spa_spare_remove(vdev_t *vd);
+extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool);
+extern void spa_spare_activate(vdev_t *vd);
+
+/* scrubbing */
+extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force);
+extern void spa_scrub_suspend(spa_t *spa);
+extern void spa_scrub_resume(spa_t *spa);
+extern void spa_scrub_restart(spa_t *spa, uint64_t txg);
+
+/* spa syncing */
+extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
+extern void spa_sync_allpools(void);
+
+/*
+ * SPA configuration functions in spa_config.c
+ */
+
+#define SPA_CONFIG_UPDATE_POOL 0
+#define SPA_CONFIG_UPDATE_VDEVS 1
+
+extern void spa_config_sync(void);
+extern void spa_config_load(void);
+extern nvlist_t *spa_all_configs(uint64_t *);
+extern void spa_config_set(spa_t *spa, nvlist_t *config);
+extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int getstats);
+extern void spa_config_update(spa_t *spa, int what);
+
+/*
+ * Miscellaneous SPA routines in spa_misc.c
+ */
+
+/* Namespace manipulation */
+extern spa_t *spa_lookup(const char *name);
+extern spa_t *spa_add(const char *name, const char *altroot);
+extern void spa_remove(spa_t *spa);
+extern spa_t *spa_next(spa_t *prev);
+
+/* Refcount functions */
+extern void spa_open_ref(spa_t *spa, void *tag);
+extern void spa_close(spa_t *spa, void *tag);
+extern boolean_t spa_refcount_zero(spa_t *spa);
+
+/* Pool configuration lock */
+extern void spa_config_enter(spa_t *spa, krw_t rw, void *tag);
+extern void spa_config_exit(spa_t *spa, void *tag);
+extern boolean_t spa_config_held(spa_t *spa, krw_t rw);
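
A small usage sketch implied by these declarations (FTAG comes from refcount.h above; spa is hypothetical):

spa_config_enter(spa, RW_READER, FTAG);
/* ... walk the vdev tree or read the pool configuration ... */
spa_config_exit(spa, FTAG);
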
+
+/* Pool vdev add/remove lock */
+extern uint64_t spa_vdev_enter(spa_t *spa);
+extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
+
+/* Accessor functions */
+extern krwlock_t *spa_traverse_rwlock(spa_t *spa);
+extern int spa_traverse_wanted(spa_t *spa);
+extern struct dsl_pool *spa_get_dsl(spa_t *spa);
+extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
+extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
+extern void spa_altroot(spa_t *, char *, size_t);
+extern int spa_sync_pass(spa_t *spa);
+extern char *spa_name(spa_t *spa);
+extern uint64_t spa_guid(spa_t *spa);
+extern uint64_t spa_last_synced_txg(spa_t *spa);
+extern uint64_t spa_first_txg(spa_t *spa);
+extern uint64_t spa_version(spa_t *spa);
+extern int spa_state(spa_t *spa);
+extern uint64_t spa_freeze_txg(spa_t *spa);
+struct metaslab_class;
+extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa);
+extern uint64_t spa_get_alloc(spa_t *spa);
+extern uint64_t spa_get_space(spa_t *spa);
+extern uint64_t spa_get_dspace(spa_t *spa);
+extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern int spa_max_replication(spa_t *spa);
+extern int spa_busy(void);
+
+/* Miscellaneous support routines */
+extern int spa_rename(const char *oldname, const char *newname);
+extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
+extern char *spa_strdup(const char *);
+extern void spa_strfree(char *);
+extern uint64_t spa_get_random(uint64_t range);
+extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);
+extern void spa_freeze(spa_t *spa);
+extern void spa_upgrade(spa_t *spa);
+extern void spa_evict_all(void);
+extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid);
+extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
+extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
+
+/* history logging */
+extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
+extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
+ char *his_buf);
+extern int spa_history_log(spa_t *spa, const char *his_buf,
+ uint64_t pool_create);
+
+/* error handling */
+struct zbookmark;
+struct zio;
+extern void spa_log_error(spa_t *spa, struct zio *zio);
+extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t stateoroffset, uint64_t length);
+extern void zfs_post_ok(spa_t *spa, vdev_t *vd);
+extern uint64_t spa_get_errlog_size(spa_t *spa);
+extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
+extern void spa_errlog_rotate(spa_t *spa);
+extern void spa_errlog_drain(spa_t *spa);
+extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
+extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
+
+/* Initialization and termination */
+extern void spa_init(int flags);
+extern void spa_fini(void);
+
+/* properties */
+extern int spa_set_props(spa_t *spa, nvlist_t *nvp);
+extern int spa_get_props(spa_t *spa, nvlist_t **nvp);
+extern void spa_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
+extern boolean_t spa_has_bootfs(spa_t *spa);
+
+#ifdef ZFS_DEBUG
+#define dprintf_bp(bp, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
+ sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
+ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
+ kmem_free(__blkbuf, BP_SPRINTF_LEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_bp(bp, fmt, ...)
+#endif
+
+extern int spa_mode; /* mode, e.g. FREAD | FWRITE */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
new file mode 100644
index 000000000000..8c57123ad4b8
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -0,0 +1,168 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPA_IMPL_H
+#define _SYS_SPA_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/metaslab.h>
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/uberblock_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/refcount.h>
+#include <sys/bplist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct spa_config_lock {
+ kmutex_t scl_lock;
+ refcount_t scl_count;
+ kthread_t *scl_writer;
+ kcondvar_t scl_cv;
+} spa_config_lock_t;
+
+typedef struct spa_error_entry {
+ zbookmark_t se_bookmark;
+ char *se_name;
+ avl_node_t se_avl;
+} spa_error_entry_t;
+
+typedef struct spa_history_phys {
+ uint64_t sh_pool_create_len; /* ending offset of zpool create */
+ uint64_t sh_phys_max_off; /* physical EOF */
+ uint64_t sh_bof; /* logical BOF */
+ uint64_t sh_eof; /* logical EOF */
+ uint64_t sh_records_lost; /* num of records overwritten */
+} spa_history_phys_t;
+
+typedef struct spa_props {
+ nvlist_t *spa_props_nvp;
+ list_node_t spa_list_node;
+} spa_props_t;
+
+struct spa {
+ /*
+ * Fields protected by spa_namespace_lock.
+ */
+ char *spa_name; /* pool name */
+ avl_node_t spa_avl; /* node in spa_namespace_avl */
+ nvlist_t *spa_config; /* last synced config */
+ nvlist_t *spa_config_syncing; /* currently syncing config */
+ uint64_t spa_config_txg; /* txg of last config change */
+ kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */
+ int spa_sync_pass; /* iterate-to-convergence */
+ int spa_state; /* pool state */
+ int spa_inject_ref; /* injection references */
+ uint8_t spa_traverse_wanted; /* traverse lock wanted */
+ uint8_t spa_sync_on; /* sync threads are running */
+ spa_load_state_t spa_load_state; /* current load operation */
+ taskq_t *spa_zio_issue_taskq[ZIO_TYPES];
+ taskq_t *spa_zio_intr_taskq[ZIO_TYPES];
+ dsl_pool_t *spa_dsl_pool;
+ metaslab_class_t *spa_normal_class; /* normal data class */
+ uint64_t spa_first_txg; /* first txg after spa_open() */
+ uint64_t spa_final_txg; /* txg of export/destroy */
+ uint64_t spa_freeze_txg; /* freeze pool at this txg */
+ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
+ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
+ vdev_t *spa_root_vdev; /* top-level vdev container */
+ uint64_t spa_load_guid; /* initial guid for spa_load */
+ list_t spa_dirty_list; /* vdevs with dirty labels */
+ uint64_t spa_spares_object; /* MOS object for spare list */
+ nvlist_t *spa_sparelist; /* cached spare config */
+ vdev_t **spa_spares; /* available hot spares */
+ int spa_nspares; /* number of hot spares */
+ boolean_t spa_sync_spares; /* sync the spares list */
+ uint64_t spa_config_object; /* MOS object for pool config */
+ uint64_t spa_syncing_txg; /* txg currently syncing */
+ uint64_t spa_sync_bplist_obj; /* object for deferred frees */
+ bplist_t spa_sync_bplist; /* deferred-free bplist */
+ krwlock_t spa_traverse_lock; /* traverse vs. spa_sync() */
+ uberblock_t spa_ubsync; /* last synced uberblock */
+ uberblock_t spa_uberblock; /* current uberblock */
+ kmutex_t spa_scrub_lock; /* resilver/scrub lock */
+ kthread_t *spa_scrub_thread; /* scrub/resilver thread */
+ traverse_handle_t *spa_scrub_th; /* scrub traverse handle */
+ uint64_t spa_scrub_restart_txg; /* need to restart */
+ uint64_t spa_scrub_mintxg; /* min txg we'll scrub */
+ uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */
+ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
+ uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */
+ uint64_t spa_scrub_errors; /* scrub I/O error count */
+ int spa_scrub_suspended; /* tell scrubber to suspend */
+ kcondvar_t spa_scrub_cv; /* scrub thread state change */
+ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
+ uint8_t spa_scrub_stop; /* tell scrubber to stop */
+ uint8_t spa_scrub_active; /* active or suspended? */
+ uint8_t spa_scrub_type; /* type of scrub we're doing */
+ uint8_t spa_scrub_finished; /* indicator to rotate logs */
+ kmutex_t spa_async_lock; /* protect async state */
+ kthread_t *spa_async_thread; /* thread doing async task */
+ int spa_async_suspended; /* async tasks suspended */
+ kcondvar_t spa_async_cv; /* wait for thread_exit() */
+ uint16_t spa_async_tasks; /* async task mask */
+ char *spa_root; /* alternate root directory */
+ kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */
+ uint64_t spa_ena; /* spa-wide ereport ENA */
+	boolean_t spa_last_open_failed;	/* true if last open failed */
+ kmutex_t spa_errlog_lock; /* error log lock */
+ uint64_t spa_errlog_last; /* last error log object */
+ uint64_t spa_errlog_scrub; /* scrub error log object */
+ kmutex_t spa_errlist_lock; /* error list/ereport lock */
+ avl_tree_t spa_errlist_last; /* last error list */
+ avl_tree_t spa_errlist_scrub; /* scrub error list */
+ uint64_t spa_deflate; /* should we deflate? */
+ uint64_t spa_history; /* history object */
+ kmutex_t spa_history_lock; /* history lock */
+ vdev_t *spa_pending_vdev; /* pending vdev additions */
+ nvlist_t **spa_pending_spares; /* pending spare additions */
+ uint_t spa_pending_nspares; /* # pending spares */
+ kmutex_t spa_props_lock; /* property lock */
+ uint64_t spa_pool_props_object; /* object for properties */
+ uint64_t spa_bootfs; /* default boot filesystem */
+	/*
+	 * spa_config_lock and spa_refcount must be the last elements because
+	 * they change size based on compilation options. In order for the
+	 * MDB module to function correctly, the other fields must remain in
+	 * the same location.
+	 */
+ spa_config_lock_t spa_config_lock; /* configuration changes */
+ refcount_t spa_refcount; /* number of opens */
+};
+
+extern const char *spa_config_dir;
+extern kmutex_t spa_namespace_lock;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
new file mode 100644
index 000000000000..db9daef1f156
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
@@ -0,0 +1,162 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPACE_MAP_H
+#define _SYS_SPACE_MAP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct space_map_ops space_map_ops_t;
+
+typedef struct space_map {
+ avl_tree_t sm_root; /* AVL tree of map segments */
+ uint64_t sm_space; /* sum of all segments in the map */
+ uint64_t sm_start; /* start of map */
+ uint64_t sm_size; /* size of map */
+ uint8_t sm_shift; /* unit shift */
+ uint8_t sm_pad[3]; /* unused */
+ uint8_t sm_loaded; /* map loaded? */
+ uint8_t sm_loading; /* map loading? */
+ kcondvar_t sm_load_cv; /* map load completion */
+ space_map_ops_t *sm_ops; /* space map block picker ops vector */
+ void *sm_ppd; /* picker-private data */
+ kmutex_t *sm_lock; /* pointer to lock that protects map */
+} space_map_t;
+
+typedef struct space_seg {
+ avl_node_t ss_node; /* AVL node */
+ uint64_t ss_start; /* starting offset of this segment */
+ uint64_t ss_end; /* ending offset (non-inclusive) */
+} space_seg_t;
+
+typedef struct space_map_obj {
+ uint64_t smo_object; /* on-disk space map object */
+ uint64_t smo_objsize; /* size of the object */
+ uint64_t smo_alloc; /* space allocated from the map */
+} space_map_obj_t;
+
+struct space_map_ops {
+ void (*smop_load)(space_map_t *sm);
+ void (*smop_unload)(space_map_t *sm);
+ uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
+ void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
+ void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
+};
+
+/*
+ * debug entry
+ *
+ * 1 3 10 50
+ * ,---+--------+------------+---------------------------------.
+ * | 1 | action | syncpass | txg (lower bits) |
+ * `---+--------+------------+---------------------------------'
+ * 63 62 60 59 50 49 0
+ *
+ *
+ *
+ * non-debug entry
+ *
+ * 1 47 1 15
+ * ,-----------------------------------------------------------.
+ * | 0 | offset (sm_shift units) | type | run |
+ * `-----------------------------------------------------------'
+ * 63 62 17 16 15 0
+ */
+
+/* All this stuff takes and returns bytes */
+#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1)
+#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15)
+#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
+#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
+#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47)
+#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47)
+#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1)
+#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1)
+
+#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3)
+#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3)
+
+#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
+#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
+
+#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
+#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
+
+#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
+
+#define SM_ALLOC 0x0
+#define SM_FREE 0x1
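
An illustrative decode of a single on-disk entry per the diagrams above; the entry value and map pointer are hypothetical, and the run length is assumed to be recorded in the same sm_shift units as the offset.

uint64_t e;	/* one 64-bit word read from the space map object */

if (SM_DEBUG_DECODE(e)) {
	uint64_t action = SM_DEBUG_ACTION_DECODE(e);	/* what was recorded */
	uint64_t pass = SM_DEBUG_SYNCPASS_DECODE(e);	/* sync pass number */
	uint64_t txg = SM_DEBUG_TXG_DECODE(e);		/* low 50 bits of the txg */
} else {
	uint64_t start = sm->sm_start + (SM_OFFSET_DECODE(e) << sm->sm_shift);
	uint64_t size = SM_RUN_DECODE(e) << sm->sm_shift;
	int maptype = SM_TYPE_DECODE(e);		/* SM_ALLOC or SM_FREE */
}
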
+
+/*
+ * The data for a given space map can be kept on blocks of any size.
+ * Larger blocks entail fewer i/o operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more i/o bandwidth
+ * when only a few blocks have changed since the last transaction group.
+ * This could use a lot more research, but for now, set the freelist
+ * block size to 4k (2^12).
+ */
+#define SPACE_MAP_BLOCKSHIFT 12
+
+typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
+
+extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
+ uint8_t shift, kmutex_t *lp);
+extern void space_map_destroy(space_map_t *sm);
+extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
+extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_vacate(space_map_t *sm,
+ space_map_func_t *func, space_map_t *mdest);
+extern void space_map_walk(space_map_t *sm,
+ space_map_func_t *func, space_map_t *mdest);
+extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_union(space_map_t *smd, space_map_t *sms);
+
+extern void space_map_load_wait(space_map_t *sm);
+extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
+ uint8_t maptype, space_map_obj_t *smo, objset_t *os);
+extern void space_map_unload(space_map_t *sm);
+
+extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
+extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
+
+extern void space_map_sync(space_map_t *sm, uint8_t maptype,
+ space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
+extern void space_map_truncate(space_map_obj_t *smo,
+ objset_t *os, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPACE_MAP_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
new file mode 100644
index 000000000000..dae129c2e5a4
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TXG_H
+#define _SYS_TXG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */
+#define TXG_SIZE 4 /* next power of 2 */
+#define TXG_MASK (TXG_SIZE - 1) /* mask for size */
+#define TXG_INITIAL TXG_SIZE /* initial txg */
+#define TXG_IDX (txg & TXG_MASK)
+
+#define TXG_WAIT 1ULL
+#define TXG_NOWAIT 2ULL
+
+typedef struct tx_cpu tx_cpu_t;
+
+typedef struct txg_handle {
+ tx_cpu_t *th_cpu;
+ uint64_t th_txg;
+} txg_handle_t;
+
+typedef struct txg_node {
+ struct txg_node *tn_next[TXG_SIZE];
+ uint8_t tn_member[TXG_SIZE];
+} txg_node_t;
+
+typedef struct txg_list {
+ kmutex_t tl_lock;
+ size_t tl_offset;
+ txg_node_t *tl_head[TXG_SIZE];
+} txg_list_t;
+
+struct dsl_pool;
+
+extern void txg_init(struct dsl_pool *dp, uint64_t txg);
+extern void txg_fini(struct dsl_pool *dp);
+extern void txg_sync_start(struct dsl_pool *dp);
+extern void txg_sync_stop(struct dsl_pool *dp);
+extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
+extern void txg_rele_to_quiesce(txg_handle_t *txghp);
+extern void txg_rele_to_sync(txg_handle_t *txghp);
+extern void txg_suspend(struct dsl_pool *dp);
+extern void txg_resume(struct dsl_pool *dp);
+
+/*
+ * Wait until the given transaction group has finished syncing.
+ * Try to make this happen as soon as possible (e.g. kick off any
+ * necessary syncs immediately). If txg == 0, wait for the currently open
+ * txg to finish syncing.
+ */
+extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Wait until the given transaction group, or one after it, is
+ * the open transaction group. Try to make this happen as soon
+ * as possible (e.g. kick off any necessary syncs immediately).
+ * If txg == 0, wait for the next open txg.
+ */
+extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Returns TRUE if we are "backed up" waiting for the syncing
+ * transaction to complete; otherwise returns FALSE.
+ */
+extern int txg_stalled(struct dsl_pool *dp);
+
+/*
+ * Per-txg object lists.
+ */
+
+#define TXG_CLEAN(txg) ((txg) - 1)
+
+extern void txg_list_create(txg_list_t *tl, size_t offset);
+extern void txg_list_destroy(txg_list_t *tl);
+extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
+extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
+extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_head(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TXG_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
new file mode 100644
index 000000000000..45a138afaac3
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+	atomic_add_64_nv(&(rc)->rc_count, -(number))
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TXG_IMPL_H
+#define _SYS_TXG_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct tx_cpu {
+ kmutex_t tc_lock;
+ kcondvar_t tc_cv[TXG_SIZE];
+ uint64_t tc_count[TXG_SIZE];
+ char tc_pad[16];
+};
+
+typedef struct tx_state {
+ tx_cpu_t *tx_cpu; /* protects right to enter txg */
+ kmutex_t tx_sync_lock; /* protects tx_state_t */
+ krwlock_t tx_suspend;
+ uint64_t tx_open_txg; /* currently open txg id */
+ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
+ uint64_t tx_syncing_txg; /* currently syncing txg id */
+ uint64_t tx_synced_txg; /* last synced txg id */
+
+ uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */
+ uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */
+
+ kcondvar_t tx_sync_more_cv;
+ kcondvar_t tx_sync_done_cv;
+ kcondvar_t tx_quiesce_more_cv;
+ kcondvar_t tx_quiesce_done_cv;
+ kcondvar_t tx_timeout_exit_cv;
+ kcondvar_t tx_exit_cv; /* wait for all threads to exit */
+
+ uint8_t tx_threads; /* number of threads */
+ uint8_t tx_exiting; /* set when we're exiting */
+
+ kthread_t *tx_sync_thread;
+ kthread_t *tx_quiesce_thread;
+ kthread_t *tx_timelimit_thread;
+} tx_state_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TXG_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
new file mode 100644
index 000000000000..93d936ae4b18
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UBERBLOCK_H
+#define _SYS_UBERBLOCK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct uberblock uberblock_t;
+
+extern int uberblock_verify(uberblock_t *ub);
+extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UBERBLOCK_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
new file mode 100644
index 000000000000..ab0f2dcf8c1b
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UBERBLOCK_IMPL_H
+#define _SYS_UBERBLOCK_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/uberblock.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The uberblock version is incremented whenever an incompatible on-disk
+ * format change is made to the SPA, DMU, or ZAP.
+ *
+ * Note: the first two fields should never be moved. When a storage pool
+ * is opened, the uberblock must be read off the disk before the version
+ * can be checked. If the ub_version field is moved, we may not detect
+ * version mismatch. If the ub_magic field is moved, applications that
+ * expect the magic number in the first word won't work.
+ */
+#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
+#define UBERBLOCK_SHIFT 10 /* up to 1K */
+
+struct uberblock {
+ uint64_t ub_magic; /* UBERBLOCK_MAGIC */
+ uint64_t ub_version; /* ZFS_VERSION */
+ uint64_t ub_txg; /* txg of last sync */
+ uint64_t ub_guid_sum; /* sum of all vdev guids */
+ uint64_t ub_timestamp; /* UTC time of last sync */
+ blkptr_t ub_rootbp; /* MOS objset_phys_t */
+};
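
A short sketch of the check the note above implies; BSWAP_64 is assumed to be available from the platform byte-order headers, and ub is a hypothetical pointer to a just-read label buffer.

if (ub->ub_magic == UBERBLOCK_MAGIC) {
	/* native endianness; ub_version may now be examined */
} else if (ub->ub_magic == BSWAP_64(UBERBLOCK_MAGIC)) {
	/* written by a system of the opposite endianness; byteswap before use */
} else {
	/* not an uberblock */
}
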
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UBERBLOCK_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
new file mode 100644
index 000000000000..c8c177e3ca6c
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UNIQUE_H
+#define _SYS_UNIQUE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* The number of significant bits in each unique value. */
+#define UNIQUE_BITS 56
+
+void unique_init(void);
+
+/* Return a new unique value. */
+uint64_t unique_create(void);
+
+/* Return a unique value, which equals the one passed in if possible. */
+uint64_t unique_insert(uint64_t value);
+
+/* Indicate that this value no longer needs to be uniquified against. */
+void unique_remove(uint64_t value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UNIQUE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
new file mode 100644
index 000000000000..31208116256d
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
@@ -0,0 +1,132 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_H
+#define _SYS_VDEV_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/space_map.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern boolean_t zfs_nocacheflush;
+
+/*
+ * Fault injection modes.
+ */
+#define VDEV_FAULT_NONE 0
+#define VDEV_FAULT_RANDOM 1
+#define VDEV_FAULT_COUNT 2
+
+extern int vdev_open(vdev_t *);
+extern int vdev_validate(vdev_t *);
+extern void vdev_close(vdev_t *);
+extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
+extern void vdev_init(vdev_t *, uint64_t txg);
+extern void vdev_reopen(vdev_t *);
+extern int vdev_validate_spare(vdev_t *);
+
+extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
+extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
+extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
+extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
+extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+ int scrub_done);
+
+extern const char *vdev_description(vdev_t *vd);
+
+extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
+extern void vdev_metaslab_fini(vdev_t *vd);
+
+extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
+extern void vdev_stat_update(zio_t *zio);
+extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
+ boolean_t complete);
+extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
+extern void vdev_propagate_state(vdev_t *vd);
+extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
+ vdev_aux_t aux);
+
+extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
+ int64_t alloc_delta);
+
+extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
+
+extern void vdev_io_start(zio_t *zio);
+extern void vdev_io_done(zio_t *zio);
+
+extern int vdev_online(spa_t *spa, uint64_t guid);
+extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp);
+extern void vdev_clear(spa_t *spa, vdev_t *vd);
+
+extern int vdev_error_inject(vdev_t *vd, zio_t *zio);
+extern int vdev_is_dead(vdev_t *vd);
+
+extern void vdev_cache_init(vdev_t *vd);
+extern void vdev_cache_fini(vdev_t *vd);
+extern int vdev_cache_read(zio_t *zio);
+extern void vdev_cache_write(zio_t *zio);
+
+extern void vdev_queue_init(vdev_t *vd);
+extern void vdev_queue_fini(vdev_t *vd);
+extern zio_t *vdev_queue_io(zio_t *zio);
+extern void vdev_queue_io_done(zio_t *zio);
+
+extern void vdev_config_dirty(vdev_t *vd);
+extern void vdev_config_clean(vdev_t *vd);
+extern int vdev_config_sync(vdev_t *vd, uint64_t txg);
+
+extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
+ boolean_t getstats, boolean_t isspare);
+
+/*
+ * Label routines
+ */
+struct uberblock;
+extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
+extern nvlist_t *vdev_label_read_config(vdev_t *vd);
+extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub);
+
+typedef enum {
+ VDEV_LABEL_CREATE, /* create/add a new device */
+ VDEV_LABEL_REPLACE, /* replace an existing device */
+ VDEV_LABEL_SPARE, /* add a new hot spare */
+ VDEV_LABEL_REMOVE /* remove an existing device */
+} vdev_labeltype_t;
+
+extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
new file mode 100644
index 000000000000..95536a77db9a
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_DISK_H
+#define _SYS_VDEV_DISK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vdev.h>
+#ifdef _KERNEL
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_disk {
+ ddi_devid_t vd_devid;
+ char *vd_minor;
+ ldi_handle_t vd_lh;
+} vdev_disk_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_DISK_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
new file mode 100644
index 000000000000..cd496735778c
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_FILE_H
+#define _SYS_VDEV_FILE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vdev.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_file {
+ vnode_t *vf_vnode;
+} vdev_file_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_FILE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
new file mode 100644
index 000000000000..aba756713f9c
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -0,0 +1,298 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_IMPL_H
+#define _SYS_VDEV_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+#include <sys/metaslab.h>
+#include <sys/nvpair.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/dkio.h>
+#include <sys/uberblock_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Virtual device descriptors.
+ *
+ * All storage pool operations go through the virtual device framework,
+ * which provides data replication and I/O scheduling.
+ */
+
+/*
+ * Forward declarations that lots of things need.
+ */
+typedef struct vdev_queue vdev_queue_t;
+typedef struct vdev_cache vdev_cache_t;
+typedef struct vdev_cache_entry vdev_cache_entry_t;
+
+/*
+ * Virtual device operations
+ */
+typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift);
+typedef void vdev_close_func_t(vdev_t *vd);
+typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
+typedef void vdev_io_start_func_t(zio_t *zio);
+typedef void vdev_io_done_func_t(zio_t *zio);
+typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
+
+typedef struct vdev_ops {
+ vdev_open_func_t *vdev_op_open;
+ vdev_close_func_t *vdev_op_close;
+ vdev_asize_func_t *vdev_op_asize;
+ vdev_io_start_func_t *vdev_op_io_start;
+ vdev_io_done_func_t *vdev_op_io_done;
+ vdev_state_change_func_t *vdev_op_state_change;
+ char vdev_op_type[16];
+ boolean_t vdev_op_leaf;
+} vdev_ops_t;
+
+/*
+ * Virtual device properties
+ */
+struct vdev_cache_entry {
+ char *ve_data;
+ uint64_t ve_offset;
+ uint64_t ve_lastused;
+ avl_node_t ve_offset_node;
+ avl_node_t ve_lastused_node;
+ uint32_t ve_hits;
+ uint16_t ve_missed_update;
+ zio_t *ve_fill_io;
+};
+
+struct vdev_cache {
+ avl_tree_t vc_offset_tree;
+ avl_tree_t vc_lastused_tree;
+ kmutex_t vc_lock;
+};
+
+struct vdev_queue {
+ avl_tree_t vq_deadline_tree;
+ avl_tree_t vq_read_tree;
+ avl_tree_t vq_write_tree;
+ avl_tree_t vq_pending_tree;
+ kmutex_t vq_lock;
+};
+
+/*
+ * Virtual device descriptor
+ */
+struct vdev {
+ /*
+ * Common to all vdev types.
+ */
+ uint64_t vdev_id; /* child number in vdev parent */
+ uint64_t vdev_guid; /* unique ID for this vdev */
+ uint64_t vdev_guid_sum; /* self guid + all child guids */
+ uint64_t vdev_asize; /* allocatable device capacity */
+ uint64_t vdev_ashift; /* block alignment shift */
+ uint64_t vdev_state; /* see VDEV_STATE_* #defines */
+ uint64_t vdev_prevstate; /* used when reopening a vdev */
+ vdev_ops_t *vdev_ops; /* vdev operations */
+ spa_t *vdev_spa; /* spa for this vdev */
+ void *vdev_tsd; /* type-specific data */
+ vdev_t *vdev_top; /* top-level vdev */
+ vdev_t *vdev_parent; /* parent vdev */
+ vdev_t **vdev_child; /* array of children */
+ uint64_t vdev_children; /* number of children */
+ space_map_t vdev_dtl_map; /* dirty time log in-core state */
+ space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */
+ vdev_stat_t vdev_stat; /* virtual device statistics */
+
+ /*
+ * Top-level vdev state.
+ */
+ uint64_t vdev_ms_array; /* metaslab array object */
+ uint64_t vdev_ms_shift; /* metaslab size shift */
+ uint64_t vdev_ms_count; /* number of metaslabs */
+ metaslab_group_t *vdev_mg; /* metaslab group */
+ metaslab_t **vdev_ms; /* metaslab array */
+ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
+ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
+ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
+ uint8_t vdev_reopen_wanted; /* async reopen wanted? */
+ list_node_t vdev_dirty_node; /* config dirty list */
+ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
+
+ /*
+ * Leaf vdev state.
+ */
+ uint64_t vdev_psize; /* physical device capacity */
+ space_map_obj_t vdev_dtl; /* dirty time log on-disk state */
+ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
+ uint64_t vdev_wholedisk; /* true if this is a whole disk */
+ uint64_t vdev_offline; /* device taken offline? */
+ uint64_t vdev_nparity; /* number of parity devices for raidz */
+ char *vdev_path; /* vdev path (if any) */
+ char *vdev_devid; /* vdev devid (if any) */
+ uint64_t vdev_fault_arg; /* fault injection parameter */
+ int vdev_fault_mask; /* zio types to fault */
+ uint8_t vdev_fault_mode; /* fault injection mode */
+ uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */
+ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */
+ uint8_t vdev_detached; /* device detached? */
+ uint64_t vdev_isspare; /* was a hot spare */
+ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
+ vdev_cache_t vdev_cache; /* physical block cache */
+ uint64_t vdev_not_present; /* not present during import */
+ hrtime_t vdev_last_try; /* last reopen time */
+ boolean_t vdev_nowritecache; /* true if flushwritecache failed */
+
+ /*
+ * For DTrace to work in userland (libzpool) context, these fields must
+ * remain at the end of the structure. DTrace will use the kernel's
+ * CTF definition for 'struct vdev', and since the size of a kmutex_t is
+ * larger in userland, the offsets for the remaining fields would be
+ * incorrect.
+ */
+ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
+ kmutex_t vdev_stat_lock; /* vdev_stat */
+};
+
+#define VDEV_SKIP_SIZE (8 << 10)
+#define VDEV_BOOT_HEADER_SIZE (8 << 10)
+#define VDEV_PHYS_SIZE (112 << 10)
+#define VDEV_UBERBLOCK_RING (128 << 10)
+
+#define VDEV_UBERBLOCK_SHIFT(vd) \
+ MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
+#define VDEV_UBERBLOCK_COUNT(vd) \
+ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
+#define VDEV_UBERBLOCK_OFFSET(vd, n) \
+ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
+#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
+
+/* ZFS boot block */
+#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL
+#define VDEV_BOOT_VERSION 1 /* version number */
+
+typedef struct vdev_boot_header {
+ uint64_t vb_magic; /* VDEV_BOOT_MAGIC */
+ uint64_t vb_version; /* VDEV_BOOT_VERSION */
+ uint64_t vb_offset; /* start offset (bytes) */
+ uint64_t vb_size; /* size (bytes) */
+ char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)];
+} vdev_boot_header_t;
+
+typedef struct vdev_phys {
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
+ zio_block_tail_t vp_zbt;
+} vdev_phys_t;
+
+typedef struct vdev_label {
+ char vl_pad[VDEV_SKIP_SIZE]; /* 8K */
+ vdev_boot_header_t vl_boot_header; /* 8K */
+ vdev_phys_t vl_vdev_phys; /* 112K */
+ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
+} vdev_label_t; /* 256K total */
+
+/*
+ * vdev_dirty() flags
+ */
+#define VDD_METASLAB 0x01
+#define VDD_DTL 0x02
+
+/*
+ * Size and offset of embedded boot loader region on each label.
+ * The total size of the first two labels plus the boot area is 4MB.
+ */
+#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
+#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
+
+/*
+ * Size of label regions at the start and end of each leaf device.
+ */
+#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
+#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
+#define VDEV_LABELS 4
+
+#define VDEV_ALLOC_LOAD 0
+#define VDEV_ALLOC_ADD 1
+#define VDEV_ALLOC_SPARE 2
+
+/*
+ * Allocate or free a vdev
+ */
+extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
+ vdev_t *parent, uint_t id, int alloctype);
+extern void vdev_free(vdev_t *vd);
+
+/*
+ * Add or remove children and parents
+ */
+extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_compact_children(vdev_t *pvd);
+extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
+extern void vdev_remove_parent(vdev_t *cvd);
+
+/*
+ * vdev sync load and sync
+ */
+extern void vdev_load(vdev_t *vd);
+extern void vdev_sync(vdev_t *vd, uint64_t txg);
+extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
+extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
+
+/*
+ * Available vdev types.
+ */
+extern vdev_ops_t vdev_root_ops;
+extern vdev_ops_t vdev_mirror_ops;
+extern vdev_ops_t vdev_replacing_ops;
+extern vdev_ops_t vdev_raidz_ops;
+#ifdef _KERNEL
+extern vdev_ops_t vdev_geom_ops;
+#else
+extern vdev_ops_t vdev_disk_ops;
+extern vdev_ops_t vdev_file_ops;
+#endif
+extern vdev_ops_t vdev_missing_ops;
+extern vdev_ops_t vdev_spare_ops;
+
+/*
+ * Common size functions
+ */
+extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
+extern uint64_t vdev_get_rsize(vdev_t *vd);
+
+/*
+ * zdb uses this tunable, so it must be declared here to make lint happy.
+ */
+extern int zfs_vdev_cache_size;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_IMPL_H */
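The label macros above imply a fixed layout: each vdev_label_t is 8K + 8K + 112K + 128K = 256K, two labels plus the 3.5M boot region reserve 4M at the front of every leaf device, and two more labels occupy 512K at the end. A small userland sketch (illustrative only, not part of the import) reproduces the arithmetic:

#include <stdio.h>

#define KB (1ULL << 10)
#define MB (1ULL << 20)

int
main(void)
{
	/* VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE + VDEV_PHYS_SIZE + VDEV_UBERBLOCK_RING */
	unsigned long long label = 8 * KB + 8 * KB + 112 * KB + 128 * KB;
	unsigned long long boot = 7ULL << 19;			/* VDEV_BOOT_SIZE, 3.5M */

	printf("label size         = %lluK\n", label / KB);			/* 256K */
	printf("label start region = %lluM\n", (2 * label + boot) / MB);	/* 4M   */
	printf("label end region   = %lluK\n", (2 * label) / KB);		/* 512K */
	return (0);
}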
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
new file mode 100644
index 000000000000..f89d9385ea38
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
@@ -0,0 +1,359 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZAP_H
+#define _SYS_ZAP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZAP - ZFS Attribute Processor
+ *
+ * The ZAP is a module which sits on top of the DMU (Data Management
+ * Unit) and implements a higher-level storage primitive using DMU
+ * objects. Its primary consumer is the ZPL (ZFS Posix Layer).
+ *
+ * A "zapobj" is a DMU object which the ZAP uses to stores attributes.
+ * Users should use only zap routines to access a zapobj - they should
+ * not access the DMU object directly using DMU routines.
+ *
+ * The attributes stored in a zapobj are name-value pairs. The name is
+ * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including
+ * terminating NULL). The value is an array of integers, which may be
+ * 1, 2, 4, or 8 bytes long. The total space used by the array (number
+ * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes.
+ * Note that an 8-byte integer value can be used to store the location
+ * (object number) of another dmu object (which may itself be a zapobj).
+ * Note that you can use a zero-length attribute to store a single bit
+ * of information - the attribute is present or not.
+ *
+ * The ZAP routines are thread-safe. However, you must observe the
+ * DMU's restriction that a transaction may not be operated on
+ * concurrently.
+ *
+ * Any of the routines that return an int may return an I/O error (EIO
+ * or ECHECKSUM).
+ *
+ *
+ * Implementation / Performance Notes:
+ *
+ * The ZAP is intended to operate most efficiently on attributes with
+ * short (49 bytes or less) names and single 8-byte values, for which
+ * the microzap will be used. The ZAP should be efficient enough so
+ * that the user does not need to cache these attributes.
+ *
+ * The ZAP's locking scheme makes its routines thread-safe. Operations
+ * on different zapobjs will be processed concurrently. Operations on
+ * the same zapobj which only read data will be processed concurrently.
+ * Operations on the same zapobj which modify data will be processed
+ * concurrently when there are many attributes in the zapobj (because
+ * the ZAP uses per-block locking - more than 128 * (number of cpus)
+ * small attributes will suffice).
+ */
+
+/*
+ * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
+ * strings) for the names of attributes, rather than a byte string
+ * bounded by an explicit length. If some day we want to support names
+ * in character sets which have embedded zeros (eg. UTF-16, UTF-32),
+ * we'll have to add routines for using length-bounded strings.
+ */
+
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZAP_MAXNAMELEN 256
+#define ZAP_MAXVALUELEN 1024
+
+/*
+ * Create a new zapobj with no attributes and return its object number.
+ */
+uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * Create a new zapobj with no attributes from the given (unallocated)
+ * object number.
+ */
+int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * The zapobj passed in must be a valid ZAP object for all of the
+ * following routines.
+ */
+
+/*
+ * Destroy this zapobj and all its attributes.
+ *
+ * Frees the object number using dmu_object_free.
+ */
+int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
+
+/*
+ * Manipulate attributes.
+ *
+ * 'integer_size' is in bytes, and must be 1, 2, 4, or 8.
+ */
+
+/*
+ * Retrieve the contents of the attribute with the given name.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ *
+ * If 'integer_size' is smaller than the attribute's integer size, the
+ * call will fail and return EINVAL.
+ *
+ * If 'integer_size' is equal to or larger than the attribute's integer
+ * size, the call will succeed and return 0. When converting to a
+ * larger integer size, the integers will be treated as unsigned (ie. no
+ * sign-extension will be performed).
+ *
+ * 'num_integers' is the length (in integers) of 'buf'.
+ *
+ * If the attribute is longer than the buffer, as many integers as will
+ * fit will be transferred to 'buf'. If the entire attribute was not
+ * transferred, the call will return EOVERFLOW.
+ */
+int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
+
+/*
+ * Create an attribute with the given name and value.
+ *
+ * If an attribute with the given name already exists, the call will
+ * fail and return EEXIST.
+ */
+int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+
+/*
+ * Set the attribute with the given name to the given value. If an
+ * attribute with the given name does not exist, it will be created. If
+ * an attribute with the given name already exists, the previous value
+ * will be overwritten. The integer_size may be different from the
+ * existing attribute's integer size, in which case the attribute's
+ * integer size will be updated to the new value.
+ */
+int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+
+/*
+ * Get the length (in integers) and the integer size of the specified
+ * attribute.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers);
+
+/*
+ * Remove the specified attribute.
+ *
+ * If the specified attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
+
+/*
+ * Returns (in *count) the number of attributes in the specified zap
+ * object.
+ */
+int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
+
+
+/*
+ * Returns (in name) the name of the entry whose value
+ * (za_first_integer) is value, or ENOENT if not found. The string
+ * pointed to by name must be at least 256 bytes long.
+ */
+int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name);
+
+struct zap;
+struct zap_leaf;
+typedef struct zap_cursor {
+ /* This structure is opaque! */
+ objset_t *zc_objset;
+ struct zap *zc_zap;
+ struct zap_leaf *zc_leaf;
+ uint64_t zc_zapobj;
+ uint64_t zc_hash;
+ uint32_t zc_cd;
+} zap_cursor_t;
+
+typedef struct {
+ int za_integer_length;
+ uint64_t za_num_integers;
+ uint64_t za_first_integer; /* no sign extension for <8byte ints */
+ char za_name[MAXNAMELEN];
+} zap_attribute_t;
+
+/*
+ * The interface for listing all the attributes of a zapobj can be
+ * thought of as a cursor moving down a list of the attributes one by
+ * one. The cookie returned by the zap_cursor_serialize routine is
+ * persistent across system calls (and across reboot, even).
+ */
+
+/*
+ * Initialize a zap cursor, pointing to the "first" attribute of the
+ * zapobj. You must _fini the cursor when you are done with it.
+ */
+void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
+void zap_cursor_fini(zap_cursor_t *zc);
+
+/*
+ * Get the attribute currently pointed to by the cursor. Returns
+ * ENOENT if at the end of the attributes.
+ */
+int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
+
+/*
+ * Advance the cursor to the next attribute.
+ */
+void zap_cursor_advance(zap_cursor_t *zc);
+
+/*
+ * Get a persistent cookie pointing to the current position of the zap
+ * cursor. The low 4 bits in the cookie are always zero, and thus can
+ * be used to differentiate a serialized cookie from a different type
+ * of value. The cookie will be less than 2^32 as long as there are
+ * fewer than 2^22 (4.2 million) entries in the zap object.
+ */
+uint64_t zap_cursor_serialize(zap_cursor_t *zc);
+
+/*
+ * Initialize a zap cursor pointing to the position recorded by
+ * zap_cursor_serialize (in the "serialized" argument). You can also
+ * use a "serialized" argument of 0 to start at the beginning of the
+ * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
+ * zap_cursor_init(...).)
+ */
+void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
+ uint64_t zapobj, uint64_t serialized);
+
+
+#define ZAP_HISTOGRAM_SIZE 10
+
+typedef struct zap_stats {
+ /*
+ * Size of the pointer table (in number of entries).
+ * This is always a power of 2, or zero if it's a microzap.
+ * In general, it should be considerably greater than zs_num_leafs.
+ */
+ uint64_t zs_ptrtbl_len;
+
+ uint64_t zs_blocksize; /* size of zap blocks */
+
+ /*
+ * The number of blocks used. Note that some blocks may be
+ * wasted because old ptrtbl's and large name/value blocks are
+ * not reused. (Although their space is reclaimed, we don't
+ * reuse those offsets in the object.)
+ */
+ uint64_t zs_num_blocks;
+
+ /*
+ * Pointer table values from zap_ptrtbl in the zap_phys_t
+ */
+ uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */
+ uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */
+ uint64_t zs_ptrtbl_zt_blk; /* starting block number */
+ uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */
+ uint64_t zs_ptrtbl_zt_shift; /* bits to index it */
+
+ /*
+ * Values of the other members of the zap_phys_t
+ */
+ uint64_t zs_block_type; /* ZBT_HEADER */
+ uint64_t zs_magic; /* ZAP_MAGIC */
+ uint64_t zs_num_leafs; /* The number of leaf blocks */
+ uint64_t zs_num_entries; /* The number of zap entries */
+ uint64_t zs_salt; /* salt to stir into hash function */
+
+ /*
+ * Histograms. For all histograms, the last index
+ * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
+ * than what can be represented. For example
+ * zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
+ * of leafs with more than 45 entries.
+ */
+
+ /*
+ * zs_leafs_with_n_pointers[n] is the number of leafs with
+ * 2^n pointers to it.
+ */
+ uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_leafs_with_n_entries[n] is the number of leafs with
+ * [n*5, (n+1)*5) entries. In the current implementation, there
+ * can be at most 55 entries in any block, but there may be
+ * fewer if the name or value is large, or the block is not
+ * completely full.
+ */
+ uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_leafs_n_tenths_full[n] is the number of leafs whose
+ * fullness is in the range [n/10, (n+1)/10).
+ */
+ uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_entries_using_n_chunks[n] is the number of entries which
+ * consume n 24-byte chunks. (Note, large names/values only use
+ * one chunk, but contribute to zs_num_blocks_large.)
+ */
+ uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_buckets_with_n_entries[n] is the number of buckets (each
+ * leaf has 64 buckets) with n entries.
+ * zs_buckets_with_n_entries[1] should be very close to
+ * zs_num_entries.
+ */
+ uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
+} zap_stats_t;
+
+/*
+ * Get statistics about a ZAP object. Note: you need to be aware of the
+ * internal implementation of the ZAP to correctly interpret some of the
+ * statistics. This interface shouldn't be relied on unless you really
+ * know what you're doing.
+ */
+int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_H */
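To tie the pieces above together, here is a minimal consumer-side sketch, assuming an already open objset_t and an assigned dmu_tx_t and ignoring errors; the attribute name and the choice of DMU_OT_OBJECT_DIRECTORY as a ZAP-backed object type are purely illustrative:

#include <sys/zap.h>

static void
zap_usage_sketch(objset_t *os, dmu_tx_t *tx)
{
	uint64_t zapobj, val, nattrs = 0;
	zap_cursor_t zc;
	zap_attribute_t za;

	/* Create a zapobj and store one 8-byte integer under the name "refcount". */
	zapobj = zap_create(os, DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	val = 1;
	(void) zap_add(os, zapobj, "refcount", 8, 1, &val, tx);

	/* Read it back; zap_lookup() returns ENOENT if the name is absent. */
	(void) zap_lookup(os, zapobj, "refcount", 8, 1, &val);

	/* Walk every attribute with a cursor; za describes each entry in turn. */
	for (zap_cursor_init(&zc, os, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc))
		nattrs++;
	zap_cursor_fini(&zc);
	(void) nattrs;
}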
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
new file mode 100644
index 000000000000..4e43f4ae49a1
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
@@ -0,0 +1,204 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZAP_IMPL_H
+#define _SYS_ZAP_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int fzap_default_block_shift;
+
+#define ZAP_MAGIC 0x2F52AB2ABULL
+
+#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift)
+
+#define ZAP_MAXCD (uint32_t)(-1)
+#define ZAP_HASHBITS 28
+#define MZAP_ENT_LEN 64
+#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
+#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
+#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+
+typedef struct mzap_ent_phys {
+ uint64_t mze_value;
+ uint32_t mze_cd;
+ uint16_t mze_pad; /* in case we want to chain them someday */
+ char mze_name[MZAP_NAME_LEN];
+} mzap_ent_phys_t;
+
+typedef struct mzap_phys {
+ uint64_t mz_block_type; /* ZBT_MICRO */
+ uint64_t mz_salt;
+ uint64_t mz_pad[6];
+ mzap_ent_phys_t mz_chunk[1];
+ /* actually variable size depending on block size */
+} mzap_phys_t;
+
+typedef struct mzap_ent {
+ avl_node_t mze_node;
+ int mze_chunkid;
+ uint64_t mze_hash;
+ mzap_ent_phys_t mze_phys;
+} mzap_ent_t;
+
+
+/*
+ * The (fat) zap is stored in one object. It is an array of
+ * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
+ *
+ * ptrtbl fits in first block:
+ * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
+ *
+ * ptrtbl too big for first block:
+ * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
+ *
+ */
+
+struct dmu_buf;
+struct zap_leaf;
+
+#define ZBT_LEAF ((1ULL << 63) + 0)
+#define ZBT_HEADER ((1ULL << 63) + 1)
+#define ZBT_MICRO ((1ULL << 63) + 3)
+/* any other values are ptrtbl blocks */
+
+/*
+ * the embedded pointer table takes up half a block:
+ * block size / entry size (2^3) / 2
+ */
+#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
+
+/*
+ * The embedded pointer table starts half-way through the block. Since
+ * the pointer table itself is half the block, it starts at (64-bit)
+ * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
+ */
+#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
+ ((uint64_t *)(zap)->zap_f.zap_phys) \
+ [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
+
+/*
+ * TAKE NOTE:
+ * If zap_phys_t is modified, zap_byteswap() must be modified.
+ */
+typedef struct zap_phys {
+ uint64_t zap_block_type; /* ZBT_HEADER */
+ uint64_t zap_magic; /* ZAP_MAGIC */
+
+ struct zap_table_phys {
+ uint64_t zt_blk; /* starting block number */
+ uint64_t zt_numblks; /* number of blocks */
+ uint64_t zt_shift; /* bits to index it */
+ uint64_t zt_nextblk; /* next (larger) copy start block */
+ uint64_t zt_blks_copied; /* number source blocks copied */
+ } zap_ptrtbl;
+
+ uint64_t zap_freeblk; /* the next free block */
+ uint64_t zap_num_leafs; /* number of leafs */
+ uint64_t zap_num_entries; /* number of entries */
+ uint64_t zap_salt; /* salt to stir into hash function */
+ /*
+ * This structure is followed by padding, and then the embedded
+ * pointer table. The embedded pointer table takes up the second
+ * half of the block. It is accessed using the
+ * ZAP_EMBEDDED_PTRTBL_ENT() macro.
+ */
+} zap_phys_t;
+
+typedef struct zap_table_phys zap_table_phys_t;
+
+typedef struct zap {
+ objset_t *zap_objset;
+ uint64_t zap_object;
+ struct dmu_buf *zap_dbuf;
+ krwlock_t zap_rwlock;
+ int zap_ismicro;
+ uint64_t zap_salt;
+ union {
+ struct {
+ zap_phys_t *zap_phys;
+
+ /*
+ * zap_num_entries_mtx protects
+ * zap_num_entries
+ */
+ kmutex_t zap_num_entries_mtx;
+ int zap_block_shift;
+ } zap_fat;
+ struct {
+ mzap_phys_t *zap_phys;
+ int16_t zap_num_entries;
+ int16_t zap_num_chunks;
+ int16_t zap_alloc_next;
+ avl_tree_t zap_avl;
+ } zap_micro;
+ } zap_u;
+} zap_t;
+
+#define zap_f zap_u.zap_fat
+#define zap_m zap_u.zap_micro
+
+uint64_t zap_hash(zap_t *zap, const char *name);
+int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+ krw_t lti, int fatreader, zap_t **zapp);
+void zap_unlockdir(zap_t *zap);
+void zap_evict(dmu_buf_t *db, void *vmzap);
+
+#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
+
+void fzap_byteswap(void *buf, size_t size);
+int fzap_count(zap_t *zap, uint64_t *count);
+int fzap_lookup(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
+int fzap_add(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+int fzap_update(zap_t *zap, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int fzap_length(zap_t *zap, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers);
+int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx);
+int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
+void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
+void zap_put_leaf(struct zap_leaf *l);
+
+int fzap_add_cd(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, dmu_tx_t *tx);
+void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_IMPL_H */
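As a concrete reading of the embedded pointer table macros: for a fat ZAP with 16K blocks (zap_block_shift == 14), ZAP_EMBEDDED_PTRTBL_SHIFT is 14 - 3 - 1 = 10, so the table holds 1024 entries and begins at 64-bit word 1024, i.e. exactly half-way through the block. A hedged helper mirroring the macro (illustration only):

/* Illustrative equivalent of ZAP_EMBEDDED_PTRTBL_ENT() for a given block shift. */
static uint64_t *
embedded_ptrtbl_ent(uint64_t *zap_phys_block, int block_shift, uint64_t idx)
{
	int tbl_shift = block_shift - 3 - 1;	/* e.g. 14 - 3 - 1 = 10 */

	/* The table fills the second half of the block: skip 2^tbl_shift words. */
	return (&zap_phys_block[idx + (1ULL << tbl_shift)]);
}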
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
new file mode 100644
index 000000000000..147fb7212454
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
@@ -0,0 +1,234 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZAP_LEAF_H
+#define _SYS_ZAP_LEAF_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct zap;
+
+#define ZAP_LEAF_MAGIC 0x2AB1EAF
+
+/* chunk size = 24 bytes */
+#define ZAP_LEAF_CHUNKSIZE 24
+
+/*
+ * The amount of space available for chunks is:
+ * block size (1<<l->l_bs) - hash entry size (2) * number of hash
+ * entries - header space (2*chunksize)
+ */
+#define ZAP_LEAF_NUMCHUNKS(l) \
+ (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
+ ZAP_LEAF_CHUNKSIZE - 2)
+
+/*
+ * The amount of space within the chunk available for the array is:
+ * chunk size - space for type (1) - space for next pointer (2)
+ */
+#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
+
+#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
+ (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
+
+/*
+ * Low water mark: when there are only this many chunks free, start
+ * growing the ptrtbl. Ideally, this should be larger than a
+ * "reasonably-sized" entry. 20 chunks is more than enough for the
+ * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
+ * while still being only around 3% for 16k blocks.
+ */
+#define ZAP_LEAF_LOW_WATER (20)
+
+/*
+ * The leaf hash table has block size / 2^5 (32) number of entries,
+ * which should be more than enough for the maximum number of entries,
+ * which is less than block size / CHUNKSIZE (24) / minimum number of
+ * chunks per entry (3).
+ */
+#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
+#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
+
+/*
+ * The chunks start immediately after the hash table. The end of the
+ * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
+ * chunk_t.
+ */
+#define ZAP_LEAF_CHUNK(l, idx) \
+ ((zap_leaf_chunk_t *) \
+ ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
+
+typedef enum zap_chunk_type {
+ ZAP_CHUNK_FREE = 253,
+ ZAP_CHUNK_ENTRY = 252,
+ ZAP_CHUNK_ARRAY = 251,
+ ZAP_CHUNK_TYPE_MAX = 250
+} zap_chunk_type_t;
+
+/*
+ * TAKE NOTE:
+ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
+ */
+typedef struct zap_leaf_phys {
+ struct zap_leaf_header {
+ uint64_t lh_block_type; /* ZBT_LEAF */
+ uint64_t lh_pad1;
+ uint64_t lh_prefix; /* hash prefix of this leaf */
+ uint32_t lh_magic; /* ZAP_LEAF_MAGIC */
+ uint16_t lh_nfree; /* number free chunks */
+ uint16_t lh_nentries; /* number of entries */
+ uint16_t lh_prefix_len; /* num bits used to id this */
+
+/* above is accessible to zap, below is zap_leaf private */
+
+ uint16_t lh_freelist; /* chunk head of free list */
+ uint8_t lh_pad2[12];
+ } l_hdr; /* 2 24-byte chunks */
+
+ /*
+ * The header is followed by a hash table with
+ * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is
+ * followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
+ * zap_leaf_chunk structures. These structures are accessed
+ * with the ZAP_LEAF_CHUNK() macro.
+ */
+
+ uint16_t l_hash[1];
+} zap_leaf_phys_t;
+
+typedef union zap_leaf_chunk {
+ struct zap_leaf_entry {
+ uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
+ uint8_t le_int_size; /* size of ints */
+ uint16_t le_next; /* next entry in hash chain */
+ uint16_t le_name_chunk; /* first chunk of the name */
+ uint16_t le_name_length; /* bytes in name, incl null */
+ uint16_t le_value_chunk; /* first chunk of the value */
+ uint16_t le_value_length; /* value length in ints */
+ uint32_t le_cd; /* collision differentiator */
+ uint64_t le_hash; /* hash value of the name */
+ } l_entry;
+ struct zap_leaf_array {
+ uint8_t la_type; /* always ZAP_CHUNK_ARRAY */
+ uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t la_next; /* next blk or CHAIN_END */
+ } l_array;
+ struct zap_leaf_free {
+ uint8_t lf_type; /* always ZAP_CHUNK_FREE */
+ uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t lf_next; /* next in free list, or CHAIN_END */
+ } l_free;
+} zap_leaf_chunk_t;
+
+typedef struct zap_leaf {
+ krwlock_t l_rwlock; /* only used on head of chain */
+ uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
+ int l_bs; /* block size shift */
+ dmu_buf_t *l_dbuf;
+ zap_leaf_phys_t *l_phys;
+} zap_leaf_t;
+
+
+typedef struct zap_entry_handle {
+ /* below is set by zap_leaf.c and is public to zap.c */
+ uint64_t zeh_num_integers;
+ uint64_t zeh_hash;
+ uint32_t zeh_cd;
+ uint8_t zeh_integer_size;
+
+ /* below is private to zap_leaf.c */
+ uint16_t zeh_fakechunk;
+ uint16_t *zeh_chunkp;
+ zap_leaf_t *zeh_leaf;
+} zap_entry_handle_t;
+
+/*
+ * Return a handle to the named entry, or ENOENT if not found. The hash
+ * value must equal zap_hash(name).
+ */
+extern int zap_leaf_lookup(zap_leaf_t *l,
+ const char *name, uint64_t h, zap_entry_handle_t *zeh);
+
+/*
+ * Return a handle to the entry with this hash+cd, or the entry with the
+ * next closest hash+cd.
+ */
+extern int zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh);
+
+/*
+ * Read the first num_integers in the attribute. Integer size
+ * conversion will be done without sign extension. Return EINVAL if
+ * integer_size is too small. Return EOVERFLOW if there are more than
+ * num_integers in the attribute.
+ */
+extern int zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf);
+
+extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
+ uint16_t buflen, char *buf);
+
+/*
+ * Replace the value of an existing entry.
+ *
+ * zap_entry_update may fail if it runs out of space (ENOSPC).
+ */
+extern int zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf);
+
+/*
+ * Remove an entry.
+ */
+extern void zap_entry_remove(zap_entry_handle_t *zeh);
+
+/*
+ * Create an entry. An equal entry must not exist, and this entry must
+ * belong in this leaf (according to its hash value). Fills in the
+ * entry handle on success. Returns 0 on success or ENOSPC on failure.
+ */
+extern int zap_entry_create(zap_leaf_t *l,
+ const char *name, uint64_t h, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh);
+
+/*
+ * Other stuff.
+ */
+
+extern void zap_leaf_init(zap_leaf_t *l);
+extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
+extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl);
+extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_LEAF_H */
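Working the layout macros above for a concrete case: with a 16K leaf block (l_bs == 14) the hash table has 1 << (14 - 5) = 512 two-byte entries, leaving (16384 - 1024) / 24 - 2 = 638 usable 24-byte chunks. A short sketch of that arithmetic (illustrative only):

static void
zap_leaf_layout_example(void)
{
	int l_bs = 14;					/* 16K leaf block */
	int hash_entries = 1 << (l_bs - 5);		/* ZAP_LEAF_HASH_NUMENTRIES: 512 */
	int numchunks = ((1 << l_bs) - 2 * hash_entries) / 24 - 2;

	/* ZAP_LEAF_NUMCHUNKS: (16384 - 1024) / 24 - 2 == 638 chunks of 24 bytes. */
	(void) numchunks;
}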
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
new file mode 100644
index 000000000000..3250b760fb07
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
@@ -0,0 +1,115 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_ACL_H
+#define _SYS_FS_ZFS_ACL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef _KERNEL
+#include <sys/cred.h>
+#endif
+#include <sys/acl.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct znode_phys;
+
+#define ACCESS_UNDETERMINED -1
+
+#define ACE_SLOT_CNT 6
+
+typedef struct zfs_znode_acl {
+ uint64_t z_acl_extern_obj; /* ext acl pieces */
+ uint32_t z_acl_count; /* Number of ACEs */
+ uint16_t z_acl_version; /* acl version */
+ uint16_t z_acl_pad; /* pad */
+ ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
+} zfs_znode_acl_t;
+
+#define ACL_DATA_ALLOCED 0x1
+
+/*
+ * Max ACL size is prepended deny for all entries + the
+ * canonical six tacked on * the end.
+ */
+#define MAX_ACL_SIZE (MAX_ACL_ENTRIES * 2 + 6)
+
+typedef struct zfs_acl {
+ int z_slots; /* number of allocated slots for ACEs */
+ int z_acl_count;
+ uint_t z_state;
+ ace_t *z_acl;
+} zfs_acl_t;
+
+#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
+
+/*
+ * Property values for acl_mode and acl_inherit.
+ *
+ * acl_mode can take discard, noallow, groupmask and passthrough.
+ * whereas acl_inherit has secure instead of groupmask.
+ */
+
+#define ZFS_ACL_DISCARD 0
+#define ZFS_ACL_NOALLOW 1
+#define ZFS_ACL_GROUPMASK 2
+#define ZFS_ACL_PASSTHROUGH 3
+#define ZFS_ACL_SECURE 4
+
+struct znode;
+
+#ifdef _KERNEL
+void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *,
+ dmu_tx_t *, cred_t *);
+#ifdef TODO
+int zfs_getacl(struct znode *, vsecattr_t *, cred_t *);
+#endif
+int zfs_mode_update(struct znode *, uint64_t, dmu_tx_t *);
+#ifdef TODO
+int zfs_setacl(struct znode *, vsecattr_t *, cred_t *);
+#endif
+void zfs_acl_rele(void *);
+void zfs_ace_byteswap(ace_t *, int);
+extern int zfs_zaccess(struct znode *, int, cred_t *);
+extern int zfs_zaccess_rwx(struct znode *, mode_t, cred_t *);
+extern int zfs_acl_access(struct znode *, int, cred_t *);
+int zfs_acl_chmod_setattr(struct znode *, uint64_t, dmu_tx_t *);
+int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
+int zfs_zaccess_rename(struct znode *, struct znode *,
+ struct znode *, struct znode *, cred_t *cr);
+int zfs_zaccess_v4_perm(struct znode *, int, cred_t *);
+void zfs_acl_free(zfs_acl_t *);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ACL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
new file mode 100644
index 000000000000..c91d80764fad
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
@@ -0,0 +1,122 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_CONTEXT_H
+#define _SYS_ZFS_CONTEXT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/param.h>
+#include <sys/stdint.h>
+#include <sys/note.h>
+#include <sys/kernel.h>
+#include <sys/debug.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/systm.h>
+#include <sys/kobj.h>
+#include <sys/conf.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/random.h>
+#include <sys/byteorder.h>
+#include <sys/systm.h>
+#include <sys/list.h>
+#include <sys/uio.h>
+#include <sys/dirent.h>
+#include <sys/time.h>
+#include <sys/uio.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/string.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/cred.h>
+#include <sys/sdt.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/sysctl.h>
+#include <sys/sbuf.h>
+#include <sys/priv.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/stack.h>
+#include <sys/lockf.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/eventhandler.h>
+#include <sys/zfs_debug.h>
+
+#include <machine/stdarg.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+/* There is a clash: vm_map.h defines the two macros below and vdev_cache.c uses them. */
+#ifdef min_offset
+#undef min_offset
+#endif
+#ifdef max_offset
+#undef max_offset
+#endif
+#include <vm/vm_extern.h>
+#include <vm/vnode_pager.h>
+
+#define CPU_SEQID (curcpu)
+
+#ifdef __cplusplus
+}
+#endif
+
+#define physmem (vm_kmem_size / PAGE_SIZE)
+
+extern int zfs_debug_level;
+extern struct mtx zfs_debug_mtx;
+#define ZFS_LOG(lvl, ...) do { \
+ if (((lvl) & 0xff) <= zfs_debug_level) { \
+ mtx_lock(&zfs_debug_mtx); \
+ printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \
+ printf(__VA_ARGS__); \
+ printf("\n"); \
+ if ((lvl) & 0x100) \
+ kdb_backtrace(); \
+ mtx_unlock(&zfs_debug_mtx); \
+ } \
+} while (0)
+
+#endif /* _SYS_ZFS_CONTEXT_H */
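A minimal sketch of a ZFS_LOG() call site, assuming this header is included; the messages are illustrative:

static void
zfs_log_example(const char *path)
{
	/* Printed only when zfs_debug_level >= 2. */
	ZFS_LOG(2, "opening vdev %s", path);

	/* Or-ing in 0x100 additionally requests a kernel backtrace. */
	ZFS_LOG(0x100 | 1, "unexpected failure on %s", path);
}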
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
new file mode 100644
index 000000000000..a676533ac4a2
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_CTLDIR_H
+#define _ZFS_CTLDIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vnode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_CTLDIR_NAME ".zfs"
+
+#define zfs_has_ctldir(zdp) \
+ ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \
+ ((zdp)->z_zfsvfs->z_ctldir != NULL))
+#define zfs_show_ctldir(zdp) \
+ (zfs_has_ctldir(zdp) && \
+ ((zdp)->z_zfsvfs->z_show_ctldir))
+
+void zfsctl_create(zfsvfs_t *);
+void zfsctl_destroy(zfsvfs_t *);
+vnode_t *zfsctl_root(znode_t *);
+void zfsctl_init(void);
+void zfsctl_fini(void);
+
+int zfsctl_rename_snapshot(const char *from, const char *to);
+int zfsctl_destroy_snapshot(const char *snapname, int force);
+int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
+
+int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr);
+
+int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
+
+#define ZFSCTL_INO_ROOT 0x1
+#define ZFSCTL_INO_SNAPDIR 0x2
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_CTLDIR_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
new file mode 100644
index 000000000000..450ac1c81b42
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_DEBUG_H
+#define _SYS_ZFS_DEBUG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/*
+ * ZFS debugging
+ */
+
+#if defined(DEBUG) || !defined(_KERNEL)
+#define ZFS_DEBUG
+#endif
+
+extern int zfs_flags;
+
+#define ZFS_DEBUG_DPRINTF 0x0001
+#define ZFS_DEBUG_DBUF_VERIFY 0x0002
+#define ZFS_DEBUG_DNODE_VERIFY 0x0004
+#define ZFS_DEBUG_SNAPNAMES 0x0008
+#define ZFS_DEBUG_MODIFY 0x0010
+
+#ifdef ZFS_DEBUG
+extern void __dprintf(const char *file, const char *func,
+ int line, const char *fmt, ...);
+#define dprintf(...) \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) \
+ __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__)
+#else
+#define dprintf(...) ((void)0)
+#endif /* ZFS_DEBUG */
+
+extern void zfs_panic_recover(const char *fmt, ...);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_DEBUG_H */
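A minimal dprintf() call-site sketch, assuming a debug build with the ZFS_DEBUG_DPRINTF bit set in zfs_flags; the message is illustrative:

#include <sys/zfs_debug.h>

static void
zfs_dprintf_example(uint64_t obj, int err)
{
	dprintf("object %llu: read failed, error %d\n",
	    (unsigned long long)obj, err);
}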
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
new file mode 100644
index 000000000000..f60d614953f3
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_DIR_H
+#define _SYS_FS_ZFS_DIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/zfs_znode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* zfs_dirent_lock() flags */
+#define ZNEW 0x0001 /* entry should not exist */
+#define ZEXISTS 0x0002 /* entry should exist */
+#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */
+#define ZXATTR 0x0008 /* we want the xattr dir */
+#define ZRENAMING 0x0010 /* znode is being renamed */
+
+/* mknode flags */
+#define IS_ROOT_NODE 0x01 /* create a root node */
+#define IS_XATTR 0x02 /* create an extended attribute node */
+#define IS_REPLAY 0x04 /* we are replaying intent log */
+
+extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
+ int);
+extern void zfs_dirent_unlock(zfs_dirlock_t *);
+extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
+ boolean_t *);
+extern int zfs_dirlook(znode_t *, char *, vnode_t **);
+extern void zfs_mknode(znode_t *, vattr_t *, uint64_t *,
+ dmu_tx_t *, cred_t *, uint_t, znode_t **, int);
+extern void zfs_rmnode(znode_t *);
+extern boolean_t zfs_dirempty(znode_t *);
+extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
+extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
+extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
+extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int);
+extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_DIR_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
new file mode 100644
index 000000000000..d28729b0a221
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -0,0 +1,162 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_IOCTL_H
+#define _SYS_ZFS_IOCTL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/cred.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Property values for snapdir
+ */
+#define ZFS_SNAPDIR_HIDDEN 0
+#define ZFS_SNAPDIR_VISIBLE 1
+
+#define DMU_BACKUP_VERSION (1ULL)
+#define DMU_BACKUP_MAGIC 0x2F5bacbacULL
+
+/*
+ * zfs ioctl command structure
+ */
+typedef struct dmu_replay_record {
+ enum {
+ DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
+ DRR_WRITE, DRR_FREE, DRR_END,
+ } drr_type;
+ uint32_t drr_pad;
+ union {
+ struct drr_begin {
+ uint64_t drr_magic;
+ uint64_t drr_version;
+ uint64_t drr_creation_time;
+ dmu_objset_type_t drr_type;
+ uint32_t drr_pad;
+ uint64_t drr_toguid;
+ uint64_t drr_fromguid;
+ char drr_toname[MAXNAMELEN];
+ } drr_begin;
+ struct drr_end {
+ zio_cksum_t drr_checksum;
+ } drr_end;
+ struct drr_object {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ dmu_object_type_t drr_bonustype;
+ uint32_t drr_blksz;
+ uint32_t drr_bonuslen;
+ uint8_t drr_checksum;
+ uint8_t drr_compress;
+ uint8_t drr_pad[6];
+ /* bonus content follows */
+ } drr_object;
+ struct drr_freeobjects {
+ uint64_t drr_firstobj;
+ uint64_t drr_numobjs;
+ } drr_freeobjects;
+ struct drr_write {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ uint32_t drr_pad;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ /* content follows */
+ } drr_write;
+ struct drr_free {
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ } drr_free;
+ } drr_u;
+} dmu_replay_record_t;
+
+typedef struct zinject_record {
+ uint64_t zi_objset;
+ uint64_t zi_object;
+ uint64_t zi_start;
+ uint64_t zi_end;
+ uint64_t zi_guid;
+ uint32_t zi_level;
+ uint32_t zi_error;
+ uint64_t zi_type;
+ uint32_t zi_freq;
+} zinject_record_t;
+
+#define ZINJECT_NULL 0x1
+#define ZINJECT_FLUSH_ARC 0x2
+#define ZINJECT_UNLOAD_SPA 0x4
+
+typedef struct zfs_cmd {
+ char zc_name[MAXPATHLEN];
+ char zc_value[MAXPATHLEN * 2];
+ uint64_t zc_guid;
+ uint64_t zc_nvlist_src; /* really (char *) */
+ uint64_t zc_nvlist_src_size;
+ uint64_t zc_nvlist_dst; /* really (char *) */
+ uint64_t zc_nvlist_dst_size;
+ uint64_t zc_cookie;
+ uint64_t zc_cred;
+ uint64_t zc_dev;
+ uint64_t zc_objset_type;
+ uint64_t zc_history; /* really (char *) */
+ uint64_t zc_history_len;
+ uint64_t zc_history_offset;
+ uint64_t zc_obj;
+ uint64_t zc_jailid;
+ dmu_objset_stats_t zc_objset_stats;
+ struct drr_begin zc_begin_record;
+ zinject_record_t zc_inject_record;
+} zfs_cmd_t;
+
+#ifdef _KERNEL
+typedef struct zfs_create_data {
+ cred_t *zc_cred;
+ dev_t zc_dev;
+ nvlist_t *zc_props;
+} zfs_create_data_t;
+#endif
+
+#define ZVOL_MAX_MINOR (1 << 16)
+#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1)
+
+#ifdef _KERNEL
+
+extern int zfs_secpolicy_write(const char *dataset, cred_t *cr);
+extern int zfs_busy(void);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_IOCTL_H */
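A dmu_replay_record_t stream ("zfs send" output) is a sequence of these fixed-size records, each optionally followed by payload. Below is a minimal, hedged sketch of walking such a stream; read_bytes() is a hypothetical caller-supplied helper, and real consumers also verify the checksum carried in the DRR_END record and byte-swap foreign-endian streams.

    #include <sys/zfs_ioctl.h>

    extern int read_bytes(void *stream, void *buf, size_t len); /* hypothetical */

    static int
    example_walk_stream(void *stream)
    {
            dmu_replay_record_t drr;

            for (;;) {
                    if (read_bytes(stream, &drr, sizeof (drr)) != 0)
                            return (-1);

                    switch (drr.drr_type) {
                    case DRR_BEGIN:
                            /* must carry the backup magic number */
                            if (drr.drr_u.drr_begin.drr_magic != DMU_BACKUP_MAGIC)
                                    return (-1);
                            break;
                    case DRR_WRITE:
                            /* drr_length bytes of file data follow the record */
                            break;
                    case DRR_END:
                            return (0);     /* end of stream */
                    default:
                            break;
                    }
            }
    }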
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
new file mode 100644
index 000000000000..f302b663e22a
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_RLOCK_H
+#define _SYS_FS_ZFS_RLOCK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#include <sys/zfs_znode.h>
+
+typedef enum {
+ RL_READER,
+ RL_WRITER,
+ RL_APPEND
+} rl_type_t;
+
+typedef struct rl {
+ znode_t *r_zp; /* znode this lock applies to */
+ avl_node_t r_node; /* avl node link */
+ uint64_t r_off; /* file range offset */
+ uint64_t r_len; /* file range length */
+ uint_t r_cnt; /* range reference count in tree */
+ rl_type_t r_type; /* range type */
+ kcondvar_t r_wr_cv; /* cv for waiting writers */
+ kcondvar_t r_rd_cv; /* cv for waiting readers */
+ uint8_t r_proxy; /* acting for original range */
+ uint8_t r_write_wanted; /* writer wants to lock this range */
+ uint8_t r_read_wanted; /* reader wants to lock this range */
+} rl_t;
+
+/*
+ * Lock a range (offset, length) as either shared (READER)
+ * or exclusive (WRITER or APPEND). APPEND is a special type that
+ * is converted to WRITER, with the locked range starting at the
+ * current end of file. zfs_range_lock() returns the range lock structure.
+ */
+rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
+
+/*
+ * Unlock range and destroy range lock structure.
+ */
+void zfs_range_unlock(rl_t *rl);
+
+/*
+ * Reduce range locked as RL_WRITER from whole file to specified range.
+ * Asserts the whole file was previously locked.
+ */
+void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
+
+/*
+ * AVL comparison function used to compare range locks
+ */
+int zfs_range_compare(const void *arg1, const void *arg2);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_RLOCK_H */
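An illustrative sketch of how the interfaces above are meant to be used around a write; the DMU transaction handling and the actual data copy are omitted, so treat the sequence as an assumption about typical usage rather than the real zfs_write().

    #include <sys/zfs_rlock.h>

    static void
    example_locked_write(znode_t *zp, uint64_t off, uint64_t len)
    {
            rl_t *rl;

            rl = zfs_range_lock(zp, off, len, RL_WRITER);   /* exclusive range */
            /* ... copy data and, if extending, update zp_size under the lock ... */
            zfs_range_unlock(rl);                           /* frees the rl_t */
    }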
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
new file mode 100644
index 000000000000..aa82cc178091
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_VFSOPS_H
+#define _SYS_FS_ZFS_VFSOPS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/list.h>
+#include <sys/vfs.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zfsvfs zfsvfs_t;
+
+struct zfsvfs {
+ vfs_t *z_vfs; /* generic fs struct */
+ zfsvfs_t *z_parent; /* parent fs */
+ objset_t *z_os; /* objset reference */
+ uint64_t z_root; /* id of root znode */
+ uint64_t z_unlinkedobj; /* id of unlinked zapobj */
+ uint64_t z_max_blksz; /* maximum block size for files */
+ uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */
+ zilog_t *z_log; /* intent log pointer */
+ uint_t z_acl_mode; /* acl chmod/mode behavior */
+ uint_t z_acl_inherit; /* acl inheritance behavior */
+ boolean_t z_atime; /* enable atimes mount option */
+ boolean_t z_unmounted1; /* unmounted phase 1 */
+ boolean_t z_unmounted2; /* unmounted phase 2 */
+ uint32_t z_op_cnt; /* vnode/vfs operations ref count */
+ krwlock_t z_um_lock; /* rw lock for umount phase 2 */
+ list_t z_all_znodes; /* all vnodes in the fs */
+ kmutex_t z_znodes_lock; /* lock for z_all_znodes */
+ vnode_t *z_ctldir; /* .zfs directory pointer */
+ boolean_t z_show_ctldir; /* expose .zfs in the root dir */
+ boolean_t z_issnap; /* true if this is a snapshot */
+#define ZFS_OBJ_MTX_SZ 64
+ kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
+};
+
+/*
+ * The total file ID size is limited to 12 bytes (including the length
+ * field) in the NFSv2 protocol. For historical reasons, this same limit
+ * is currently being imposed by the Solaris NFSv3 implementation...
+ * although the protocol actually permits a maximum of 64 bytes. It will
+ * not be possible to expand beyond 12 bytes without abandoning support
+ * of NFSv2 and making some changes to the Solaris NFSv3 implementation.
+ *
+ * For the time being, we will partition up the available space as follows:
+ * 2 bytes fid length (required)
+ * 6 bytes object number (48 bits)
+ * 4 bytes generation number (32 bits)
+ * We reserve only 48 bits for the object number, as this is the limit
+ * currently defined and imposed by the DMU.
+ */
+typedef struct zfid_short {
+ uint16_t zf_len;
+ uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */
+ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */
+} zfid_short_t;
+
+typedef struct zfid_long {
+ zfid_short_t z_fid;
+ uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */
+ uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */
+} zfid_long_t;
+
+#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t))
+#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_VFSOPS_H */
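The byte-array comments above (obj[i] = obj >> (8 * i)) fully describe the short file ID encoding. A standalone sketch, using a local struct so it compiles outside the kernel:

    #include <stdint.h>

    struct example_fid {
            uint16_t zf_len;
            uint8_t  zf_object[6];  /* low 48 bits of the object number */
            uint8_t  zf_gen[4];     /* low 32 bits of the generation */
    };

    static void
    example_fid_encode(struct example_fid *fid, uint64_t object, uint64_t gen)
    {
            int i;

            /* SHORT_FID_LEN: everything except the 2-byte length field */
            fid->zf_len = sizeof (*fid) - sizeof (uint16_t);
            for (i = 0; i < 6; i++)
                    fid->zf_object[i] = (uint8_t)(object >> (8 * i));
            for (i = 0; i < 4; i++)
                    fid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
    }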
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
new file mode 100644
index 000000000000..c9c317e4c4ee
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
@@ -0,0 +1,298 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_ZNODE_H
+#define _SYS_FS_ZFS_ZNODE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef _KERNEL
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_vfsops.h>
+#endif
+#include <sys/zfs_acl.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Define special zfs pflags
+ */
+#define ZFS_XATTR 0x1 /* is an extended attribute */
+#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */
+#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */
+
+#define MASTER_NODE_OBJ 1
+
+/*
+ * special attributes for master node.
+ */
+
+#define ZFS_FSID "FSID"
+#define ZFS_UNLINKED_SET "DELETE_QUEUE"
+#define ZFS_ROOT_OBJ "ROOT"
+#define ZPL_VERSION_OBJ "VERSION"
+#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE"
+#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS"
+
+#define ZFS_FLAG_BLOCKPERPAGE 0x1
+#define ZFS_FLAG_NOGROWBLOCKS 0x2
+
+/*
+ * ZPL version - rev'd whenever an incompatible on-disk format change
+ * occurs. Independent of SPA/DMU/ZAP versioning.
+ */
+
+#define ZPL_VERSION 1ULL
+
+#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
+
+/* Path component length */
+/*
+ * The generic fs code uses MAXNAMELEN to represent
+ * what the largest component length is. Unfortunately,
+ * this length includes the terminating NULL. ZFS needs
+ * to tell the users via pathconf() and statvfs() what the
+ * true maximum length of a component is, excluding the NULL.
+ */
+#define ZFS_MAXNAMELEN (MAXNAMELEN - 1)
+
+/*
+ * The directory entry has the type (currently unused on Solaris) in the
+ * top 4 bits, and the object number in the low 48 bits. The "middle"
+ * 12 bits are unused.
+ */
+#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4)
+#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
+#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj)
+
+
+/*
+ * This is the persistent portion of the znode. It is stored
+ * in the "bonus buffer" of the file. Short symbolic links
+ * are also stored in the bonus buffer.
+ */
+typedef struct znode_phys {
+ uint64_t zp_atime[2]; /* 0 - last file access time */
+ uint64_t zp_mtime[2]; /* 16 - last file modification time */
+ uint64_t zp_ctime[2]; /* 32 - last file change time */
+ uint64_t zp_crtime[2]; /* 48 - creation time */
+ uint64_t zp_gen; /* 64 - generation (txg of creation) */
+ uint64_t zp_mode; /* 72 - file mode bits */
+ uint64_t zp_size; /* 80 - size of file */
+ uint64_t zp_parent; /* 88 - directory parent (`..') */
+ uint64_t zp_links; /* 96 - number of links to file */
+ uint64_t zp_xattr; /* 104 - DMU object for xattrs */
+ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
+ uint64_t zp_flags; /* 120 - persistent flags */
+ uint64_t zp_uid; /* 128 - file owner */
+ uint64_t zp_gid; /* 136 - owning group */
+ uint64_t zp_pad[4]; /* 144 - future */
+ zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */
+ /*
+ * Data may pad out any remaining bytes in the znode buffer, eg:
+ *
+ * |<---------------------- dnode_phys (512) ------------------------>|
+ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+ * |<---- znode (264) ---->|<---- data (56) ---->|
+ *
+ * At present, we only use this space to store symbolic links.
+ */
+} znode_phys_t;
+
+/*
+ * Directory entry locks control access to directory entries.
+ * They are used to protect creates, deletes, and renames.
+ * Each directory znode has a mutex and a list of locked names.
+ */
+#ifdef _KERNEL
+typedef struct zfs_dirlock {
+ char *dl_name; /* directory entry being locked */
+ uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */
+ uint16_t dl_namesize; /* set if dl_name was allocated */
+ kcondvar_t dl_cv; /* wait for entry to be unlocked */
+ struct znode *dl_dzp; /* directory znode */
+ struct zfs_dirlock *dl_next; /* next in z_dirlocks list */
+} zfs_dirlock_t;
+
+typedef struct znode {
+ struct zfsvfs *z_zfsvfs;
+ vnode_t *z_vnode;
+ uint64_t z_id; /* object ID for this znode */
+ kmutex_t z_lock; /* znode modification lock */
+ krwlock_t z_map_lock; /* page map lock */
+ krwlock_t z_parent_lock; /* parent lock for directories */
+ krwlock_t z_name_lock; /* "master" lock for dirent locks */
+ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
+ kmutex_t z_range_lock; /* protects changes to z_range_avl */
+ avl_tree_t z_range_avl; /* avl tree of file range locks */
+ uint8_t z_unlinked; /* file has been unlinked */
+ uint8_t z_atime_dirty; /* atime needs to be synced */
+ uint8_t z_dbuf_held; /* Is z_dbuf already held? */
+ uint8_t z_zn_prefetch; /* Prefetch znodes? */
+ uint_t z_blksz; /* block size in bytes */
+ uint_t z_seq; /* modification sequence number */
+ uint64_t z_mapcnt; /* number of pages mapped to file */
+ uint64_t z_last_itx; /* last ZIL itx on this znode */
+ uint32_t z_sync_cnt; /* synchronous open count */
+ kmutex_t z_acl_lock; /* acl data lock */
+ list_node_t z_link_node; /* all znodes in fs link */
+ struct lockf *z_lockf; /* Head of byte-level lock list. */
+ /*
+ * These are dmu managed fields.
+ */
+ znode_phys_t *z_phys; /* pointer to persistent znode */
+ dmu_buf_t *z_dbuf; /* buffer containing the z_phys */
+} znode_t;
+
+
+/*
+ * Range locking rules
+ * --------------------
+ * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole
+ * file range needs to be locked as RL_WRITER. Only then can the pages be
+ * freed etc and zp_size reset. zp_size must be set within range lock.
+ * 2. For writes and punching holes (zfs_write & zfs_space) just the range
+ * being written or freed needs to be locked as RL_WRITER.
+ * Multiple writes at the end of the file must coordinate zp_size updates
+ * to ensure data isn't lost. A compare and swap loop is currently used
+ * to ensure the file size is at least the offset last written.
+ * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being
+ * read needs to be locked as RL_READER. A check against zp_size can then
+ * be made for reading beyond end of file.
+ */
+
+/*
+ * Convert between znode pointers and vnode pointers
+ */
+#define ZTOV(ZP) ((ZP)->z_vnode)
+#define VTOZ(VP) ((znode_t *)(VP)->v_data)
+
+/*
+ * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation.
+ * ZFS_EXIT() must be called before exiting the vop.
+ */
+#define ZFS_ENTER(zfsvfs) \
+ { \
+ atomic_add_32(&(zfsvfs)->z_op_cnt, 1); \
+ if ((zfsvfs)->z_unmounted1) { \
+ ZFS_EXIT(zfsvfs); \
+ return (EIO); \
+ } \
+ }
+#define ZFS_EXIT(zfsvfs) atomic_add_32(&(zfsvfs)->z_op_cnt, -1)
+
+/*
+ * Macros for dealing with dmu_buf_hold
+ */
+#define ZFS_OBJ_HASH(obj_num) (obj_num & (ZFS_OBJ_MTX_SZ - 1))
+#define ZFS_OBJ_MUTEX(zp) \
+ (&zp->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(zp->z_id)])
+#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \
+ mutex_enter(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]);
+
+#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
+ mutex_exit(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
+
+/*
+ * Macros to encode/decode ZFS stored time values from/to struct timespec
+ */
+#define ZFS_TIME_ENCODE(tp, stmp) \
+{ \
+ stmp[0] = (uint64_t)(tp)->tv_sec; \
+ stmp[1] = (uint64_t)(tp)->tv_nsec; \
+}
+
+#define ZFS_TIME_DECODE(tp, stmp) \
+{ \
+ (tp)->tv_sec = (time_t)stmp[0]; \
+ (tp)->tv_nsec = (long)stmp[1]; \
+}
+
+/*
+ * Timestamp defines
+ */
+#define ACCESSED (AT_ATIME)
+#define STATE_CHANGED (AT_CTIME)
+#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME)
+
+#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
+ if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
+ zfs_time_stamper(zp, ACCESSED, NULL)
+
+extern int zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *);
+extern void zfs_set_dataprop(objset_t *);
+extern void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx);
+extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *);
+extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *);
+extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
+extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
+extern void zfs_znode_init(void);
+extern void zfs_znode_fini(void);
+extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
+extern void zfs_zinactive(znode_t *);
+extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
+extern void zfs_znode_free(znode_t *);
+extern void zfs_remove_op_tables();
+extern int zfs_create_op_tables();
+extern dev_t zfs_cmpldev(uint64_t);
+
+extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name);
+extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, char *name);
+extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name);
+extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name, char *link);
+extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
+extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t len, int ioflag);
+extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, uint64_t off, uint64_t len);
+extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, vattr_t *vap, uint_t mask_applied);
+#ifndef ZFS_NO_ACL
+extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, int aclcnt, ace_t *z_ace);
+#endif
+
+extern zil_get_data_t zfs_get_data;
+extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
+extern int zfsfstype;
+
+#endif /* _KERNEL */
+
+extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ZNODE_H */
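Two of the encodings defined above are simple enough to show standalone: ZFS_DIRENT_MAKE packs a 4-bit entry type into bits 60-63 above a 48-bit object number, and ZFS_TIME_ENCODE stores a timespec as two 64-bit words. The sketch below mirrors those macros outside the kernel; the object number is assumed to fit in 48 bits, as the header requires.

    #include <stdint.h>
    #include <time.h>

    static uint64_t
    example_dirent_make(uint64_t type, uint64_t obj)
    {
            return ((type << 60) | obj);    /* type in bits 60-63, obj in 0-47 */
    }

    static void
    example_time_encode(const struct timespec *tp, uint64_t stmp[2])
    {
            stmp[0] = (uint64_t)tp->tv_sec;         /* seconds */
            stmp[1] = (uint64_t)tp->tv_nsec;        /* nanoseconds */
    }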
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
new file mode 100644
index 000000000000..947ba9fa6076
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
@@ -0,0 +1,276 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIL_H
+#define _SYS_ZIL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Intent log format:
+ *
+ * Each objset has its own intent log. The log header (zil_header_t)
+ * for objset N's intent log is kept in the Nth object of the SPA's
+ * intent_log objset. The log header points to a chain of log blocks,
+ * each of which contains log records (i.e., transactions) followed by
+ * a log block trailer (zil_trailer_t). The format of a log record
+ * depends on the record (or transaction) type, but all records begin
+ * with a common structure that defines the type, length, and txg.
+ */
+
+/*
+ * Intent log header - this on disk structure holds fields to manage
+ * the log. All fields are 64 bit to easily handle cross architectures.
+ */
+typedef struct zil_header {
+ uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
+ uint64_t zh_replay_seq; /* highest replayed sequence number */
+ blkptr_t zh_log; /* log chain */
+ uint64_t zh_claim_seq; /* highest claimed sequence number */
+ uint64_t zh_pad[5];
+} zil_header_t;
+
+/*
+ * Log block trailer - structure at the end of the header and each log block
+ *
+ * The zit_bt contains a zbt_cksum which for the intent log is
+ * the sequence number of this log block. A seq of 0 is invalid.
+ * The zbt_cksum is checked by the SPA against the sequence
+ * number passed in the blk_cksum field of the blkptr_t
+ */
+typedef struct zil_trailer {
+ uint64_t zit_pad;
+ blkptr_t zit_next_blk; /* next block in chain */
+ uint64_t zit_nused; /* bytes in log block used */
+ zio_block_tail_t zit_bt; /* block trailer */
+} zil_trailer_t;
+
+#define ZIL_MIN_BLKSZ 4096ULL
+#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
+#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t))
+
+/*
+ * The words of a log block checksum.
+ */
+#define ZIL_ZC_GUID_0 0
+#define ZIL_ZC_GUID_1 1
+#define ZIL_ZC_OBJSET 2
+#define ZIL_ZC_SEQ 3
+
+/*
+ * Intent log transaction types and record structures
+ */
+#define TX_CREATE 1 /* Create file */
+#define TX_MKDIR 2 /* Make directory */
+#define TX_MKXATTR 3 /* Make XATTR directory */
+#define TX_SYMLINK 4 /* Create symbolic link to a file */
+#define TX_REMOVE 5 /* Remove file */
+#define TX_RMDIR 6 /* Remove directory */
+#define TX_LINK 7 /* Create hard link to a file */
+#define TX_RENAME 8 /* Rename a file */
+#define TX_WRITE 9 /* File write */
+#define TX_TRUNCATE 10 /* Truncate a file */
+#define TX_SETATTR 11 /* Set file attributes */
+#define TX_ACL 12 /* Set acl */
+#define TX_MAX_TYPE 13 /* Max transaction type */
+
+/*
+ * Format of log records.
+ * The fields are carefully defined to allow them to be aligned
+ * and sized the same on sparc & intel architectures.
+ * Each log record has a common structure at the beginning.
+ *
+ * Note, lrc_seq holds two different sequence numbers. While in memory
+ * it contains the transaction sequence number; the log record on
+ * disk holds the log record sequence number, which is used to
+ * ensure we don't replay the same record. The two sequence numbers are
+ * different because transactions can now be pushed out of order.
+ */
+typedef struct { /* common log record header */
+ uint64_t lrc_txtype; /* intent log transaction type */
+ uint64_t lrc_reclen; /* transaction record length */
+ uint64_t lrc_txg; /* dmu transaction group number */
+ uint64_t lrc_seq; /* see comment above */
+} lr_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* object id of directory */
+ uint64_t lr_foid; /* object id of created file object */
+ uint64_t lr_mode; /* mode of object */
+ uint64_t lr_uid; /* uid of object */
+ uint64_t lr_gid; /* gid of object */
+ uint64_t lr_gen; /* generation (txg of creation) */
+ uint64_t lr_crtime[2]; /* creation time */
+ uint64_t lr_rdev; /* rdev of object to create */
+ /* name of object to create follows this */
+ /* for symlinks, link content follows name */
+} lr_create_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* obj id of directory */
+ /* name of object to remove follows this */
+} lr_remove_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* obj id of directory */
+ uint64_t lr_link_obj; /* obj id of link */
+ /* name of object to link follows this */
+} lr_link_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_sdoid; /* obj id of source directory */
+ uint64_t lr_tdoid; /* obj id of target directory */
+ /* 2 strings: names of source and destination follow this */
+} lr_rename_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* file object to write */
+ uint64_t lr_offset; /* offset to write to */
+ uint64_t lr_length; /* user data length to write */
+ uint64_t lr_blkoff; /* offset represented by lr_blkptr */
+ blkptr_t lr_blkptr; /* spa block pointer for replay */
+ /* write data will follow for small writes */
+} lr_write_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* object id of file to truncate */
+ uint64_t lr_offset; /* offset to truncate from */
+ uint64_t lr_length; /* length to truncate */
+} lr_truncate_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* file object to change attributes */
+ uint64_t lr_mask; /* mask of attributes to set */
+ uint64_t lr_mode; /* mode to set */
+ uint64_t lr_uid; /* uid to set */
+ uint64_t lr_gid; /* gid to set */
+ uint64_t lr_size; /* size to set */
+ uint64_t lr_atime[2]; /* access time */
+ uint64_t lr_mtime[2]; /* modification time */
+} lr_setattr_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* obj id of file */
+ uint64_t lr_aclcnt; /* number of acl entries */
+ /* lr_aclcnt number of ace_t entries follow this */
+} lr_acl_t;
+
+/*
+ * ZIL structure definitions, interface function prototype and globals.
+ */
+
+/*
+ * ZFS intent log transaction structure
+ */
+typedef enum {
+ WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
+ /* and put blkptr in log, rather than actual data) */
+ WR_COPIED, /* immediate - data is copied into lr_write_t */
+ WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
+} itx_wr_state_t;
+
+typedef struct itx {
+ list_node_t itx_node; /* linkage on zl_itx_list */
+ void *itx_private; /* type-specific opaque data */
+ itx_wr_state_t itx_wr_state; /* write state */
+ uint8_t itx_sync; /* synchronous transaction */
+ lr_t itx_lr; /* common part of log record */
+ /* followed by type-specific part of lr_xx_t and its immediate data */
+} itx_t;
+
+
+/*
+ * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done()
+ * to handle the cleanup of the dmu_sync() buffer write
+ */
+typedef struct {
+ zilog_t *zgd_zilog; /* zilog */
+ blkptr_t *zgd_bp; /* block pointer */
+ struct rl *zgd_rl; /* range lock */
+} zgd_t;
+
+
+typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+ uint64_t txg);
+typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+ uint64_t txg);
+typedef int zil_replay_func_t();
+typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
+
+extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
+
+extern void zil_init(void);
+extern void zil_fini(void);
+
+extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
+extern void zil_free(zilog_t *zilog);
+
+extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
+extern void zil_close(zilog_t *zilog);
+
+extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp,
+ zil_replay_func_t *replay_func[TX_MAX_TYPE]);
+extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
+
+extern itx_t *zil_itx_create(int txtype, size_t lrsize);
+extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+
+extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
+
+extern int zil_claim(char *osname, void *txarg);
+extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
+extern void zil_clean(zilog_t *zilog);
+extern int zil_is_committed(zilog_t *zilog);
+
+extern int zil_suspend(zilog_t *zilog);
+extern void zil_resume(zilog_t *zilog);
+
+extern void zil_add_vdev(zilog_t *zilog, uint64_t vdev);
+
+extern int zil_disable;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_H */
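As a hedged sketch of how the record types and itx interfaces above fit together, the following builds an in-memory intent log record for a small file write and assigns it to a transaction. It illustrates the pattern, not the real zfs_log_write(); the choice of WR_COPIED and the field assignments are assumptions about a typical small write.

    #include <sys/zil.h>

    static void
    example_log_small_write(zilog_t *zilog, dmu_tx_t *tx, uint64_t foid,
        uint64_t off, uint64_t len)
    {
            itx_t *itx;
            lr_write_t *lr;

            /* record = common header + write-specific fields + copied data */
            itx = zil_itx_create(TX_WRITE, sizeof (lr_write_t) + len);
            itx->itx_wr_state = WR_COPIED;
            lr = (lr_write_t *)&itx->itx_lr;
            lr->lr_foid = foid;
            lr->lr_offset = off;
            lr->lr_length = len;
            /* ... copy len bytes of data just past *lr ... */
            (void) zil_itx_assign(zilog, itx, tx);
    }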
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
new file mode 100644
index 000000000000..3ecf4e4debf5
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIL_IMPL_H
+#define _SYS_ZIL_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zil.h>
+#include <sys/dmu_objset.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Log write buffer.
+ */
+typedef struct lwb {
+ zilog_t *lwb_zilog; /* back pointer to log struct */
+ blkptr_t lwb_blk; /* on disk address of this log blk */
+ int lwb_nused; /* # used bytes in buffer */
+ int lwb_sz; /* size of block and buffer */
+ char *lwb_buf; /* log write buffer */
+ zio_t *lwb_zio; /* zio for this buffer */
+ uint64_t lwb_max_txg; /* highest txg in this lwb */
+ txg_handle_t lwb_txgh; /* txg handle for txg_exit() */
+ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
+} lwb_t;
+
+/*
+ * Vdev flushing: We use a bit map of size ZIL_VDEV_BMSZ bytes.
+ * Any vdev numbers beyond that use a linked list of zil_vdev_t structures.
+ */
+
+#define ZIL_VDEV_BMSZ 16 /* 16 * 8 = 128 vdevs */
+typedef struct zil_vdev {
+ uint64_t vdev; /* device written */
+ list_node_t vdev_seq_node; /* zilog->zl_vdev_list linkage */
+} zil_vdev_t;
+
+/*
+ * Stable storage intent log management structure. One per dataset.
+ */
+struct zilog {
+ kmutex_t zl_lock; /* protects most zilog_t fields */
+ struct dsl_pool *zl_dmu_pool; /* DSL pool */
+ spa_t *zl_spa; /* handle for read/write log */
+ const zil_header_t *zl_header; /* log header buffer */
+ objset_t *zl_os; /* object set we're logging */
+ zil_get_data_t *zl_get_data; /* callback to get object content */
+ zio_t *zl_root_zio; /* log writer root zio */
+ uint64_t zl_itx_seq; /* next itx sequence number */
+ uint64_t zl_commit_seq; /* committed up to this number */
+ uint64_t zl_lr_seq; /* log record sequence number */
+ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
+ uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
+ uint32_t zl_suspend; /* log suspend count */
+ kcondvar_t zl_cv_writer; /* log writer thread completion */
+ kcondvar_t zl_cv_suspend; /* log suspend completion */
+ uint8_t zl_suspending; /* log is currently suspending */
+ uint8_t zl_keep_first; /* keep first log block in destroy */
+ uint8_t zl_stop_replay; /* don't replay any further */
+ uint8_t zl_stop_sync; /* for debugging */
+ uint8_t zl_writer; /* boolean: write setup in progress */
+ uint8_t zl_log_error; /* boolean: log write error */
+ list_t zl_itx_list; /* in-memory itx list */
+ uint64_t zl_itx_list_sz; /* total size of records on list */
+ uint64_t zl_cur_used; /* current commit log size used */
+ uint64_t zl_prev_used; /* previous commit log size used */
+ list_t zl_lwb_list; /* in-flight log write list */
+ list_t zl_vdev_list; /* list of [vdev, seq] pairs */
+ uint8_t zl_vdev_bmap[ZIL_VDEV_BMSZ]; /* bitmap of vdevs */
+ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */
+ avl_tree_t zl_dva_tree; /* track DVAs during log parse */
+ clock_t zl_replay_time; /* lbolt of when replay started */
+ uint64_t zl_replay_blks; /* number of log blocks replayed */
+};
+
+typedef struct zil_dva_node {
+ dva_t zn_dva;
+ avl_node_t zn_node;
+} zil_dva_node_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_IMPL_H */
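A sketch of the vdev-flush bookkeeping described above: vdev numbers below ZIL_VDEV_BMSZ * 8 fit in the zl_vdev_bmap bitmap, while anything larger would go on the zl_vdev_list of zil_vdev_t entries (omitted here). The bit layout is an assumption consistent with the comment, not lifted from zil.c.

    #include <sys/zil_impl.h>

    static void
    example_mark_vdev(zilog_t *zilog, uint64_t vdev)
    {
            if (vdev < ZIL_VDEV_BMSZ * 8) {
                    /* one bit per low-numbered vdev */
                    zilog->zl_vdev_bmap[vdev / 8] |= (uint8_t)(1 << (vdev % 8));
            }
            /* else: append a zil_vdev_t to zilog->zl_vdev_list */
    }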
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
new file mode 100644
index 000000000000..b026ae6450c6
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -0,0 +1,366 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZIO_H
+#define _ZIO_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/dkio.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data block tail */
+
+typedef struct zio_block_tail {
+ uint64_t zbt_magic; /* for validation, endianness */
+ zio_cksum_t zbt_cksum; /* 256-bit checksum */
+} zio_block_tail_t;
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
+#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_block_tail_t) - \
+ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+ sizeof (uint64_t))
+
+#define ZIO_GET_IOSIZE(zio) \
+ (BP_IS_GANG((zio)->io_bp) ? \
+ SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp))
+
+typedef struct zio_gbh {
+ blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
+ uint64_t zg_filler[SPA_GBH_FILLER];
+ zio_block_tail_t zg_tail;
+} zio_gbh_phys_t;
+
+enum zio_checksum {
+ ZIO_CHECKSUM_INHERIT = 0,
+ ZIO_CHECKSUM_ON,
+ ZIO_CHECKSUM_OFF,
+ ZIO_CHECKSUM_LABEL,
+ ZIO_CHECKSUM_GANG_HEADER,
+ ZIO_CHECKSUM_ZILOG,
+ ZIO_CHECKSUM_FLETCHER_2,
+ ZIO_CHECKSUM_FLETCHER_4,
+ ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_FUNCTIONS
+};
+
+#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2
+#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+
+enum zio_compress {
+ ZIO_COMPRESS_INHERIT = 0,
+ ZIO_COMPRESS_ON,
+ ZIO_COMPRESS_OFF,
+ ZIO_COMPRESS_LZJB,
+ ZIO_COMPRESS_EMPTY,
+ ZIO_COMPRESS_GZIP_1,
+ ZIO_COMPRESS_GZIP_2,
+ ZIO_COMPRESS_GZIP_3,
+ ZIO_COMPRESS_GZIP_4,
+ ZIO_COMPRESS_GZIP_5,
+ ZIO_COMPRESS_GZIP_6,
+ ZIO_COMPRESS_GZIP_7,
+ ZIO_COMPRESS_GZIP_8,
+ ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_FUNCTIONS
+};
+
+#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
+#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+
+#define ZIO_PRIORITY_NOW (zio_priority_table[0])
+#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
+#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
+#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3])
+#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4])
+#define ZIO_PRIORITY_FREE (zio_priority_table[5])
+#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6])
+#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7])
+#define ZIO_PRIORITY_RESILVER (zio_priority_table[8])
+#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
+#define ZIO_PRIORITY_TABLE_SIZE 10
+
+#define ZIO_FLAG_MUSTSUCCEED 0x00000
+#define ZIO_FLAG_CANFAIL 0x00001
+#define ZIO_FLAG_FAILFAST 0x00002
+#define ZIO_FLAG_CONFIG_HELD 0x00004
+#define ZIO_FLAG_CONFIG_GRABBED 0x00008
+
+#define ZIO_FLAG_DONT_CACHE 0x00010
+#define ZIO_FLAG_DONT_QUEUE 0x00020
+#define ZIO_FLAG_DONT_PROPAGATE 0x00040
+#define ZIO_FLAG_DONT_RETRY 0x00080
+
+#define ZIO_FLAG_PHYSICAL 0x00100
+#define ZIO_FLAG_IO_BYPASS 0x00200
+#define ZIO_FLAG_IO_REPAIR 0x00400
+#define ZIO_FLAG_SPECULATIVE 0x00800
+
+#define ZIO_FLAG_RESILVER 0x01000
+#define ZIO_FLAG_SCRUB 0x02000
+#define ZIO_FLAG_SCRUB_THREAD 0x04000
+#define ZIO_FLAG_SUBBLOCK 0x08000
+
+#define ZIO_FLAG_NOBOOKMARK 0x10000
+#define ZIO_FLAG_USER 0x20000
+
+#define ZIO_FLAG_METADATA 0x40000
+
+#define ZIO_FLAG_GANG_INHERIT \
+ (ZIO_FLAG_CANFAIL | \
+ ZIO_FLAG_FAILFAST | \
+ ZIO_FLAG_CONFIG_HELD | \
+ ZIO_FLAG_DONT_RETRY | \
+ ZIO_FLAG_IO_REPAIR | \
+ ZIO_FLAG_SPECULATIVE | \
+ ZIO_FLAG_RESILVER | \
+ ZIO_FLAG_SCRUB | \
+ ZIO_FLAG_SCRUB_THREAD)
+
+#define ZIO_FLAG_VDEV_INHERIT \
+ (ZIO_FLAG_GANG_INHERIT | \
+ ZIO_FLAG_DONT_CACHE | \
+ ZIO_FLAG_PHYSICAL)
+
+/*
+ * We'll take the EILSEQ (Illegal byte sequence) errno
+ * to indicate checksum errors.
+ */
+#define ECKSUM EILSEQ
+
+typedef struct zio zio_t;
+typedef void zio_done_func_t(zio_t *zio);
+
+extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
+extern char *zio_type_name[ZIO_TYPES];
+
+/*
+ * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
+ * identifies any block in the pool. By convention, the meta-objset (MOS)
+ * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is
+ * level -1 of the meta-dnode, and intent log blocks (which are chained
+ * off the root block) have blkid == sequence number. In summary:
+ *
+ * mos is objset 0
+ * meta-dnode is object 0
+ * root block is <objset, 0, -1, 0>
+ * intent log is <objset, 0, -1, ZIL sequence number>
+ *
+ * Note: this structure is called a bookmark because its first purpose was
+ * to remember where to resume a pool-wide traverse. The absolute ordering
+ * for block visitation during traversal is defined in compare_bookmark().
+ *
+ * Note: this structure is passed between userland and the kernel.
+ * Therefore it must not change size or alignment between 32/64 bit
+ * compilation options.
+ */
+typedef struct zbookmark {
+ uint64_t zb_objset;
+ uint64_t zb_object;
+ int64_t zb_level;
+ uint64_t zb_blkid;
+} zbookmark_t;
+
+struct zio {
+ /* Core information about this I/O */
+ zio_t *io_parent;
+ zio_t *io_root;
+ spa_t *io_spa;
+ zbookmark_t io_bookmark;
+ enum zio_checksum io_checksum;
+ enum zio_compress io_compress;
+ int io_ndvas;
+ uint64_t io_txg;
+ blkptr_t *io_bp;
+ blkptr_t io_bp_copy;
+ zio_t *io_child;
+ zio_t *io_sibling_prev;
+ zio_t *io_sibling_next;
+ zio_transform_t *io_transform_stack;
+ zio_t *io_logical;
+
+ /* Callback info */
+ zio_done_func_t *io_ready;
+ zio_done_func_t *io_done;
+ void *io_private;
+ blkptr_t io_bp_orig;
+
+ /* Data represented by this I/O */
+ void *io_data;
+ uint64_t io_size;
+
+ /* Stuff for the vdev stack */
+ vdev_t *io_vd;
+ void *io_vsd;
+ uint64_t io_offset;
+ uint64_t io_deadline;
+ uint64_t io_timestamp;
+ avl_node_t io_offset_node;
+ avl_node_t io_deadline_node;
+ avl_tree_t *io_vdev_tree;
+ zio_t *io_delegate_list;
+ zio_t *io_delegate_next;
+
+ /* Internal pipeline state */
+ int io_flags;
+ enum zio_type io_type;
+ enum zio_stage io_stage;
+ uint8_t io_stalled;
+ uint8_t io_priority;
+ struct dk_callback io_dk_callback;
+ int io_cmd;
+ int io_retries;
+ int io_error;
+ uint32_t io_numerrors;
+ uint32_t io_pipeline;
+ uint32_t io_async_stages;
+ uint64_t io_children_notready;
+ uint64_t io_children_notdone;
+ void *io_waiter;
+ kmutex_t io_lock;
+ kcondvar_t io_cv;
+
+ /* FMA state */
+ uint64_t io_ena;
+};
+
+extern zio_t *zio_null(zio_t *pio, spa_t *spa,
+ zio_done_func_t *done, void *private, int flags);
+
+extern zio_t *zio_root(spa_t *spa,
+ zio_done_func_t *done, void *private, int flags);
+
+extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+ uint64_t size, zio_done_func_t *done, void *private,
+ int priority, int flags, zbookmark_t *zb);
+
+extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+ int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
+ int flags, zbookmark_t *zb);
+
+extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags,
+ zbookmark_t *zb);
+
+extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private);
+
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private);
+
+extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *data, int checksum,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *data, int checksum,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp,
+ blkptr_t *old_bp, uint64_t txg);
+extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
+
+extern int zio_wait(zio_t *zio);
+extern void zio_nowait(zio_t *zio);
+
+extern void *zio_buf_alloc(size_t size);
+extern void zio_buf_free(void *buf, size_t size);
+extern void *zio_data_buf_alloc(size_t size);
+extern void zio_data_buf_free(void *buf, size_t size);
+
+/*
+ * Move an I/O to the next stage of the pipeline and execute that stage.
+ * There's no locking on io_stage because there's no legitimate way for
+ * multiple threads to be attempting to process the same I/O.
+ */
+extern void zio_next_stage(zio_t *zio);
+extern void zio_next_stage_async(zio_t *zio);
+extern void zio_wait_children_done(zio_t *zio);
+
+/*
+ * Delegate I/O to a child vdev.
+ */
+extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
+ uint64_t offset, void *data, uint64_t size, int type, int priority,
+ int flags, zio_done_func_t *done, void *private);
+
+extern void zio_vdev_io_bypass(zio_t *zio);
+extern void zio_vdev_io_reissue(zio_t *zio);
+extern void zio_vdev_io_redone(zio_t *zio);
+
+extern void zio_checksum_verified(zio_t *zio);
+extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp);
+
+extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
+extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
+
+boolean_t zio_should_retry(zio_t *zio);
+
+/*
+ * Initial setup and teardown.
+ */
+extern void zio_init(void);
+extern void zio_fini(void);
+
+/*
+ * Fault injection
+ */
+struct zinject_record;
+extern uint32_t zio_injection_enabled;
+extern int zio_inject_fault(char *name, int flags, int *id,
+ struct zinject_record *record);
+extern int zio_inject_list_next(int *id, char *name, size_t buflen,
+ struct zinject_record *record);
+extern int zio_clear_fault(int id);
+extern int zio_handle_fault_injection(zio_t *zio, int error);
+extern int zio_handle_device_injection(vdev_t *vd, int error);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_H */
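The bookmark convention documented above is easy to show concretely; this sketch fills in the bookmark for an objset's root block, <objset, 0, -1, 0>.

    #include <sys/zio.h>

    static void
    example_root_bookmark(zbookmark_t *zb, uint64_t objset)
    {
            zb->zb_objset = objset;
            zb->zb_object = 0;      /* meta-dnode */
            zb->zb_level = -1;      /* root block sits above the dnode's tree */
            zb->zb_blkid = 0;
    }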
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
new file mode 100644
index 000000000000..bb7bd41e0bb3
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIO_CHECKSUM_H
+#define _SYS_ZIO_CHECKSUM_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Signature for checksum functions.
+ */
+typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
+
+/*
+ * Information about each checksum function.
+ */
+typedef struct zio_checksum_info {
+ zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
+ int ci_correctable; /* number of correctable bits */
+ int ci_zbt; /* uses zio block tail? */
+ char *ci_name; /* descriptive name */
+} zio_checksum_info_t;
+
+extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
+
+/*
+ * Checksum routines.
+ */
+extern zio_checksum_t fletcher_2_native;
+extern zio_checksum_t fletcher_4_native;
+extern zio_checksum_t fletcher_4_incremental_native;
+
+extern zio_checksum_t fletcher_2_byteswap;
+extern zio_checksum_t fletcher_4_byteswap;
+extern zio_checksum_t fletcher_4_incremental_byteswap;
+
+extern zio_checksum_t zio_checksum_SHA256;
+
+extern void zio_checksum(uint_t checksum, zio_cksum_t *zcp,
+ void *data, uint64_t size);
+extern int zio_checksum_error(zio_t *zio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_CHECKSUM_H */
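Every entry in zio_checksum_table provides functions with the zio_checksum_t signature above, one per byte order. The placeholder below shows only the shape of such a function; it assumes the ZIO_SET_CHECKSUM helper from spa.h and is not one of the real algorithms.

    #include <sys/zio_checksum.h>

    static void
    example_checksum_stub(const void *data, uint64_t size, zio_cksum_t *zcp)
    {
            (void) data;
            /* real functions (fletcher_2_native, SHA256, ...) fill all 4 words */
            ZIO_SET_CHECKSUM(zcp, 0, 0, 0, size);
    }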
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
new file mode 100644
index 000000000000..66ee8d45b3b6
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
@@ -0,0 +1,82 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIO_COMPRESS_H
+#define _SYS_ZIO_COMPRESS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Common signature for all zio compress/decompress functions.
+ */
+typedef size_t zio_compress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len, int);
+typedef int zio_decompress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len, int);
+
+/*
+ * Information about each compression function.
+ */
+typedef struct zio_compress_info {
+ zio_compress_func_t *ci_compress; /* compression function */
+ zio_decompress_func_t *ci_decompress; /* decompression function */
+ int ci_level; /* level parameter */
+ char *ci_name; /* algorithm name */
+} zio_compress_info_t;
+
+extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
+
+/*
+ * Compression routines.
+ */
+extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+
+/*
+ * Compress and decompress data if necessary.
+ */
+extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize,
+ void **destp, uint64_t *destsizep, uint64_t *destbufsizep);
+extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
+ void *dest, uint64_t destsize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_COMPRESS_H */
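A skeleton with the zio_compress_func_t shape above (src, dst, s_len, d_len, level). The return convention assumed here, the number of bytes produced with s_len meaning no space was saved, is only an illustration of how a compressor would plug into zio_compress_table, not a statement of the exact contract.

    #include <stddef.h>
    #include <string.h>

    static size_t
    example_identity_compress(void *src, void *dst, size_t s_len, size_t d_len,
        int level)
    {
            (void) level;
            if (d_len < s_len)
                    return (s_len);         /* does not fit: report no savings */
            (void) memcpy(dst, src, s_len);
            return (s_len);
    }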
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
new file mode 100644
index 000000000000..d2ddbc34e922
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
@@ -0,0 +1,205 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZIO_IMPL_H
+#define _ZIO_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * I/O Groups: pipeline stage definitions.
+ */
+
+typedef enum zio_stage {
+ ZIO_STAGE_OPEN = 0, /* RWFCI */
+ ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */
+
+ ZIO_STAGE_WRITE_COMPRESS, /* -W--- */
+ ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */
+
+ ZIO_STAGE_GANG_PIPELINE, /* -WFC- */
+
+ ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */
+ ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */
+ ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */
+ ZIO_STAGE_CLAIM_GANG_MEMBERS, /* ---C- */
+
+ ZIO_STAGE_DVA_ALLOCATE, /* -W--- */
+ ZIO_STAGE_DVA_FREE, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM, /* ---C- */
+
+ ZIO_STAGE_GANG_CHECKSUM_GENERATE, /* -W--- */
+
+ ZIO_STAGE_READY, /* RWFCI */
+
+ ZIO_STAGE_VDEV_IO_START, /* RW--I */
+ ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
+ ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
+
+ ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */
+
+ ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */
+ ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */
+ ZIO_STAGE_READ_DECOMPRESS, /* R---- */
+
+ ZIO_STAGE_DONE /* RWFCI */
+} zio_stage_t;
+
+/*
+ * The stages for which there's some performance value in going async.
+ * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well.
+ */
+#define ZIO_ASYNC_PIPELINE_STAGES \
+ ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
+ (1U << ZIO_STAGE_VDEV_IO_DONE) | \
+ (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
+ (1U << ZIO_STAGE_READ_DECOMPRESS))
+
+#define ZIO_VDEV_IO_PIPELINE \
+ ((1U << ZIO_STAGE_VDEV_IO_START) | \
+ (1U << ZIO_STAGE_VDEV_IO_DONE) | \
+ (1U << ZIO_STAGE_VDEV_IO_ASSESS))
+
+#define ZIO_READ_PHYS_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ ZIO_VDEV_IO_PIPELINE | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_READ_PIPELINE \
+ ZIO_READ_PHYS_PIPELINE
+
+#define ZIO_WRITE_PHYS_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
+ (1U << ZIO_STAGE_READY) | \
+ ZIO_VDEV_IO_PIPELINE | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_WRITE_COMMON_PIPELINE \
+ ZIO_WRITE_PHYS_PIPELINE
+
+#define ZIO_WRITE_PIPELINE \
+ ((1U << ZIO_STAGE_WRITE_COMPRESS) | \
+ ZIO_WRITE_COMMON_PIPELINE)
+
+#define ZIO_GANG_STAGES \
+ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
+ (1U << ZIO_STAGE_READ_GANG_MEMBERS))
+
+#define ZIO_REWRITE_PIPELINE \
+ ((1U << ZIO_STAGE_GANG_PIPELINE) | \
+ (1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
+ ZIO_WRITE_COMMON_PIPELINE)
+
+#define ZIO_WRITE_ALLOCATE_PIPELINE \
+ ((1U << ZIO_STAGE_DVA_ALLOCATE) | \
+ ZIO_WRITE_COMMON_PIPELINE)
+
+#define ZIO_GANG_FREE_STAGES \
+ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_FREE_GANG_MEMBERS))
+
+#define ZIO_FREE_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_GANG_PIPELINE) | \
+ (1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_DVA_FREE) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_CLAIM_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_GANG_PIPELINE) | \
+ (1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_DVA_CLAIM) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_IOCTL_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ ZIO_VDEV_IO_PIPELINE | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \
+ ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \
+ ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_VDEV_CHILD_PIPELINE \
+ (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \
+ ZIO_VDEV_IO_PIPELINE)
+
+#define ZIO_ERROR_PIPELINE_MASK \
+ ZIO_WAIT_FOR_CHILDREN_PIPELINE
+
+typedef struct zio_transform zio_transform_t;
+struct zio_transform {
+ void *zt_data;
+ uint64_t zt_size;
+ uint64_t zt_bufsize;
+ zio_transform_t *zt_next;
+};
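+
+/*
+ * Illustrative sketch (the helper and its name are assumptions):
+ * zio_transform_t above forms a singly-linked stack of buffer transforms,
+ * so adding one is a head insertion.
+ */
+static zio_transform_t *
+zio_transform_push_example(zio_transform_t *head, void *data, uint64_t size,
+    uint64_t bufsize)
+{
+	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+
+	zt->zt_data = data;
+	zt->zt_size = size;
+	zt->zt_bufsize = bufsize;
+	zt->zt_next = head;
+	return (zt);
+}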
+
+extern void zio_inject_init(void);
+extern void zio_inject_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
new file mode 100644
index 000000000000..df85824d59bd
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
@@ -0,0 +1,68 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZVOL_H
+#define _SYS_ZVOL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
+extern int zvol_check_volblocksize(uint64_t volblocksize);
+extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
+extern void zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx);
+extern int zvol_create_minor(const char *, dev_t);
+extern int zvol_remove_minor(const char *);
+extern int zvol_set_volsize(const char *, dev_t, uint64_t);
+extern int zvol_set_volblocksize(const char *, uint64_t);
+
+extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
+extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
+#ifndef __FreeBSD__
+extern int zvol_strategy(buf_t *bp);
+extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
+extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
+#endif
+extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
+ int *rvalp);
+extern int zvol_busy(void);
+extern void zvol_init(void);
+extern void zvol_fini(void);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZVOL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c
new file mode 100644
index 000000000000..844beb6864a5
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c
@@ -0,0 +1,611 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/txg_impl.h>
+#include <sys/dmu_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/callb.h>
+
+/*
+ * Pool-wide transaction groups.
+ */
+
+static void txg_sync_thread(void *arg);
+static void txg_quiesce_thread(void *arg);
+static void txg_timelimit_thread(void *arg);
+
+int txg_time = 5; /* max 5 seconds worth of delta per txg */
+
+/*
+ * Prepare the txg subsystem.
+ */
+void
+txg_init(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int c, i;
+ bzero(tx, sizeof (tx_state_t));
+
+ tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
+ for (c = 0; c < max_ncpus; c++) {
+ mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
+ for (i = 0; i < TXG_SIZE; i++)
+ cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL);
+ }
+
+ rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
+ mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_timeout_exit_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
+
+ tx->tx_open_txg = txg;
+}
+
+/*
+ * Close down the txg subsystem.
+ */
+void
+txg_fini(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int c, i;
+
+ ASSERT(tx->tx_threads == 0);
+
+ cv_destroy(&tx->tx_exit_cv);
+ cv_destroy(&tx->tx_timeout_exit_cv);
+ cv_destroy(&tx->tx_quiesce_done_cv);
+ cv_destroy(&tx->tx_quiesce_more_cv);
+ cv_destroy(&tx->tx_sync_done_cv);
+ cv_destroy(&tx->tx_sync_more_cv);
+ rw_destroy(&tx->tx_suspend);
+ mutex_destroy(&tx->tx_sync_lock);
+
+ for (c = 0; c < max_ncpus; c++) {
+ for (i = 0; i < TXG_SIZE; i++)
+ cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
+ mutex_destroy(&tx->tx_cpu[c].tc_lock);
+ }
+
+ kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
+
+ bzero(tx, sizeof (tx_state_t));
+}
+
+/*
+ * Start syncing transaction groups.
+ */
+void
+txg_sync_start(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ mutex_enter(&tx->tx_sync_lock);
+
+ dprintf("pool %p\n", dp);
+
+ ASSERT(tx->tx_threads == 0);
+
+ tx->tx_threads = 3;
+
+ tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
+ dp, 0, &p0, TS_RUN, minclsyspri);
+
+ tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
+ dp, 0, &p0, TS_RUN, minclsyspri);
+
+ tx->tx_timelimit_thread = thread_create(NULL, 0, txg_timelimit_thread,
+ dp, 0, &p0, TS_RUN, minclsyspri);
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+static void
+txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
+{
+ CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
+ mutex_enter(&tx->tx_sync_lock);
+}
+
+static void
+txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
+{
+ ASSERT(*tpp != NULL);
+ *tpp = NULL;
+ tx->tx_threads--;
+ cv_broadcast(&tx->tx_exit_cv);
+ CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */
+ thread_exit();
+}
+
+static void
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, int secmax)
+{
+ CALLB_CPR_SAFE_BEGIN(cpr);
+
+ if (secmax)
+ (void) cv_timedwait(cv, &tx->tx_sync_lock, secmax * hz);
+ else
+ cv_wait(cv, &tx->tx_sync_lock);
+
+ CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
+}
+
+/*
+ * Stop syncing transaction groups.
+ */
+void
+txg_sync_stop(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ dprintf("pool %p\n", dp);
+ /*
+ * Finish off any work in progress.
+ */
+ ASSERT(tx->tx_threads == 3);
+ txg_wait_synced(dp, 0);
+
+ /*
+ * Wake all 3 sync threads (one per state) and wait for them to die.
+ */
+ mutex_enter(&tx->tx_sync_lock);
+
+ ASSERT(tx->tx_threads == 3);
+
+ tx->tx_exiting = 1;
+
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ cv_broadcast(&tx->tx_quiesce_done_cv);
+ cv_broadcast(&tx->tx_sync_more_cv);
+ cv_broadcast(&tx->tx_timeout_exit_cv);
+
+ while (tx->tx_threads != 0)
+ cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
+
+ tx->tx_exiting = 0;
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+uint64_t
+txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
+ uint64_t txg;
+
+ mutex_enter(&tc->tc_lock);
+
+ txg = tx->tx_open_txg;
+ tc->tc_count[txg & TXG_MASK]++;
+
+ th->th_cpu = tc;
+ th->th_txg = txg;
+
+ return (txg);
+}
+
+void
+txg_rele_to_quiesce(txg_handle_t *th)
+{
+ tx_cpu_t *tc = th->th_cpu;
+
+ mutex_exit(&tc->tc_lock);
+}
+
+void
+txg_rele_to_sync(txg_handle_t *th)
+{
+ tx_cpu_t *tc = th->th_cpu;
+ int g = th->th_txg & TXG_MASK;
+
+ mutex_enter(&tc->tc_lock);
+ ASSERT(tc->tc_count[g] != 0);
+ if (--tc->tc_count[g] == 0)
+ cv_broadcast(&tc->tc_cv[g]);
+ mutex_exit(&tc->tc_lock);
+
+ th->th_cpu = NULL; /* defensive */
+}
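+
+/*
+ * Illustrative sketch of the hold/release protocol around the three
+ * routines above.  In the real code this sequence is driven by the DMU
+ * transaction code rather than called directly; the function is made up.
+ */
+static void
+txg_hold_example(dsl_pool_t *dp)
+{
+	txg_handle_t th;
+	uint64_t txg;
+
+	txg = txg_hold_open(dp, &th);	/* pin the currently open txg */
+	txg_rele_to_quiesce(&th);	/* drop the per-CPU lock, keep the hold */
+
+	/* ... record changes that must be part of 'txg' ... */
+
+	txg_rele_to_sync(&th);		/* drop the hold; 'txg' may now quiesce */
+	dprintf("changes recorded in txg %llu\n", txg);
+}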
+
+static void
+txg_quiesce(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int g = txg & TXG_MASK;
+ int c;
+
+ /*
+ * Grab all tx_cpu locks so nobody else can get into this txg.
+ */
+ for (c = 0; c < max_ncpus; c++)
+ mutex_enter(&tx->tx_cpu[c].tc_lock);
+
+ ASSERT(txg == tx->tx_open_txg);
+ tx->tx_open_txg++;
+
+ /*
+ * Now that we've incremented tx_open_txg, we can let threads
+ * enter the next transaction group.
+ */
+ for (c = 0; c < max_ncpus; c++)
+ mutex_exit(&tx->tx_cpu[c].tc_lock);
+
+ /*
+ * Quiesce the transaction group by waiting for everyone to txg_exit().
+ */
+ for (c = 0; c < max_ncpus; c++) {
+ tx_cpu_t *tc = &tx->tx_cpu[c];
+ mutex_enter(&tc->tc_lock);
+ while (tc->tc_count[g] != 0)
+ cv_wait(&tc->tc_cv[g], &tc->tc_lock);
+ mutex_exit(&tc->tc_lock);
+ }
+}
+
+static void
+txg_sync_thread(void *arg)
+{
+ dsl_pool_t *dp = arg;
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+
+ txg_thread_enter(tx, &cpr);
+
+ for (;;) {
+ uint64_t txg;
+
+ /*
+ * We sync when there's someone waiting on us, or the
+ * quiesce thread has handed off a txg to us.
+ */
+ while (!tx->tx_exiting &&
+ tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
+ tx->tx_quiesced_txg == 0) {
+ dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
+ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+ txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, 0);
+ }
+
+ /*
+ * Wait until the quiesce thread hands off a txg to us,
+ * prompting it to do so if necessary.
+ */
+ while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
+ if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
+ tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+ }
+
+ if (tx->tx_exiting)
+ txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
+
+ rw_enter(&tx->tx_suspend, RW_WRITER);
+
+ /*
+ * Consume the quiesced txg which has been handed off to
+ * us. This may cause the quiescing thread to now be
+ * able to quiesce another txg, so we must signal it.
+ */
+ txg = tx->tx_quiesced_txg;
+ tx->tx_quiesced_txg = 0;
+ tx->tx_syncing_txg = txg;
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ rw_exit(&tx->tx_suspend);
+
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting,
+ tx->tx_sync_txg_waiting);
+ mutex_exit(&tx->tx_sync_lock);
+ spa_sync(dp->dp_spa, txg);
+ mutex_enter(&tx->tx_sync_lock);
+ rw_enter(&tx->tx_suspend, RW_WRITER);
+ tx->tx_synced_txg = txg;
+ tx->tx_syncing_txg = 0;
+ rw_exit(&tx->tx_suspend);
+ cv_broadcast(&tx->tx_sync_done_cv);
+ }
+}
+
+static void
+txg_quiesce_thread(void *arg)
+{
+ dsl_pool_t *dp = arg;
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+
+ txg_thread_enter(tx, &cpr);
+
+ for (;;) {
+ uint64_t txg;
+
+ /*
+ * We quiesce when there's someone waiting on us.
+ * However, we can only have one txg in "quiescing" or
+ * "quiesced, waiting to sync" state. So we wait until
+ * the "quiesced, waiting to sync" txg has been consumed
+ * by the sync thread.
+ */
+ while (!tx->tx_exiting &&
+ (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
+ tx->tx_quiesced_txg != 0))
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
+
+ if (tx->tx_exiting)
+ txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
+
+ txg = tx->tx_open_txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting,
+ tx->tx_sync_txg_waiting);
+ mutex_exit(&tx->tx_sync_lock);
+ txg_quiesce(dp, txg);
+ mutex_enter(&tx->tx_sync_lock);
+
+ /*
+ * Hand this txg off to the sync thread.
+ */
+ dprintf("quiesce done, handing off txg %llu\n", txg);
+ tx->tx_quiesced_txg = txg;
+ cv_broadcast(&tx->tx_sync_more_cv);
+ cv_broadcast(&tx->tx_quiesce_done_cv);
+ }
+}
+
+void
+txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ mutex_enter(&tx->tx_sync_lock);
+ ASSERT(tx->tx_threads == 3);
+ if (txg == 0)
+ txg = tx->tx_open_txg;
+ if (tx->tx_sync_txg_waiting < txg)
+ tx->tx_sync_txg_waiting = txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ while (tx->tx_synced_txg < txg) {
+ dprintf("broadcasting sync more "
+ "tx_synced=%llu waiting=%llu dp=%p\n",
+ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+ cv_broadcast(&tx->tx_sync_more_cv);
+ cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+void
+txg_wait_open(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ mutex_enter(&tx->tx_sync_lock);
+ ASSERT(tx->tx_threads == 3);
+ if (txg == 0)
+ txg = tx->tx_open_txg + 1;
+ if (tx->tx_quiesce_txg_waiting < txg)
+ tx->tx_quiesce_txg_waiting = txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ while (tx->tx_open_txg < txg) {
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+static void
+txg_timelimit_thread(void *arg)
+{
+ dsl_pool_t *dp = arg;
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+
+ txg_thread_enter(tx, &cpr);
+
+ while (!tx->tx_exiting) {
+ uint64_t txg = tx->tx_open_txg + 1;
+
+ txg_thread_wait(tx, &cpr, &tx->tx_timeout_exit_cv, txg_time);
+
+ if (tx->tx_quiesce_txg_waiting < txg)
+ tx->tx_quiesce_txg_waiting = txg;
+
+ while (!tx->tx_exiting && tx->tx_open_txg < txg) {
+ dprintf("pushing out %llu\n", txg);
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+ }
+ }
+ txg_thread_exit(tx, &cpr, &tx->tx_timelimit_thread);
+}
+
+int
+txg_stalled(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
+}
+
+void
+txg_suspend(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ /* XXX some code paths suspend when they are already suspended! */
+ rw_enter(&tx->tx_suspend, RW_READER);
+}
+
+void
+txg_resume(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ rw_exit(&tx->tx_suspend);
+}
+
+/*
+ * Per-txg object lists.
+ */
+void
+txg_list_create(txg_list_t *tl, size_t offset)
+{
+ int t;
+
+ mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ tl->tl_offset = offset;
+
+ for (t = 0; t < TXG_SIZE; t++)
+ tl->tl_head[t] = NULL;
+}
+
+void
+txg_list_destroy(txg_list_t *tl)
+{
+ int t;
+
+ for (t = 0; t < TXG_SIZE; t++)
+ ASSERT(txg_list_empty(tl, t));
+
+ mutex_destroy(&tl->tl_lock);
+}
+
+int
+txg_list_empty(txg_list_t *tl, uint64_t txg)
+{
+ return (tl->tl_head[txg & TXG_MASK] == NULL);
+}
+
+/*
+ * Add an entry to the list.
+ * Returns 0 if it's a new entry, 1 if it's already there.
+ */
+int
+txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ int already_on_list;
+
+ mutex_enter(&tl->tl_lock);
+ already_on_list = tn->tn_member[t];
+ if (!already_on_list) {
+ tn->tn_member[t] = 1;
+ tn->tn_next[t] = tl->tl_head[t];
+ tl->tl_head[t] = tn;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (already_on_list);
+}
+
+/*
+ * Remove the head of the list and return it.
+ */
+void *
+txg_list_remove(txg_list_t *tl, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn;
+ void *p = NULL;
+
+ mutex_enter(&tl->tl_lock);
+ if ((tn = tl->tl_head[t]) != NULL) {
+ p = (char *)tn - tl->tl_offset;
+ tl->tl_head[t] = tn->tn_next[t];
+ tn->tn_next[t] = NULL;
+ tn->tn_member[t] = 0;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (p);
+}
+
+/*
+ * Remove a specific item from the list and return it.
+ */
+void *
+txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn, **tp;
+
+ mutex_enter(&tl->tl_lock);
+
+ for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
+ if ((char *)tn - tl->tl_offset == p) {
+ *tp = tn->tn_next[t];
+ tn->tn_next[t] = NULL;
+ tn->tn_member[t] = 0;
+ mutex_exit(&tl->tl_lock);
+ return (p);
+ }
+ }
+
+ mutex_exit(&tl->tl_lock);
+
+ return (NULL);
+}
+
+int
+txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+ return (tn->tn_member[t]);
+}
+
+/*
+ * Walk a txg list -- only safe if you know it's not changing.
+ */
+void *
+txg_list_head(txg_list_t *tl, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = tl->tl_head[t];
+
+ return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
+
+void *
+txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+ tn = tn->tn_next[t];
+
+ return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
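+
+/*
+ * Illustrative sketch (the structure and function below are made up):
+ * walking a per-txg list with txg_list_head()/txg_list_next().  Real users
+ * embed a txg_node_t in their own structures (e.g. metaslab_t, vdev_t) and
+ * create the list with the offset of that node.
+ */
+typedef struct txg_list_example {
+	txg_node_t tle_node;
+	uint64_t tle_id;
+} txg_list_example_t;
+
+static void
+txg_list_walk_example(txg_list_t *tl, uint64_t txg)
+{
+	txg_list_example_t *tle;
+
+	/* 'tl' must have been created with offsetof(txg_list_example_t, tle_node) */
+	for (tle = txg_list_head(tl, txg); tle != NULL;
+	    tle = txg_list_next(tl, tle, txg))
+		dprintf("object %llu is dirty in txg %llu\n", tle->tle_id, txg);
+}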
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c b/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
new file mode 100644
index 000000000000..34d7e0c3ac74
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/uberblock_impl.h>
+#include <sys/vdev_impl.h>
+
+int
+uberblock_verify(uberblock_t *ub)
+{
+ if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
+ byteswap_uint64_array(ub, sizeof (uberblock_t));
+
+ if (ub->ub_magic != UBERBLOCK_MAGIC)
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * Update the uberblock and return a boolean value indicating whether
+ * anything changed in this transaction group.
+ */
+int
+uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
+{
+ ASSERT(ub->ub_txg < txg);
+
+ /*
+ * We explicitly do not set ub_version here, so that older versions
+ * continue to be written with the previous uberblock version.
+ */
+ ub->ub_magic = UBERBLOCK_MAGIC;
+ ub->ub_txg = txg;
+ ub->ub_guid_sum = rvd->vdev_guid_sum;
+ ub->ub_timestamp = gethrestime_sec();
+
+ return (ub->ub_rootbp.blk_birth == txg);
+}
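+
+/*
+ * Illustrative sketch (the function below is made up): how the two
+ * routines above combine -- check the in-core uberblock, stamp it for
+ * 'txg', and note whether the root block pointer changed in this group.
+ */
+static void
+uberblock_example(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
+{
+	VERIFY(uberblock_verify(ub) == 0);
+
+	if (uberblock_update(ub, rvd, txg))
+		dprintf("root bp rewritten in txg %llu\n", txg);
+}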
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c
new file mode 100644
index 000000000000..b52e729d6294
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+
+static avl_tree_t unique_avl;
+static kmutex_t unique_mtx;	/* Initialized by SX_SYSINIT() below, not by unique_init(). */
+SX_SYSINIT(unique, &unique_mtx, "unique lock");
+
+typedef struct unique {
+ avl_node_t un_link;
+ uint64_t un_value;
+} unique_t;
+
+#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
+
+static int
+unique_compare(const void *a, const void *b)
+{
+ const unique_t *una = a;
+ const unique_t *unb = b;
+
+ if (una->un_value < unb->un_value)
+ return (-1);
+ if (una->un_value > unb->un_value)
+ return (+1);
+ return (0);
+}
+
+void
+unique_init(void)
+{
+ avl_create(&unique_avl, unique_compare,
+ sizeof (unique_t), offsetof(unique_t, un_link));
+}
+
+uint64_t
+unique_create(void)
+{
+ return (unique_insert(0));
+}
+
+uint64_t
+unique_insert(uint64_t value)
+{
+ avl_index_t idx;
+ unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP);
+
+ un->un_value = value;
+
+ mutex_enter(&unique_mtx);
+ while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK ||
+ avl_find(&unique_avl, un, &idx)) {
+ mutex_exit(&unique_mtx);
+ (void) random_get_pseudo_bytes((void*)&un->un_value,
+ sizeof (un->un_value));
+ un->un_value &= UNIQUE_MASK;
+ mutex_enter(&unique_mtx);
+ }
+
+ avl_insert(&unique_avl, un, idx);
+ mutex_exit(&unique_mtx);
+
+ return (un->un_value);
+}
+
+void
+unique_remove(uint64_t value)
+{
+ unique_t un_tofind;
+ unique_t *un;
+
+ un_tofind.un_value = value;
+ mutex_enter(&unique_mtx);
+ un = avl_find(&unique_avl, &un_tofind, NULL);
+ if (un != NULL) {
+ avl_remove(&unique_avl, un);
+ kmem_free(un, sizeof (unique_t));
+ }
+ mutex_exit(&unique_mtx);
+}
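+
+/*
+ * Illustrative sketch (the function below is made up; the real consumers
+ * live in the DSL dataset code): typical lifecycle of a unique value.
+ */
+static void
+unique_example(void)
+{
+	uint64_t id;
+
+	id = unique_create();	/* non-zero value that fits in UNIQUE_BITS bits */
+	/* ... use 'id' as a pool-wide unique tag ... */
+	unique_remove(id);	/* release it when its owner goes away */
+}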
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c
new file mode 100644
index 000000000000..0fceb8d5bda3
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -0,0 +1,1905 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/space_map.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
+
+/*
+ * Virtual device management.
+ */
+
+static vdev_ops_t *vdev_ops_table[] = {
+ &vdev_root_ops,
+ &vdev_raidz_ops,
+ &vdev_mirror_ops,
+ &vdev_replacing_ops,
+ &vdev_spare_ops,
+#ifdef _KERNEL
+ &vdev_geom_ops,
+#else
+ &vdev_disk_ops,
+ &vdev_file_ops,
+#endif
+ &vdev_missing_ops,
+ NULL
+};
+
+/* maximum scrub/resilver I/O queue */
+int zfs_scrub_limit = 70;
+
+/*
+ * Given a vdev type, return the appropriate ops vector.
+ */
+static vdev_ops_t *
+vdev_getops(const char *type)
+{
+ vdev_ops_t *ops, **opspp;
+
+ for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
+ if (strcmp(ops->vdev_op_type, type) == 0)
+ break;
+
+ return (ops);
+}
+
+/*
+ * Default asize function: return the MAX of psize with the asize of
+ * all children. This is what's used by anything other than RAID-Z.
+ */
+uint64_t
+vdev_default_asize(vdev_t *vd, uint64_t psize)
+{
+ uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
+ uint64_t csize;
+ uint64_t c;
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+ asize = MAX(asize, csize);
+ }
+
+ return (asize);
+}
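+
+/*
+ * Worked example (illustrative): with the top-level vdev's ashift at 12,
+ * a psize of 1000 bytes rounds up to 4096; a mirror then takes the MAX of
+ * that value and each child's own asize for the same psize.
+ */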
+
+/*
+ * Get the replaceable or attachable device size.
+ * If the parent is a mirror or raidz, the replaceable size is the minimum
+ * psize of all its children. For the rest, just return our own psize.
+ *
+ * e.g.
+ * psize rsize
+ * root - -
+ * mirror/raidz - -
+ * disk1 20g 20g
+ * disk2 40g 20g
+ * disk3 80g 80g
+ */
+uint64_t
+vdev_get_rsize(vdev_t *vd)
+{
+ vdev_t *pvd, *cvd;
+ uint64_t c, rsize;
+
+ pvd = vd->vdev_parent;
+
+ /*
+ * If our parent is NULL or the root, just return our own psize.
+ */
+ if (pvd == NULL || pvd->vdev_parent == NULL)
+ return (vd->vdev_psize);
+
+ rsize = 0;
+
+ for (c = 0; c < pvd->vdev_children; c++) {
+ cvd = pvd->vdev_child[c];
+ rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
+ }
+
+ return (rsize);
+}
+
+vdev_t *
+vdev_lookup_top(spa_t *spa, uint64_t vdev)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ if (vdev < rvd->vdev_children)
+ return (rvd->vdev_child[vdev]);
+
+ return (NULL);
+}
+
+vdev_t *
+vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
+{
+ int c;
+ vdev_t *mvd;
+
+ if (vd->vdev_guid == guid)
+ return (vd);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
+ NULL)
+ return (mvd);
+
+ return (NULL);
+}
+
+void
+vdev_add_child(vdev_t *pvd, vdev_t *cvd)
+{
+ size_t oldsize, newsize;
+ uint64_t id = cvd->vdev_id;
+ vdev_t **newchild;
+
+ ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
+ ASSERT(cvd->vdev_parent == NULL);
+
+ cvd->vdev_parent = pvd;
+
+ if (pvd == NULL)
+ return;
+
+ ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
+
+ oldsize = pvd->vdev_children * sizeof (vdev_t *);
+ pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
+ newsize = pvd->vdev_children * sizeof (vdev_t *);
+
+ newchild = kmem_zalloc(newsize, KM_SLEEP);
+ if (pvd->vdev_child != NULL) {
+ bcopy(pvd->vdev_child, newchild, oldsize);
+ kmem_free(pvd->vdev_child, oldsize);
+ }
+
+ pvd->vdev_child = newchild;
+ pvd->vdev_child[id] = cvd;
+
+ cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
+ ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
+
+ /*
+ * Walk up all ancestors to update guid sum.
+ */
+ for (; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum += cvd->vdev_guid_sum;
+
+ if (cvd->vdev_ops->vdev_op_leaf)
+ cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
+}
+
+void
+vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
+{
+ int c;
+ uint_t id = cvd->vdev_id;
+
+ ASSERT(cvd->vdev_parent == pvd);
+
+ if (pvd == NULL)
+ return;
+
+ ASSERT(id < pvd->vdev_children);
+ ASSERT(pvd->vdev_child[id] == cvd);
+
+ pvd->vdev_child[id] = NULL;
+ cvd->vdev_parent = NULL;
+
+ for (c = 0; c < pvd->vdev_children; c++)
+ if (pvd->vdev_child[c])
+ break;
+
+ if (c == pvd->vdev_children) {
+ kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
+ pvd->vdev_child = NULL;
+ pvd->vdev_children = 0;
+ }
+
+ /*
+ * Walk up all ancestors to update guid sum.
+ */
+ for (; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
+
+ if (cvd->vdev_ops->vdev_op_leaf)
+ cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
+}
+
+/*
+ * Remove any holes in the child array.
+ */
+void
+vdev_compact_children(vdev_t *pvd)
+{
+ vdev_t **newchild, *cvd;
+ int oldc = pvd->vdev_children;
+ int newc, c;
+
+ ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
+
+ for (c = newc = 0; c < oldc; c++)
+ if (pvd->vdev_child[c])
+ newc++;
+
+ newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
+
+ for (c = newc = 0; c < oldc; c++) {
+ if ((cvd = pvd->vdev_child[c]) != NULL) {
+ newchild[newc] = cvd;
+ cvd->vdev_id = newc++;
+ }
+ }
+
+ kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
+ pvd->vdev_child = newchild;
+ pvd->vdev_children = newc;
+}
+
+/*
+ * Allocate and minimally initialize a vdev_t.
+ */
+static vdev_t *
+vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
+{
+ vdev_t *vd;
+
+ vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+
+ if (spa->spa_root_vdev == NULL) {
+ ASSERT(ops == &vdev_root_ops);
+ spa->spa_root_vdev = vd;
+ }
+
+ if (guid == 0) {
+ if (spa->spa_root_vdev == vd) {
+ /*
+ * The root vdev's guid will also be the pool guid,
+ * which must be unique among all pools.
+ */
+ while (guid == 0 || spa_guid_exists(guid, 0))
+ guid = spa_get_random(-1ULL);
+ } else {
+ /*
+ * Any other vdev's guid must be unique within the pool.
+ */
+ while (guid == 0 ||
+ spa_guid_exists(spa_guid(spa), guid))
+ guid = spa_get_random(-1ULL);
+ }
+ ASSERT(!spa_guid_exists(spa_guid(spa), guid));
+ }
+
+ vd->vdev_spa = spa;
+ vd->vdev_id = id;
+ vd->vdev_guid = guid;
+ vd->vdev_guid_sum = guid;
+ vd->vdev_ops = ops;
+ vd->vdev_state = VDEV_STATE_CLOSED;
+
+ mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
+ space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+ space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+ txg_list_create(&vd->vdev_ms_list,
+ offsetof(struct metaslab, ms_txg_node));
+ txg_list_create(&vd->vdev_dtl_list,
+ offsetof(struct vdev, vdev_dtl_node));
+ vd->vdev_stat.vs_timestamp = gethrtime();
+
+ return (vd);
+}
+
+/*
+ * Free a vdev_t that has been removed from service.
+ */
+static void
+vdev_free_common(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd->vdev_path)
+ spa_strfree(vd->vdev_path);
+ if (vd->vdev_devid)
+ spa_strfree(vd->vdev_devid);
+
+ if (vd->vdev_isspare)
+ spa_spare_remove(vd);
+
+ txg_list_destroy(&vd->vdev_ms_list);
+ txg_list_destroy(&vd->vdev_dtl_list);
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_unload(&vd->vdev_dtl_map);
+ space_map_destroy(&vd->vdev_dtl_map);
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ space_map_destroy(&vd->vdev_dtl_scrub);
+ mutex_exit(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_stat_lock);
+
+ if (vd == spa->spa_root_vdev)
+ spa->spa_root_vdev = NULL;
+
+ kmem_free(vd, sizeof (vdev_t));
+}
+
+/*
+ * Allocate a new vdev. The 'alloctype' is used to control whether we are
+ * creating a new vdev or loading an existing one - the behavior is slightly
+ * different for each case.
+ */
+int
+vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
+ int alloctype)
+{
+ vdev_ops_t *ops;
+ char *type;
+ uint64_t guid = 0;
+ vdev_t *vd;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+ return (EINVAL);
+
+ if ((ops = vdev_getops(type)) == NULL)
+ return (EINVAL);
+
+ /*
+ * If this is a load, get the vdev guid from the nvlist.
+ * Otherwise, vdev_alloc_common() will generate one for us.
+ */
+ if (alloctype == VDEV_ALLOC_LOAD) {
+ uint64_t label_id;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
+ label_id != id)
+ return (EINVAL);
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (EINVAL);
+ } else if (alloctype == VDEV_ALLOC_SPARE) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (EINVAL);
+ }
+
+ /*
+ * The first allocated vdev must be of type 'root'.
+ */
+ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
+ return (EINVAL);
+
+ vd = vdev_alloc_common(spa, id, guid, ops);
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
+ vd->vdev_path = spa_strdup(vd->vdev_path);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
+ vd->vdev_devid = spa_strdup(vd->vdev_devid);
+
+ /*
+	 * Set the nparity property for RAID-Z vdevs.
+ */
+ if (ops == &vdev_raidz_ops) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+ &vd->vdev_nparity) == 0) {
+ /*
+ * Currently, we can only support 2 parity devices.
+ */
+ if (vd->vdev_nparity > 2)
+ return (EINVAL);
+ /*
+ * Older versions can only support 1 parity device.
+ */
+ if (vd->vdev_nparity == 2 &&
+ spa_version(spa) < ZFS_VERSION_RAID6)
+ return (ENOTSUP);
+
+ } else {
+ /*
+ * We require the parity to be specified for SPAs that
+ * support multiple parity levels.
+ */
+ if (spa_version(spa) >= ZFS_VERSION_RAID6)
+ return (EINVAL);
+
+ /*
+ * Otherwise, we default to 1 parity device for RAID-Z.
+ */
+ vd->vdev_nparity = 1;
+ }
+ } else {
+ vd->vdev_nparity = 0;
+ }
+
+ /*
+ * Set the whole_disk property. If it's not specified, leave the value
+ * as -1.
+ */
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ &vd->vdev_wholedisk) != 0)
+ vd->vdev_wholedisk = -1ULL;
+
+ /*
+ * Look for the 'not present' flag. This will only be set if the device
+ * was not present at the time of import.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &vd->vdev_not_present);
+
+ /*
+ * Get the alignment requirement.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+
+ /*
+ * If we're a top-level vdev, try to load the allocation parameters.
+ */
+ if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ &vd->vdev_ms_array);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ &vd->vdev_ms_shift);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ &vd->vdev_asize);
+ }
+
+ /*
+ * If we're a leaf vdev, try to load the DTL object and offline state.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
+ &vd->vdev_dtl.smo_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
+ &vd->vdev_offline);
+ }
+
+ /*
+ * Add ourselves to the parent's list of children.
+ */
+ vdev_add_child(parent, vd);
+
+ *vdp = vd;
+
+ return (0);
+}
+
+void
+vdev_free(vdev_t *vd)
+{
+ int c;
+
+ /*
+ * vdev_free() implies closing the vdev first. This is simpler than
+ * trying to ensure complicated semantics for all callers.
+ */
+ vdev_close(vd);
+
+ ASSERT(!list_link_active(&vd->vdev_dirty_node));
+
+ /*
+ * Free all children.
+ */
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_free(vd->vdev_child[c]);
+
+ ASSERT(vd->vdev_child == NULL);
+ ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+
+ /*
+ * Discard allocation state.
+ */
+ if (vd == vd->vdev_top)
+ vdev_metaslab_fini(vd);
+
+ ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
+ ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
+ ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
+
+ /*
+ * Remove this vdev from its parent's child list.
+ */
+ vdev_remove_child(vd->vdev_parent, vd);
+
+ ASSERT(vd->vdev_parent == NULL);
+
+ vdev_free_common(vd);
+}
+
+/*
+ * Transfer top-level vdev state from svd to tvd.
+ */
+static void
+vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
+{
+ spa_t *spa = svd->vdev_spa;
+ metaslab_t *msp;
+ vdev_t *vd;
+ int t;
+
+ ASSERT(tvd == tvd->vdev_top);
+
+ tvd->vdev_ms_array = svd->vdev_ms_array;
+ tvd->vdev_ms_shift = svd->vdev_ms_shift;
+ tvd->vdev_ms_count = svd->vdev_ms_count;
+
+ svd->vdev_ms_array = 0;
+ svd->vdev_ms_shift = 0;
+ svd->vdev_ms_count = 0;
+
+ tvd->vdev_mg = svd->vdev_mg;
+ tvd->vdev_ms = svd->vdev_ms;
+
+ svd->vdev_mg = NULL;
+ svd->vdev_ms = NULL;
+
+ if (tvd->vdev_mg != NULL)
+ tvd->vdev_mg->mg_vd = tvd;
+
+ tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
+ tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
+ tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
+
+ svd->vdev_stat.vs_alloc = 0;
+ svd->vdev_stat.vs_space = 0;
+ svd->vdev_stat.vs_dspace = 0;
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
+ (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
+ while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
+ (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
+ if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
+ (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
+ }
+
+ if (list_link_active(&svd->vdev_dirty_node)) {
+ vdev_config_clean(svd);
+ vdev_config_dirty(tvd);
+ }
+
+ tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted;
+ svd->vdev_reopen_wanted = 0;
+
+ tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
+ svd->vdev_deflate_ratio = 0;
+}
+
+static void
+vdev_top_update(vdev_t *tvd, vdev_t *vd)
+{
+ int c;
+
+ if (vd == NULL)
+ return;
+
+ vd->vdev_top = tvd;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_top_update(tvd, vd->vdev_child[c]);
+}
+
+/*
+ * Add a mirror/replacing vdev above an existing vdev.
+ */
+vdev_t *
+vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
+{
+ spa_t *spa = cvd->vdev_spa;
+ vdev_t *pvd = cvd->vdev_parent;
+ vdev_t *mvd;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
+
+ mvd->vdev_asize = cvd->vdev_asize;
+ mvd->vdev_ashift = cvd->vdev_ashift;
+ mvd->vdev_state = cvd->vdev_state;
+
+ vdev_remove_child(pvd, cvd);
+ vdev_add_child(pvd, mvd);
+ cvd->vdev_id = mvd->vdev_children;
+ vdev_add_child(mvd, cvd);
+ vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+ if (mvd == mvd->vdev_top)
+ vdev_top_transfer(cvd, mvd);
+
+ return (mvd);
+}
+
+/*
+ * Remove a 1-way mirror/replacing vdev from the tree.
+ */
+void
+vdev_remove_parent(vdev_t *cvd)
+{
+ vdev_t *mvd = cvd->vdev_parent;
+ vdev_t *pvd = mvd->vdev_parent;
+
+ ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
+
+ ASSERT(mvd->vdev_children == 1);
+ ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
+ mvd->vdev_ops == &vdev_replacing_ops ||
+ mvd->vdev_ops == &vdev_spare_ops);
+ cvd->vdev_ashift = mvd->vdev_ashift;
+
+ vdev_remove_child(mvd, cvd);
+ vdev_remove_child(pvd, mvd);
+ cvd->vdev_id = mvd->vdev_id;
+ vdev_add_child(pvd, cvd);
+ /*
+ * If we created a new toplevel vdev, then we need to change the child's
+ * vdev GUID to match the old toplevel vdev. Otherwise, we could have
+ * detached an offline device, and when we go to import the pool we'll
+ * think we have two toplevel vdevs, instead of a different version of
+ * the same toplevel vdev.
+ */
+ if (cvd->vdev_top == cvd) {
+ pvd->vdev_guid_sum -= cvd->vdev_guid;
+ cvd->vdev_guid_sum -= cvd->vdev_guid;
+ cvd->vdev_guid = mvd->vdev_guid;
+ cvd->vdev_guid_sum += mvd->vdev_guid;
+ pvd->vdev_guid_sum += cvd->vdev_guid;
+ }
+ vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+ if (cvd == cvd->vdev_top)
+ vdev_top_transfer(mvd, cvd);
+
+ ASSERT(mvd->vdev_children == 0);
+ vdev_free(mvd);
+}
+
+int
+vdev_metaslab_init(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ metaslab_class_t *mc = spa_metaslab_class_select(spa);
+ uint64_t m;
+ uint64_t oldc = vd->vdev_ms_count;
+ uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
+ metaslab_t **mspp;
+ int error;
+
+ if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
+ return (0);
+
+ dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
+
+ ASSERT(oldc <= newc);
+
+ if (vd->vdev_mg == NULL)
+ vd->vdev_mg = metaslab_group_create(mc, vd);
+
+ mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
+
+ if (oldc != 0) {
+ bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
+ kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
+ }
+
+ vd->vdev_ms = mspp;
+ vd->vdev_ms_count = newc;
+
+ for (m = oldc; m < newc; m++) {
+ space_map_obj_t smo = { 0, 0, 0 };
+ if (txg == 0) {
+ uint64_t object = 0;
+ error = dmu_read(mos, vd->vdev_ms_array,
+ m * sizeof (uint64_t), sizeof (uint64_t), &object);
+ if (error)
+ return (error);
+ if (object != 0) {
+ dmu_buf_t *db;
+ error = dmu_bonus_hold(mos, object, FTAG, &db);
+ if (error)
+ return (error);
+ ASSERT3U(db->db_size, ==, sizeof (smo));
+ bcopy(db->db_data, &smo, db->db_size);
+ ASSERT3U(smo.smo_object, ==, object);
+ dmu_buf_rele(db, FTAG);
+ }
+ }
+ vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
+ m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
+ }
+
+ return (0);
+}
+
+void
+vdev_metaslab_fini(vdev_t *vd)
+{
+ uint64_t m;
+ uint64_t count = vd->vdev_ms_count;
+
+ if (vd->vdev_ms != NULL) {
+ for (m = 0; m < count; m++)
+ if (vd->vdev_ms[m] != NULL)
+ metaslab_fini(vd->vdev_ms[m]);
+ kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
+ vd->vdev_ms = NULL;
+ }
+}
+
+/*
+ * Prepare a virtual device for access.
+ */
+int
+vdev_open(vdev_t *vd)
+{
+ int error;
+ int c;
+ uint64_t osize = 0;
+ uint64_t asize, psize;
+ uint64_t ashift = 0;
+
+ ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
+ vd->vdev_state == VDEV_STATE_CANT_OPEN ||
+ vd->vdev_state == VDEV_STATE_OFFLINE);
+
+ if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
+ vd->vdev_fault_arg >>= 1;
+ else
+ vd->vdev_fault_mode = VDEV_FAULT_NONE;
+
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vdev_cache_init(vd);
+ vdev_queue_init(vd);
+ vd->vdev_cache_active = B_TRUE;
+ }
+
+ if (vd->vdev_offline) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
+ return (ENXIO);
+ }
+
+ error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+
+ if (zio_injection_enabled && error == 0)
+ error = zio_handle_device_injection(vd, ENXIO);
+
+ dprintf("%s = %d, osize %llu, state = %d\n",
+ vdev_description(vd), error, osize, vd->vdev_state);
+
+ if (error) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ vd->vdev_stat.vs_aux);
+ return (error);
+ }
+
+ vd->vdev_state = VDEV_STATE_HEALTHY;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_NONE);
+ break;
+ }
+
+ osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
+
+ if (vd->vdev_children == 0) {
+ if (osize < SPA_MINDEVSIZE) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
+ return (EOVERFLOW);
+ }
+ psize = osize;
+ asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
+ } else {
+ if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
+ (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
+ return (EOVERFLOW);
+ }
+ psize = 0;
+ asize = osize;
+ }
+
+ vd->vdev_psize = psize;
+
+ if (vd->vdev_asize == 0) {
+ /*
+ * This is the first-ever open, so use the computed values.
+ * For testing purposes, a higher ashift can be requested.
+ */
+ vd->vdev_asize = asize;
+ vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
+ } else {
+ /*
+ * Make sure the alignment requirement hasn't increased.
+ */
+ if (ashift > vd->vdev_top->vdev_ashift) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (EINVAL);
+ }
+
+ /*
+ * Make sure the device hasn't shrunk.
+ */
+ if (asize < vd->vdev_asize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (EINVAL);
+ }
+
+ /*
+ * If all children are healthy and the asize has increased,
+ * then we've experienced dynamic LUN growth.
+ */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY &&
+ asize > vd->vdev_asize) {
+ vd->vdev_asize = asize;
+ }
+ }
+
+ /*
+ * If this is a top-level vdev, compute the raidz-deflation
+ * ratio. Note, we hard-code in 128k (1<<17) because it is the
+ * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE
+ * changes, this algorithm must never change, or we will
+ * inconsistently account for existing bp's.
+ */
+ if (vd->vdev_top == vd) {
+ vd->vdev_deflate_ratio = (1<<17) /
+ (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
+ }
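+
+	/*
+	 * Worked example (illustrative, assuming SPA_MINBLOCKSHIFT == 9):
+	 * for a plain disk, vdev_psize_to_asize(vd, 1<<17) == 1<<17, so the
+	 * ratio is 131072 / (131072 >> 9) == 512.  A RAID-Z vdev returns a
+	 * larger asize for the same psize, giving a smaller ratio.
+	 */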
+
+ /*
+ * This allows the ZFS DE to close cases appropriately. If a device
+ * goes away and later returns, we want to close the associated case.
+ * But it's not enough to simply post this only when a device goes from
+ * CANT_OPEN -> HEALTHY. If we reboot the system and the device is
+ * back, we also need to close the case (otherwise we will try to replay
+ * it). So we have to post this notifier every time. Since this only
+ * occurs during pool open or error recovery, this should not be an
+ * issue.
+ */
+ zfs_post_ok(vd->vdev_spa, vd);
+
+ return (0);
+}
+
+/*
+ * Called once the vdevs are all opened, this routine validates the label
+ * contents. This needs to be done before vdev_load() so that we don't
+ * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen()
+ * won't succeed if the device has been changed underneath.
+ *
+ * This function will only return failure if one of the vdevs indicates that it
+ * has since been destroyed or exported. This is only possible if
+ * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
+ * will be updated but the function will return 0.
+ */
+int
+vdev_validate(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ int c;
+ nvlist_t *label;
+ uint64_t guid;
+ uint64_t state;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if (vdev_validate(vd->vdev_child[c]) != 0)
+ return (-1);
+
+ /*
+ * If the device has already failed, or was marked offline, don't do
+ * any further validation. Otherwise, label I/O will fail and we will
+ * overwrite the previous state.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) {
+
+ if ((label = vdev_label_read_config(vd)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &guid) != 0 || guid != spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+ &guid) != 0 || guid != vd->vdev_guid) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ nvlist_free(label);
+
+ if (spa->spa_load_state == SPA_LOAD_OPEN &&
+ state != POOL_STATE_ACTIVE)
+ return (-1);
+ }
+
+ /*
+ * If we were able to open and validate a vdev that was previously
+ * marked permanently unavailable, clear that state now.
+ */
+ if (vd->vdev_not_present)
+ vd->vdev_not_present = 0;
+
+ return (0);
+}
+
+/*
+ * Close a virtual device.
+ */
+void
+vdev_close(vdev_t *vd)
+{
+ vd->vdev_ops->vdev_op_close(vd);
+
+ if (vd->vdev_cache_active) {
+ vdev_cache_fini(vd);
+ vdev_queue_fini(vd);
+ vd->vdev_cache_active = B_FALSE;
+ }
+
+ /*
+ * We record the previous state before we close it, so that if we are
+ * doing a reopen(), we don't generate FMA ereports if we notice that
+ * it's still faulted.
+ */
+ vd->vdev_prevstate = vd->vdev_state;
+
+ if (vd->vdev_offline)
+ vd->vdev_state = VDEV_STATE_OFFLINE;
+ else
+ vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+}
+
+void
+vdev_reopen(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ vdev_close(vd);
+ (void) vdev_open(vd);
+
+ /*
+ * Call vdev_validate() here to make sure we have the same device.
+ * Otherwise, a device with an invalid label could be successfully
+ * opened in response to vdev_reopen().
+ *
+ * The downside to this is that if the user is simply experimenting by
+ * overwriting an entire disk, we'll fault the device rather than
+ * demonstrate self-healing capabilities. On the other hand, with
+ * proper FMA integration, the series of errors we'd see from the device
+ * would result in a faulted device anyway. Given that this doesn't
+ * model any real-world corruption, it's better to catch this here and
+ * correctly identify that the device has either changed beneath us, or
+ * is corrupted beyond recognition.
+ */
+ (void) vdev_validate(vd);
+
+ /*
+ * Reassess root vdev's health.
+ */
+ vdev_propagate_state(spa->spa_root_vdev);
+}
+
+int
+vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
+{
+ int error;
+
+ /*
+ * Normally, partial opens (e.g. of a mirror) are allowed.
+ * For a create, however, we want to fail the request if
+ * there are any components we can't open.
+ */
+ error = vdev_open(vd);
+
+ if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_close(vd);
+ return (error ? error : ENXIO);
+ }
+
+ /*
+ * Recursively initialize all labels.
+ */
+ if ((error = vdev_label_init(vd, txg, isreplacing ?
+ VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
+ vdev_close(vd);
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * This is the latter half of vdev_create().  It is distinct because it
+ * involves initiating transactions in order to do metaslab creation.
+ * For creation, we want to try to create all vdevs at once and then undo it
+ * if anything fails; this is much harder if we have pending transactions.
+ */
+void
+vdev_init(vdev_t *vd, uint64_t txg)
+{
+ /*
+ * Aim for roughly 200 metaslabs per vdev.
+ */
+ vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
+ vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
+
+ /*
+ * Initialize the vdev's metaslabs. This can't fail because
+ * there's nothing to read when creating all new metaslabs.
+ */
+ VERIFY(vdev_metaslab_init(vd, txg) == 0);
+}
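+
+/*
+ * Worked example (illustrative): for a 1 TiB vdev, asize / 200 is about
+ * 5.5 GB, whose highbit() is 33, so each metaslab spans 1ULL << 33 bytes
+ * (8 GiB) and the vdev gets 128 metaslabs -- on the order of the 200
+ * aimed for above.
+ */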
+
+void
+vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
+{
+ ASSERT(vd == vd->vdev_top);
+ ASSERT(ISP2(flags));
+
+ if (flags & VDD_METASLAB)
+ (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
+
+ if (flags & VDD_DTL)
+ (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
+
+ (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
+}
+
+void
+vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
+{
+ mutex_enter(sm->sm_lock);
+ if (!space_map_contains(sm, txg, size))
+ space_map_add(sm, txg, size);
+ mutex_exit(sm->sm_lock);
+}
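+
+/*
+ * Example (illustrative): a write that could not reach a leaf vdev in
+ * transaction group 'txg' is typically recorded as a one-txg-wide gap,
+ * vdev_dtl_dirty(&vd->vdev_dtl_map, txg, 1), and excised again once a
+ * scrub/resilver has repaired that range (see vdev_dtl_reassess() below).
+ */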
+
+int
+vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
+{
+ int dirty;
+
+ /*
+ * Quick test without the lock -- covers the common case that
+ * there are no dirty time segments.
+ */
+ if (sm->sm_space == 0)
+ return (0);
+
+ mutex_enter(sm->sm_lock);
+ dirty = space_map_contains(sm, txg, size);
+ mutex_exit(sm->sm_lock);
+
+ return (dirty);
+}
+
+/*
+ * Reassess DTLs after a config change or scrub completion.
+ */
+void
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
+{
+ spa_t *spa = vd->vdev_spa;
+ int c;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ if (vd->vdev_children == 0) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ /*
+		 * We've successfully scrubbed everything up to scrub_txg.
+ * Therefore, excise all old DTLs up to that point, then
+ * fold in the DTLs for everything we couldn't scrub.
+ */
+ if (scrub_txg != 0) {
+ space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
+ space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
+ }
+ if (scrub_done)
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ mutex_exit(&vd->vdev_dtl_lock);
+ if (txg != 0)
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
+ return;
+ }
+
+ /*
+ * Make sure the DTLs are always correct under the scrub lock.
+ */
+ if (vd == spa->spa_root_vdev)
+ mutex_enter(&spa->spa_scrub_lock);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
+ space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
+ mutex_exit(&vd->vdev_dtl_lock);
+ }
+
+ if (vd == spa->spa_root_vdev)
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+static int
+vdev_dtl_load(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_obj_t *smo = &vd->vdev_dtl;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *db;
+ int error;
+
+ ASSERT(vd->vdev_children == 0);
+
+ if (smo->smo_object == 0)
+ return (0);
+
+ if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
+ return (error);
+
+ ASSERT3U(db->db_size, ==, sizeof (*smo));
+ bcopy(db->db_data, smo, db->db_size);
+ dmu_buf_rele(db, FTAG);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (error);
+}
+
+void
+vdev_dtl_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_obj_t *smo = &vd->vdev_dtl;
+ space_map_t *sm = &vd->vdev_dtl_map;
+ objset_t *mos = spa->spa_meta_objset;
+ space_map_t smsync;
+ kmutex_t smlock;
+ dmu_buf_t *db;
+ dmu_tx_t *tx;
+
+ dprintf("%s in txg %llu pass %d\n",
+ vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ if (vd->vdev_detached) {
+ if (smo->smo_object != 0) {
+ int err = dmu_object_free(mos, smo->smo_object, tx);
+ ASSERT3U(err, ==, 0);
+ smo->smo_object = 0;
+ }
+ dmu_tx_commit(tx);
+ dprintf("detach %s committed in txg %llu\n",
+ vdev_description(vd), txg);
+ return;
+ }
+
+ if (smo->smo_object == 0) {
+ ASSERT(smo->smo_objsize == 0);
+ ASSERT(smo->smo_alloc == 0);
+ smo->smo_object = dmu_object_alloc(mos,
+ DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
+ DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
+ ASSERT(smo->smo_object != 0);
+ vdev_config_dirty(vd->vdev_top);
+ }
+
+ bzero(&smlock, sizeof(smlock));
+ mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
+
+ space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
+ &smlock);
+
+ mutex_enter(&smlock);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_walk(sm, space_map_add, &smsync);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ space_map_truncate(smo, mos, tx);
+ space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
+
+ space_map_destroy(&smsync);
+
+ mutex_exit(&smlock);
+ mutex_destroy(&smlock);
+
+ VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(db->db_size, ==, sizeof (*smo));
+ bcopy(smo, db->db_data, db->db_size);
+ dmu_buf_rele(db, FTAG);
+
+ dmu_tx_commit(tx);
+}
+
+void
+vdev_load(vdev_t *vd)
+{
+ int c;
+
+ /*
+ * Recursively load all children.
+ */
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_load(vd->vdev_child[c]);
+
+ /*
+ * If this is a top-level vdev, initialize its metaslabs.
+ */
+ if (vd == vd->vdev_top &&
+ (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
+ vdev_metaslab_init(vd, 0) != 0))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+
+ /*
+ * If this is a leaf vdev, load its DTL.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+}
+
+/*
+ * This special case of vdev_spare() is used for hot spares. Its sole purpose
+ * is to set the vdev state for the associated vdev. To do this, we make sure
+ * that we can open the underlying device, then try to read the label, and make
+ * sure that the label is sane and that it hasn't been repurposed to another
+ * pool.
+ */
+int
+vdev_validate_spare(vdev_t *vd)
+{
+ nvlist_t *label;
+ uint64_t guid, version;
+ uint64_t state;
+
+ if ((label = vdev_label_read_config(vd)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (-1);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
+ version > ZFS_VERSION ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
+ guid != vd->vdev_guid ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (-1);
+ }
+
+ spa_spare_add(vd);
+
+ /*
+ * We don't actually check the pool state here. If it's in fact in
+ * use by another pool, we update this fact on the fly when requested.
+ */
+ nvlist_free(label);
+ return (0);
+}
+
+void
+vdev_sync_done(vdev_t *vd, uint64_t txg)
+{
+ metaslab_t *msp;
+
+ dprintf("%s txg %llu\n", vdev_description(vd), txg);
+
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL)
+ metaslab_sync_done(msp, txg);
+}
+
+void
+vdev_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *lvd;
+ metaslab_t *msp;
+ dmu_tx_t *tx;
+
+ dprintf("%s txg %llu pass %d\n",
+ vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
+
+ if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
+ ASSERT(vd == vd->vdev_top);
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
+ ASSERT(vd->vdev_ms_array != 0);
+ vdev_config_dirty(vd);
+ dmu_tx_commit(tx);
+ }
+
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
+ metaslab_sync(msp, txg);
+ (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
+ }
+
+ while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
+ vdev_dtl_sync(lvd, txg);
+
+ (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+}
+
+uint64_t
+vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
+{
+ return (vd->vdev_ops->vdev_op_asize(vd, psize));
+}
+
+void
+vdev_io_start(zio_t *zio)
+{
+ zio->io_vd->vdev_ops->vdev_op_io_start(zio);
+}
+
+void
+vdev_io_done(zio_t *zio)
+{
+ zio->io_vd->vdev_ops->vdev_op_io_done(zio);
+}
+
+const char *
+vdev_description(vdev_t *vd)
+{
+ if (vd == NULL || vd->vdev_ops == NULL)
+ return ("<unknown>");
+
+ if (vd->vdev_path != NULL)
+ return (vd->vdev_path);
+
+ if (vd->vdev_parent == NULL)
+ return (spa_name(vd->vdev_spa));
+
+ return (vd->vdev_ops->vdev_op_type);
+}
+
+int
+vdev_online(spa_t *spa, uint64_t guid)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ txg = spa_vdev_enter(spa);
+
+ rvd = spa->spa_root_vdev;
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ dprintf("ONLINE: %s\n", vdev_description(vd));
+
+ vd->vdev_offline = B_FALSE;
+ vd->vdev_tmpoffline = B_FALSE;
+ vdev_reopen(vd->vdev_top);
+
+ vdev_config_dirty(vd->vdev_top);
+
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+int
+vdev_offline(spa_t *spa, uint64_t guid, int istmp)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ txg = spa_vdev_enter(spa);
+
+ rvd = spa->spa_root_vdev;
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ dprintf("OFFLINE: %s\n", vdev_description(vd));
+
+ /*
+ * If the device isn't already offline, try to offline it.
+ */
+ if (!vd->vdev_offline) {
+ /*
+ * If this device's top-level vdev has a non-empty DTL,
+ * don't allow the device to be offlined.
+ *
+ * XXX -- make this more precise by allowing the offline
+ * as long as the remaining devices don't have any DTL holes.
+ */
+ if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * Offline this device and reopen its top-level vdev.
+ * If this action results in the top-level vdev becoming
+ * unusable, undo it and fail the request.
+ */
+ vd->vdev_offline = B_TRUE;
+ vdev_reopen(vd->vdev_top);
+ if (vdev_is_dead(vd->vdev_top)) {
+ vd->vdev_offline = B_FALSE;
+ vdev_reopen(vd->vdev_top);
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+ }
+ }
+
+ vd->vdev_tmpoffline = istmp;
+
+ vdev_config_dirty(vd->vdev_top);
+
+ return (spa_vdev_exit(spa, NULL, txg, 0));
+}
+
+/*
+ * Clear the error counts associated with this vdev. Unlike vdev_online() and
+ * vdev_offline(), we assume the spa config is locked. We also clear all
+ * children. If 'vd' is NULL, then the user wants to clear all vdevs.
+ */
+void
+vdev_clear(spa_t *spa, vdev_t *vd)
+{
+ int c;
+
+ if (vd == NULL)
+ vd = spa->spa_root_vdev;
+
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_clear(spa, vd->vdev_child[c]);
+}
+
+int
+vdev_is_dead(vdev_t *vd)
+{
+ return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
+}
+
+int
+vdev_error_inject(vdev_t *vd, zio_t *zio)
+{
+ int error = 0;
+
+ if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
+ return (0);
+
+ if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
+ return (0);
+
+ switch (vd->vdev_fault_mode) {
+ case VDEV_FAULT_RANDOM:
+ if (spa_get_random(vd->vdev_fault_arg) == 0)
+ error = EIO;
+ break;
+
+ case VDEV_FAULT_COUNT:
+ if ((int64_t)--vd->vdev_fault_arg <= 0)
+ vd->vdev_fault_mode = VDEV_FAULT_NONE;
+ error = EIO;
+ break;
+ }
+
+ if (error != 0) {
+ dprintf("returning %d for type %d on %s state %d offset %llx\n",
+ error, zio->io_type, vdev_description(vd),
+ vd->vdev_state, zio->io_offset);
+ }
+
+ return (error);
+}
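
A minimal userland sketch of the two fault modes above (illustrative only; it assumes spa_get_random(n) yields a uniform value in [0, n), substitutes rand(), and uses invented ex_* names): VDEV_FAULT_RANDOM fails roughly one in vdev_fault_arg matching I/Os, while VDEV_FAULT_COUNT fails the next vdev_fault_arg matching I/Os and then disarms itself.

/* Sketch only -- not part of the ZFS sources; ex_* names are invented. */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

#define EX_FAULT_NONE   0
#define EX_FAULT_RANDOM 1
#define EX_FAULT_COUNT  2

static int ex_fault_mode = EX_FAULT_RANDOM;
static long long ex_fault_arg = 100;    /* RANDOM: ~1-in-100 chance of EIO */

static int
ex_error_inject(void)
{
    int error = 0;

    switch (ex_fault_mode) {
    case EX_FAULT_RANDOM:
        /* Fail with probability 1/ex_fault_arg, standing in for spa_get_random(). */
        if (rand() % ex_fault_arg == 0)
            error = EIO;
        break;
    case EX_FAULT_COUNT:
        /* Fail every matching I/O until ex_fault_arg of them have failed. */
        if (--ex_fault_arg <= 0)
            ex_fault_mode = EX_FAULT_NONE;
        error = EIO;
        break;
    }
    return (error);
}

int
main(void)
{
    int i, hits = 0;

    for (i = 0; i < 100000; i++)
        if (ex_error_inject() == EIO)
            hits++;
    printf("injected EIO on %d of 100000 I/Os (about 1%% expected)\n", hits);
    return (0);
}
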
+
+/*
+ * Get statistics for the given vdev.
+ */
+void
+vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+{
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ int c, t;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+ vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
+ vs->vs_state = vd->vdev_state;
+ vs->vs_rsize = vdev_get_rsize(vd);
+ mutex_exit(&vd->vdev_stat_lock);
+
+ /*
+ * If we're getting stats on the root vdev, aggregate the I/O counts
+ * over all top-level vdevs (i.e. the direct children of the root).
+ */
+ if (vd == rvd) {
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+ vdev_stat_t *cvs = &cvd->vdev_stat;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ for (t = 0; t < ZIO_TYPES; t++) {
+ vs->vs_ops[t] += cvs->vs_ops[t];
+ vs->vs_bytes[t] += cvs->vs_bytes[t];
+ }
+ vs->vs_read_errors += cvs->vs_read_errors;
+ vs->vs_write_errors += cvs->vs_write_errors;
+ vs->vs_checksum_errors += cvs->vs_checksum_errors;
+ vs->vs_scrub_examined += cvs->vs_scrub_examined;
+ vs->vs_scrub_errors += cvs->vs_scrub_errors;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ }
+}
+
+void
+vdev_stat_update(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *pvd;
+ uint64_t txg = zio->io_txg;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ zio_type_t type = zio->io_type;
+ int flags = zio->io_flags;
+
+ if (zio->io_error == 0) {
+ if (!(flags & ZIO_FLAG_IO_BYPASS)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_ops[type]++;
+ vs->vs_bytes[type] += zio->io_size;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ if ((flags & ZIO_FLAG_IO_REPAIR) &&
+ zio->io_delegate_list == NULL) {
+ mutex_enter(&vd->vdev_stat_lock);
+ if (flags & ZIO_FLAG_SCRUB_THREAD)
+ vs->vs_scrub_repaired += zio->io_size;
+ else
+ vs->vs_self_healed += zio->io_size;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ return;
+ }
+
+ if (flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ if (!vdev_is_dead(vd)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ if (type == ZIO_TYPE_READ) {
+ if (zio->io_error == ECKSUM)
+ vs->vs_checksum_errors++;
+ else
+ vs->vs_read_errors++;
+ }
+ if (type == ZIO_TYPE_WRITE)
+ vs->vs_write_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ if (type == ZIO_TYPE_WRITE) {
+ if (txg == 0 || vd->vdev_children != 0)
+ return;
+ if (flags & ZIO_FLAG_SCRUB_THREAD) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
+ }
+ if (!(flags & ZIO_FLAG_IO_REPAIR)) {
+ if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
+ return;
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
+ }
+ }
+}
+
+void
+vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
+{
+ int c;
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
+
+ mutex_enter(&vd->vdev_stat_lock);
+
+ if (type == POOL_SCRUB_NONE) {
+ /*
+ * Update completion and end time. Leave everything else alone
+ * so we can report what happened during the previous scrub.
+ */
+ vs->vs_scrub_complete = complete;
+ vs->vs_scrub_end = gethrestime_sec();
+ } else {
+ vs->vs_scrub_type = type;
+ vs->vs_scrub_complete = 0;
+ vs->vs_scrub_examined = 0;
+ vs->vs_scrub_repaired = 0;
+ vs->vs_scrub_errors = 0;
+ vs->vs_scrub_start = gethrestime_sec();
+ vs->vs_scrub_end = 0;
+ }
+
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+/*
+ * Update the in-core space usage stats for this vdev and the root vdev.
+ */
+void
+vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta)
+{
+ ASSERT(vd == vd->vdev_top);
+ int64_t dspace_delta = space_delta;
+
+ do {
+ if (vd->vdev_ms_count) {
+ /*
+ * If this is a top-level vdev, apply the
+ * inverse of its psize-to-asize (i.e. RAID-Z)
+ * space-expansion factor. We must calculate
+ * this here and not at the root vdev because
+ * the root vdev's psize-to-asize is simply the
+ * max of its children's, thus not accurate
+ * enough for us.
+ */
+ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
+ dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
+ vd->vdev_deflate_ratio;
+ }
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_space += space_delta;
+ vd->vdev_stat.vs_alloc += alloc_delta;
+ vd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&vd->vdev_stat_lock);
+ } while ((vd = vd->vdev_parent) != NULL);
+}
+
+/*
+ * Mark a top-level vdev's config as dirty, placing it on the dirty list
+ * so that it will be written out next time the vdev configuration is synced.
+ * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
+ */
+void
+vdev_config_dirty(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int c;
+
+ /*
+ * The dirty list is protected by the config lock. The caller must
+ * either hold the config lock as writer, or must be the sync thread
+ * (which holds the lock as reader). There's only one sync thread,
+ * so this is sufficient to ensure mutual exclusion.
+ */
+ ASSERT(spa_config_held(spa, RW_WRITER) ||
+ dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ if (vd == rvd) {
+ for (c = 0; c < rvd->vdev_children; c++)
+ vdev_config_dirty(rvd->vdev_child[c]);
+ } else {
+ ASSERT(vd == vd->vdev_top);
+
+ if (!list_link_active(&vd->vdev_dirty_node))
+ list_insert_head(&spa->spa_dirty_list, vd);
+ }
+}
+
+void
+vdev_config_clean(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, RW_WRITER) ||
+ dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ ASSERT(list_link_active(&vd->vdev_dirty_node));
+ list_remove(&spa->spa_dirty_list, vd);
+}
+
+void
+vdev_propagate_state(vdev_t *vd)
+{
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ int degraded = 0, faulted = 0;
+ int corrupted = 0;
+ int c;
+ vdev_t *child;
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ child = vd->vdev_child[c];
+ if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
+ faulted++;
+ else if (child->vdev_state == VDEV_STATE_DEGRADED)
+ degraded++;
+
+ if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+ corrupted++;
+ }
+
+ vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+
+ /*
+ * Root special: if there is a toplevel vdev that cannot be
+ * opened due to corrupted metadata, then propagate the root
+ * vdev's aux state as 'corrupt' rather than 'insufficient
+ * replicas'.
+ */
+ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN)
+ vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+}
+
+/*
+ * Set a vdev's state. If this is during an open, we don't update the parent
+ * state, because we're in the process of opening children depth-first.
+ * Otherwise, we propagate the change to the parent.
+ *
+ * If this routine places a device in a faulted state, an appropriate ereport is
+ * generated.
+ */
+void
+vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
+{
+ uint64_t save_state;
+
+ if (state == vd->vdev_state) {
+ vd->vdev_stat.vs_aux = aux;
+ return;
+ }
+
+ save_state = vd->vdev_state;
+
+ vd->vdev_state = state;
+ vd->vdev_stat.vs_aux = aux;
+
+ if (state == VDEV_STATE_CANT_OPEN) {
+ /*
+ * If we fail to open a vdev during an import, we mark it as
+ * "not available", which signifies that it was never there to
+ * begin with. Failure to open such a device is not considered
+ * an error.
+ */
+ if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT &&
+ vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_not_present = 1;
+
+ /*
+ * Post the appropriate ereport. If the 'prevstate' field is
+ * set to something other than VDEV_STATE_UNKNOWN, it indicates
+ * that this is part of a vdev_reopen(). In this case, we don't
+ * want to post the ereport if the device was already in the
+ * CANT_OPEN state beforehand.
+ */
+ if (vd->vdev_prevstate != state && !vd->vdev_not_present &&
+ vd != vd->vdev_spa->spa_root_vdev) {
+ const char *class;
+
+ switch (aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
+ break;
+ case VDEV_AUX_CORRUPT_DATA:
+ class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
+ break;
+ case VDEV_AUX_NO_REPLICAS:
+ class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
+ break;
+ case VDEV_AUX_BAD_GUID_SUM:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
+ break;
+ case VDEV_AUX_TOO_SMALL:
+ class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
+ break;
+ case VDEV_AUX_BAD_LABEL:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
+ break;
+ default:
+ class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
+ }
+
+ zfs_ereport_post(class, vd->vdev_spa,
+ vd, NULL, save_state, 0);
+ }
+ }
+
+ if (isopen)
+ return;
+
+ if (vd->vdev_parent != NULL)
+ vdev_propagate_state(vd->vdev_parent);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
new file mode 100644
index 000000000000..b4fb96046945
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
@@ -0,0 +1,394 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+/*
+ * Virtual device read-ahead caching.
+ *
+ * This file implements a simple LRU read-ahead cache. When the DMU reads
+ * a given block, it will often want other, nearby blocks soon thereafter.
+ * We take advantage of this by reading a larger disk region and caching
+ * the result. In the best case, this can turn 256 back-to-back 512-byte
+ * reads into a single 128k read followed by 255 cache hits; this reduces
+ * latency dramatically. In the worst case, it can turn an isolated 512-byte
+ * read into a 128k read, which doesn't affect latency all that much but is
+ * terribly wasteful of bandwidth. A more intelligent version of the cache
+ * could keep track of access patterns and not do read-ahead unless it sees
+ * at least two temporally close I/Os to the same region. It could also
+ * take advantage of semantic information about the I/O. And it could use
+ * something faster than an AVL tree; that was chosen solely for convenience.
+ *
+ * There are five cache operations: allocate, fill, read, write, evict.
+ *
+ * (1) Allocate. This reserves a cache entry for the specified region.
+ * We separate the allocate and fill operations so that multiple threads
+ * don't generate I/O for the same cache miss.
+ *
+ * (2) Fill. When the I/O for a cache miss completes, the fill routine
+ * places the data in the previously allocated cache entry.
+ *
+ * (3) Read. Read data from the cache.
+ *
+ * (4) Write. Update cache contents after write completion.
+ *
+ * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
+ * if the total cache size exceeds zfs_vdev_cache_size.
+ */
+
+/*
+ * These tunables are for performance analysis.
+ */
+/*
+ * All i/os smaller than zfs_vdev_cache_max will be turned into
+ * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
+ * track buffer). At most zfs_vdev_cache_size bytes will be kept in each
+ * vdev's vdev_cache.
+ */
+int zfs_vdev_cache_max = 1<<14;
+int zfs_vdev_cache_size = 10ULL << 20;
+int zfs_vdev_cache_bshift = 16;
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
+TUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max);
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN,
+ &zfs_vdev_cache_max, 0, "Maximum I/O request size that increases read size");
+TUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size);
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN,
+ &zfs_vdev_cache_size, 0, "Size of VDEV cache");
+
+#define VCBS (1 << zfs_vdev_cache_bshift)
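
To make the tunables above concrete, here is a small standalone sketch (illustrative only, with invented EX_* names): with the default zfs_vdev_cache_bshift of 16, a cache line is 64K, so a 512-byte read at offset 70000 that qualifies for caching is turned into one 64K read of the line starting at 65536, and the caller's bytes are copied out at the line's phase. P2ALIGN and P2PHASE mirror the power-of-two macros used by vdev_cache_read() and vdev_cache_hit() below.

/* Sketch only; EX_* names are invented, the macros mirror zfs_context.h. */
#include <stdio.h>
#include <stdint.h>

#define EX_BSHIFT   16                      /* zfs_vdev_cache_bshift */
#define EX_VCBS     (1ULL << EX_BSHIFT)     /* 64K cache line */
#define P2ALIGN(x, align)   ((x) & -(align))
#define P2PHASE(x, align)   ((x) & ((align) - 1))

int
main(void)
{
    uint64_t io_offset = 70000, io_size = 512;
    uint64_t line = P2ALIGN(io_offset, EX_VCBS);    /* 65536 */
    uint64_t phase = P2PHASE(io_offset, EX_VCBS);   /* 4464 */

    /* The device sees one EX_VCBS-sized read; the caller gets io_size bytes at phase. */
    printf("read line at %llu, copy %llu bytes from phase %llu\n",
        (unsigned long long)line, (unsigned long long)io_size,
        (unsigned long long)phase);
    return (0);
}
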
+
+static int
+vdev_cache_offset_compare(const void *a1, const void *a2)
+{
+ const vdev_cache_entry_t *ve1 = a1;
+ const vdev_cache_entry_t *ve2 = a2;
+
+ if (ve1->ve_offset < ve2->ve_offset)
+ return (-1);
+ if (ve1->ve_offset > ve2->ve_offset)
+ return (1);
+ return (0);
+}
+
+static int
+vdev_cache_lastused_compare(const void *a1, const void *a2)
+{
+ const vdev_cache_entry_t *ve1 = a1;
+ const vdev_cache_entry_t *ve2 = a2;
+
+ if (ve1->ve_lastused < ve2->ve_lastused)
+ return (-1);
+ if (ve1->ve_lastused > ve2->ve_lastused)
+ return (1);
+
+ /*
+ * Among equally old entries, sort by offset to ensure uniqueness.
+ */
+ return (vdev_cache_offset_compare(a1, a2));
+}
+
+/*
+ * Evict the specified entry from the cache.
+ */
+static void
+vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
+{
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT(ve->ve_fill_io == NULL);
+ ASSERT(ve->ve_data != NULL);
+
+ dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n",
+ vc, ve->ve_offset, ve->ve_lastused, lbolt - ve->ve_lastused,
+ ve->ve_hits, ve->ve_missed_update);
+
+ avl_remove(&vc->vc_lastused_tree, ve);
+ avl_remove(&vc->vc_offset_tree, ve);
+ zio_buf_free(ve->ve_data, VCBS);
+ kmem_free(ve, sizeof (vdev_cache_entry_t));
+}
+
+/*
+ * Allocate an entry in the cache. At this point we don't have the data,
+ * we're just creating a placeholder so that multiple threads don't all
+ * go off and read the same blocks.
+ */
+static vdev_cache_entry_t *
+vdev_cache_allocate(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
+ vdev_cache_entry_t *ve;
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+
+ if (zfs_vdev_cache_size == 0)
+ return (NULL);
+
+ /*
+ * If adding a new entry would exceed the cache size,
+ * evict the oldest entry (LRU).
+ */
+ if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
+ zfs_vdev_cache_size) {
+ ve = avl_first(&vc->vc_lastused_tree);
+ if (ve->ve_fill_io != NULL) {
+ dprintf("can't evict in %p, still filling\n", vc);
+ return (NULL);
+ }
+ ASSERT(ve->ve_hits != 0);
+ vdev_cache_evict(vc, ve);
+ }
+
+ ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+ ve->ve_offset = offset;
+ ve->ve_lastused = lbolt;
+ ve->ve_data = zio_buf_alloc(VCBS);
+
+ avl_add(&vc->vc_offset_tree, ve);
+ avl_add(&vc->vc_lastused_tree, ve);
+
+ return (ve);
+}
+
+static void
+vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
+{
+ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT(ve->ve_fill_io == NULL);
+
+ if (ve->ve_lastused != lbolt) {
+ avl_remove(&vc->vc_lastused_tree, ve);
+ ve->ve_lastused = lbolt;
+ avl_add(&vc->vc_lastused_tree, ve);
+ }
+
+ ve->ve_hits++;
+ bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
+}
+
+/*
+ * Fill a previously allocated cache entry with data.
+ */
+static void
+vdev_cache_fill(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve = zio->io_private;
+ zio_t *dio;
+
+ ASSERT(zio->io_size == VCBS);
+
+ /*
+ * Add data to the cache.
+ */
+ mutex_enter(&vc->vc_lock);
+
+ ASSERT(ve->ve_fill_io == zio);
+ ASSERT(ve->ve_offset == zio->io_offset);
+ ASSERT(ve->ve_data == zio->io_data);
+
+ ve->ve_fill_io = NULL;
+
+ /*
+ * Even if this cache line was invalidated by a missed write update,
+ * any reads that were queued up before the missed update are still
+ * valid, so we can satisfy them from this line before we evict it.
+ */
+ for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next)
+ vdev_cache_hit(vc, ve, dio);
+
+ if (zio->io_error || ve->ve_missed_update)
+ vdev_cache_evict(vc, ve);
+
+ mutex_exit(&vc->vc_lock);
+
+ while ((dio = zio->io_delegate_list) != NULL) {
+ zio->io_delegate_list = dio->io_delegate_next;
+ dio->io_delegate_next = NULL;
+ dio->io_error = zio->io_error;
+ zio_next_stage(dio);
+ }
+}
+
+/*
+ * Read data from the cache. Returns 0 on cache hit, errno on a miss.
+ */
+int
+vdev_cache_read(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, ve_search;
+ uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
+ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+ zio_t *fio;
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
+ return (EINVAL);
+
+ if (zio->io_size > zfs_vdev_cache_max)
+ return (EOVERFLOW);
+
+ /*
+ * If the I/O straddles two or more cache blocks, don't cache it.
+ */
+ if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
+ return (EXDEV);
+
+ ASSERT(cache_phase + zio->io_size <= VCBS);
+
+ mutex_enter(&vc->vc_lock);
+
+ ve_search.ve_offset = cache_offset;
+ ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
+
+ if (ve != NULL) {
+ if (ve->ve_missed_update) {
+ mutex_exit(&vc->vc_lock);
+ return (ESTALE);
+ }
+
+ if ((fio = ve->ve_fill_io) != NULL) {
+ zio->io_delegate_next = fio->io_delegate_list;
+ fio->io_delegate_list = zio;
+ zio_vdev_io_bypass(zio);
+ mutex_exit(&vc->vc_lock);
+ return (0);
+ }
+
+ vdev_cache_hit(vc, ve, zio);
+ zio_vdev_io_bypass(zio);
+
+ mutex_exit(&vc->vc_lock);
+ zio_next_stage(zio);
+ return (0);
+ }
+
+ ve = vdev_cache_allocate(zio);
+
+ if (ve == NULL) {
+ mutex_exit(&vc->vc_lock);
+ return (ENOMEM);
+ }
+
+ fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
+ ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
+ vdev_cache_fill, ve);
+
+ ve->ve_fill_io = fio;
+ fio->io_delegate_list = zio;
+ zio_vdev_io_bypass(zio);
+
+ mutex_exit(&vc->vc_lock);
+ zio_nowait(fio);
+
+ return (0);
+}
+
+/*
+ * Update cache contents upon write completion.
+ */
+void
+vdev_cache_write(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, ve_search;
+ uint64_t io_start = zio->io_offset;
+ uint64_t io_end = io_start + zio->io_size;
+ uint64_t min_offset = P2ALIGN(io_start, VCBS);
+ uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
+ avl_index_t where;
+
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+ mutex_enter(&vc->vc_lock);
+
+ ve_search.ve_offset = min_offset;
+ ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
+
+ if (ve == NULL)
+ ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
+
+ while (ve != NULL && ve->ve_offset < max_offset) {
+ uint64_t start = MAX(ve->ve_offset, io_start);
+ uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
+
+ if (ve->ve_fill_io != NULL) {
+ ve->ve_missed_update = 1;
+ } else {
+ bcopy((char *)zio->io_data + start - io_start,
+ ve->ve_data + start - ve->ve_offset, end - start);
+ }
+ ve = AVL_NEXT(&vc->vc_offset_tree, ve);
+ }
+ mutex_exit(&vc->vc_lock);
+}
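
The MAX/MIN arithmetic in the loop above clips the write to each cache line it touches. A tiny sketch with assumed values (illustrative only, with invented EX_* names): a 200K write starting at 100K overlaps the line at 128K over exactly [128K, 192K), so 64K is copied starting at line offset 0.

/* Sketch only; the values and EX_* names are assumptions for illustration. */
#include <stdio.h>
#include <stdint.h>

#define EX_VCBS         (64ULL << 10)
#define EX_MAX(a, b)    ((a) > (b) ? (a) : (b))
#define EX_MIN(a, b)    ((a) < (b) ? (a) : (b))

int
main(void)
{
    uint64_t io_start = 100ULL << 10;               /* write covers [100K, 300K) */
    uint64_t io_end = 300ULL << 10;
    uint64_t ve_offset = 128ULL << 10;              /* cache line covers [128K, 192K) */
    uint64_t start = EX_MAX(ve_offset, io_start);   /* 128K */
    uint64_t end = EX_MIN(ve_offset + EX_VCBS, io_end); /* 192K */

    /* Only bytes [start, end) of the write land in this cache line. */
    printf("copy %llu bytes at line offset %llu\n",
        (unsigned long long)(end - start),
        (unsigned long long)(start - ve_offset));
    return (0);
}
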
+
+void
+vdev_cache_init(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+
+ mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_offset_node));
+
+ avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_lastused_node));
+}
+
+void
+vdev_cache_fini(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve;
+
+ mutex_enter(&vc->vc_lock);
+ while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
+ vdev_cache_evict(vc, ve);
+ mutex_exit(&vc->vc_lock);
+
+ avl_destroy(&vc->vc_offset_tree);
+ avl_destroy(&vc->vc_lastused_tree);
+
+ mutex_destroy(&vc->vc_lock);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
new file mode 100644
index 000000000000..b965b1c5f09f
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
@@ -0,0 +1,363 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/sunldi.h>
+
+/*
+ * Virtual device vector for disks.
+ */
+
+extern ldi_ident_t zfs_li;
+
+typedef struct vdev_disk_buf {
+ buf_t vdb_buf;
+ zio_t *vdb_io;
+} vdev_disk_buf_t;
+
+static int
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ vdev_disk_t *dvd;
+ struct dk_minfo dkm;
+ int error;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+ /*
+ * When opening a disk device, we want to preserve the user's original
+ * intent. We always want to open the device by the path the user gave
+ * us, even if it is one of multiple paths to the same device. But we
+ * also want to be able to survive disks being removed/recabled.
+ * Therefore the sequence of opening devices is:
+ *
+ * 1. Try opening the device by path. For legacy pools without the
+ * 'whole_disk' property, attempt to fix the path by appending 's0'.
+ *
+ * 2. If the devid of the device matches the stored value, return
+ * success.
+ *
+ * 3. Otherwise, the device may have moved. Try opening the device
+ * by the devid instead.
+ *
+ */
+ if (vd->vdev_devid != NULL) {
+ if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
+ &dvd->vd_minor) != 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+ }
+
+ error = EINVAL; /* presume failure */
+
+ if (vd->vdev_path != NULL) {
+ ddi_devid_t devid;
+
+ if (vd->vdev_wholedisk == -1ULL) {
+ size_t len = strlen(vd->vdev_path) + 3;
+ char *buf = kmem_alloc(len, KM_SLEEP);
+ ldi_handle_t lh;
+
+ (void) snprintf(buf, len, "%ss0", vd->vdev_path);
+
+ if (ldi_open_by_name(buf, spa_mode, kcred,
+ &lh, zfs_li) == 0) {
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = buf;
+ vd->vdev_wholedisk = 1ULL;
+ (void) ldi_close(lh, spa_mode, kcred);
+ } else {
+ kmem_free(buf, len);
+ }
+ }
+
+ error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
+ &dvd->vd_lh, zfs_li);
+
+ /*
+ * Compare the devid to the stored value.
+ */
+ if (error == 0 && vd->vdev_devid != NULL &&
+ ldi_get_devid(dvd->vd_lh, &devid) == 0) {
+ if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
+ error = EINVAL;
+ (void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+ dvd->vd_lh = NULL;
+ }
+ ddi_devid_free(devid);
+ }
+
+ /*
+ * If we succeeded in opening the device, but 'vdev_wholedisk'
+ * is not yet set, then this must be a slice.
+ */
+ if (error == 0 && vd->vdev_wholedisk == -1ULL)
+ vd->vdev_wholedisk = 0;
+ }
+
+ /*
+ * If we were unable to open by path, or the devid check fails, open by
+ * devid instead.
+ */
+ if (error != 0 && vd->vdev_devid != NULL)
+ error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
+ spa_mode, kcred, &dvd->vd_lh, zfs_li);
+
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ /*
+ * Determine the actual size of the device.
+ */
+ if (ldi_get_size(dvd->vd_lh, psize) != 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (EINVAL);
+ }
+
+ /*
+ * If we own the whole disk, try to enable disk write caching.
+ * We ignore errors because it's OK if we can't do it.
+ */
+ if (vd->vdev_wholedisk == 1) {
+ int wce = 1;
+ (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
+ FKIOCTL, kcred, NULL);
+ }
+
+ /*
+ * Determine the device's minimum transfer size.
+ * If the ioctl isn't supported, assume DEV_BSIZE.
+ */
+ if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm,
+ FKIOCTL, kcred, NULL) != 0)
+ dkm.dki_lbsize = DEV_BSIZE;
+
+ *ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1;
+
+ /*
+ * Clear the nowritecache bit, so that on a vdev_reopen() we will
+ * try again.
+ */
+ vd->vdev_nowritecache = B_FALSE;
+
+ return (0);
+}
+
+static void
+vdev_disk_close(vdev_t *vd)
+{
+ vdev_disk_t *dvd = vd->vdev_tsd;
+
+ if (dvd == NULL)
+ return;
+
+ dprintf("removing disk %s, devid %s\n",
+ vd->vdev_path ? vd->vdev_path : "<none>",
+ vd->vdev_devid ? vd->vdev_devid : "<none>");
+
+ if (dvd->vd_minor != NULL)
+ ddi_devid_str_free(dvd->vd_minor);
+
+ if (dvd->vd_devid != NULL)
+ ddi_devid_free(dvd->vd_devid);
+
+ if (dvd->vd_lh != NULL)
+ (void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+
+ kmem_free(dvd, sizeof (vdev_disk_t));
+ vd->vdev_tsd = NULL;
+}
+
+static void
+vdev_disk_io_intr(buf_t *bp)
+{
+ vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
+ zio_t *zio = vdb->vdb_io;
+
+ if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0)
+ zio->io_error = EIO;
+
+ kmem_free(vdb, sizeof (vdev_disk_buf_t));
+
+ zio_next_stage_async(zio);
+}
+
+static void
+vdev_disk_ioctl_done(void *zio_arg, int error)
+{
+ zio_t *zio = zio_arg;
+
+ zio->io_error = error;
+
+ zio_next_stage_async(zio);
+}
+
+static void
+vdev_disk_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ vdev_disk_buf_t *vdb;
+ buf_t *bp;
+ int flags, error;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ zio_vdev_io_bypass(zio);
+
+ /* XXPOLICY */
+ if (vdev_is_dead(vd)) {
+ zio->io_error = ENXIO;
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+
+ case DKIOCFLUSHWRITECACHE:
+
+ if (zfs_nocacheflush)
+ break;
+
+ if (vd->vdev_nowritecache) {
+ zio->io_error = ENOTSUP;
+ break;
+ }
+
+ zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
+ zio->io_dk_callback.dkc_cookie = zio;
+
+ error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
+ (uintptr_t)&zio->io_dk_callback,
+ FKIOCTL, kcred, NULL);
+
+ if (error == 0) {
+ /*
+ * The ioctl will be done asynchronously,
+ * and will call vdev_disk_ioctl_done()
+ * upon completion.
+ */
+ return;
+ } else if (error == ENOTSUP) {
+ /*
+ * If we get ENOTSUP, we know that no future
+ * attempts will ever succeed. In this case we
+ * set a persistent bit so that we don't bother
+ * with the ioctl in the future.
+ */
+ vd->vdev_nowritecache = B_TRUE;
+ }
+ zio->io_error = error;
+
+ break;
+
+ default:
+ zio->io_error = ENOTSUP;
+ }
+
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+ return;
+
+ if ((zio = vdev_queue_io(zio)) == NULL)
+ return;
+
+ flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
+ flags |= B_BUSY | B_NOCACHE;
+ if (zio->io_flags & ZIO_FLAG_FAILFAST)
+ flags |= B_FAILFAST;
+
+ vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
+
+ vdb->vdb_io = zio;
+ bp = &vdb->vdb_buf;
+
+ bioinit(bp);
+ bp->b_flags = flags;
+ bp->b_bcount = zio->io_size;
+ bp->b_un.b_addr = zio->io_data;
+ bp->b_lblkno = lbtodb(zio->io_offset);
+ bp->b_bufsize = zio->io_size;
+ bp->b_iodone = (int (*)())vdev_disk_io_intr;
+
+ /* XXPOLICY */
+ error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+ if (error) {
+ zio->io_error = error;
+ bioerror(bp, error);
+ bp->b_resid = bp->b_bcount;
+ bp->b_iodone(bp);
+ return;
+ }
+
+ error = ldi_strategy(dvd->vd_lh, bp);
+ /* ldi_strategy() will return non-zero only on programming errors */
+ ASSERT(error == 0);
+}
+
+static void
+vdev_disk_io_done(zio_t *zio)
+{
+ vdev_queue_io_done(zio);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ vdev_cache_write(zio);
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
+ zio_next_stage(zio);
+}
+
+vdev_ops_t vdev_disk_ops = {
+ vdev_disk_open,
+ vdev_disk_close,
+ vdev_default_asize,
+ vdev_disk_io_start,
+ vdev_disk_io_done,
+ NULL,
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
new file mode 100644
index 000000000000..b8e79f8c0c78
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
@@ -0,0 +1,225 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ vdev_file_t *vf;
+ vnode_t *vp;
+ vattr_t vattr;
+ int error;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+ /*
+ * We always open the files from the root of the global zone, even if
+ * we're in a local zone. If the user has gotten to this point, the
+ * administrator has already decided that the pool should be available
+ * to local zone users, so the underlying devices should be as well.
+ */
+ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+ error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX,
+ 0, &vp, 0, 0, rootdir);
+
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ vf->vf_vnode = vp;
+
+#ifdef _KERNEL
+ /*
+ * Make sure it's a regular file.
+ */
+ if (vp->v_type != VREG) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (ENODEV);
+ }
+#endif
+
+ /*
+ * Determine the physical size of the file.
+ */
+ vattr.va_mask = AT_SIZE;
+ error = VOP_GETATTR(vp, &vattr, 0, kcred);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ *psize = vattr.va_size;
+ *ashift = SPA_MINBLOCKSHIFT;
+
+ return (0);
+}
+
+static void
+vdev_file_close(vdev_t *vd)
+{
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (vf == NULL)
+ return;
+
+ if (vf->vf_vnode != NULL) {
+ (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred);
+ (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred);
+ VN_RELE(vf->vf_vnode);
+ }
+
+ kmem_free(vf, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+ ssize_t resid;
+ int error;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ zio_vdev_io_bypass(zio);
+
+ /* XXPOLICY */
+ if (vdev_is_dead(vd)) {
+ zio->io_error = ENXIO;
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+ zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
+ kcred);
+ dprintf("fsync(%s) = %d\n", vdev_description(vd),
+ zio->io_error);
+ break;
+ default:
+ zio->io_error = ENOTSUP;
+ }
+
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ /*
+ * In the kernel, don't bother double-caching, but in userland,
+ * we want to test the vdev_cache code.
+ */
+#ifndef _KERNEL
+ if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+ return;
+#endif
+
+ if ((zio = vdev_queue_io(zio)) == NULL)
+ return;
+
+ /* XXPOLICY */
+ error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+ if (error) {
+ zio->io_error = error;
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
+ UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
+ zio->io_size, zio->io_offset, UIO_SYSSPACE,
+ 0, RLIM64_INFINITY, kcred, &resid);
+
+ if (resid != 0 && zio->io_error == 0)
+ zio->io_error = ENOSPC;
+
+ zio_next_stage_async(zio);
+}
+
+static void
+vdev_file_io_done(zio_t *zio)
+{
+ vdev_queue_io_done(zio);
+
+#ifndef _KERNEL
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ vdev_cache_write(zio);
+#endif
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
+ zio_next_stage(zio);
+}
+
+vdev_ops_t vdev_file_ops = {
+ vdev_file_open,
+ vdev_file_close,
+ vdev_default_asize,
+ vdev_file_io_start,
+ vdev_file_io_done,
+ NULL,
+ VDEV_TYPE_FILE, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
+
+/*
+ * From userland we access disks just like files.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+ vdev_file_open,
+ vdev_file_close,
+ vdev_default_asize,
+ vdev_file_io_start,
+ vdev_file_io_done,
+ NULL,
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
+
+#endif
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
new file mode 100644
index 000000000000..969917175f1a
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
@@ -0,0 +1,432 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <geom/geom.h>
+
+/*
+ * Virtual device vector for GEOM.
+ */
+
+struct g_class zfs_vdev_class = {
+ .name = "ZFS::VDEV",
+ .version = G_VERSION,
+};
+
+DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
+
+typedef struct vdev_geom_ctx {
+ struct g_consumer *gc_consumer;
+ int gc_state;
+ struct bio_queue_head gc_queue;
+ struct mtx gc_queue_mtx;
+} vdev_geom_ctx_t;
+
+static void
+vdev_geom_release(vdev_t *vd)
+{
+ vdev_geom_ctx_t *ctx;
+
+ ctx = vd->vdev_tsd;
+ vd->vdev_tsd = NULL;
+
+ mtx_lock(&ctx->gc_queue_mtx);
+ ctx->gc_state = 1;
+ wakeup_one(&ctx->gc_queue);
+ while (ctx->gc_state != 2)
+ msleep(&ctx->gc_state, &ctx->gc_queue_mtx, 0, "vgeom:w", 0);
+ mtx_unlock(&ctx->gc_queue_mtx);
+ mtx_destroy(&ctx->gc_queue_mtx);
+ kmem_free(ctx, sizeof(*ctx));
+}
+
+static void
+vdev_geom_orphan(struct g_consumer *cp)
+{
+ struct g_geom *gp;
+ vdev_t *vd;
+ int error;
+
+ g_topology_assert();
+
+ vd = cp->private;
+ gp = cp->geom;
+ error = cp->provider->error;
+
+ ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
+ g_access(cp, -cp->acr, -cp->acw, -cp->ace);
+ ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
+ g_detach(cp);
+ g_destroy_consumer(cp);
+ /* Destroy geom if there are no consumers left. */
+ if (LIST_EMPTY(&gp->consumer)) {
+ ZFS_LOG(1, "Destroyed geom %s.", gp->name);
+ g_wither_geom(gp, error);
+ }
+ vdev_geom_release(vd);
+ /* Both methods below work, but in a bit different way. */
+#if 0
+ vd->vdev_reopen_wanted = 1;
+#else
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux);
+#endif
+}
+
+static struct g_consumer *
+vdev_geom_attach(struct g_provider *pp, int write)
+{
+ struct g_geom *gp;
+ struct g_consumer *cp;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Attaching to %s.", pp->name);
+ /* Do we have geom already? No? Create one. */
+ LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
+ if (!(gp->flags & G_GEOM_WITHER))
+ break;
+ }
+ if (gp == NULL) {
+ gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
+ gp->orphan = vdev_geom_orphan;
+ cp = g_new_consumer(gp);
+ if (g_attach(cp, pp) != 0) {
+ g_wither_geom(gp, ENXIO);
+ return (NULL);
+ }
+ if (g_access(cp, 1, write, 1) != 0) {
+ g_wither_geom(gp, ENXIO);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
+ } else {
+ /* Check if we are already connected to this provider. */
+ LIST_FOREACH(cp, &gp->consumer, consumer) {
+ if (cp->provider == pp) {
+ ZFS_LOG(1, "Found consumer for %s.", pp->name);
+ break;
+ }
+ }
+ if (cp == NULL) {
+ cp = g_new_consumer(gp);
+ if (g_attach(cp, pp) != 0) {
+ g_destroy_consumer(cp);
+ return (NULL);
+ }
+ if (g_access(cp, 1, write, 1) != 0) {
+ g_detach(cp);
+ g_destroy_consumer(cp);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Created consumer for %s.", pp->name);
+ } else {
+ if (g_access(cp, 1, cp->acw > 0 ? 0 : write, 1) != 0)
+ return (NULL);
+ ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
+ }
+ }
+ return (cp);
+}
+
+static void
+vdev_geom_detach(void *arg, int flag __unused)
+{
+ struct g_geom *gp;
+ struct g_consumer *cp;
+
+ g_topology_assert();
+ cp = arg;
+ gp = cp->geom;
+
+ ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
+ g_access(cp, -1, 0, -1);
+ /* Destroy consumer on last close. */
+ if (cp->acr == 0 && cp->ace == 0) {
+ ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
+ if (cp->acw > 0)
+ g_access(cp, 0, -cp->acw, 0);
+ g_detach(cp);
+ g_destroy_consumer(cp);
+ }
+ /* Destroy geom if there are no consumers left. */
+ if (LIST_EMPTY(&gp->consumer)) {
+ ZFS_LOG(1, "Destroyed geom %s.", gp->name);
+ g_wither_geom(gp, ENXIO);
+ }
+}
+
+static void
+vdev_geom_worker(void *arg)
+{
+ vdev_geom_ctx_t *ctx;
+ zio_t *zio;
+ struct bio *bp;
+
+ ctx = arg;
+ for (;;) {
+ mtx_lock(&ctx->gc_queue_mtx);
+ bp = bioq_takefirst(&ctx->gc_queue);
+ if (bp == NULL) {
+ if (ctx->gc_state == 1) {
+ ctx->gc_state = 2;
+ wakeup_one(&ctx->gc_state);
+ mtx_unlock(&ctx->gc_queue_mtx);
+ kthread_exit(0);
+ }
+ msleep(&ctx->gc_queue, &ctx->gc_queue_mtx,
+ PRIBIO | PDROP, "vgeom:io", 0);
+ continue;
+ }
+ mtx_unlock(&ctx->gc_queue_mtx);
+ zio = bp->bio_caller1;
+ zio->io_error = bp->bio_error;
+ if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) {
+ vdev_t *vd;
+
+ /*
+ * If we get ENOTSUP, we know that no future
+ * attempts will ever succeed. In this case we
+ * set a persistent bit so that we don't bother
+ * with the flush in the future.
+ */
+ vd = zio->io_vd;
+ vd->vdev_nowritecache = B_TRUE;
+ }
+ g_destroy_bio(bp);
+ zio_next_stage_async(zio);
+ }
+}
+
+static int
+vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ vdev_geom_ctx_t *ctx;
+ struct g_provider *pp;
+ struct g_consumer *cp;
+ int owned;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ if ((owned = mtx_owned(&Giant)))
+ mtx_unlock(&Giant);
+ g_topology_lock();
+ pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
+ if (pp == NULL) {
+ g_topology_unlock();
+ if (owned)
+ mtx_lock(&Giant);
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+ cp = vdev_geom_attach(pp, !!(spa_mode & FWRITE));
+ g_topology_unlock();
+ if (owned)
+ mtx_lock(&Giant);
+ if (cp == NULL) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (EACCES);
+ }
+
+ /*
+ * Determine the actual size of the device.
+ */
+ *psize = pp->mediasize;
+
+ /*
+ * Determine the device's minimum transfer size.
+ */
+ *ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
+
+ /*
+ * Clear the nowritecache bit, so that on a vdev_reopen() we will
+ * try again.
+ */
+ vd->vdev_nowritecache = B_FALSE;
+
+ cp->private = vd;
+
+ ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP);
+ bioq_init(&ctx->gc_queue);
+ mtx_init(&ctx->gc_queue_mtx, "zfs:vdev:geom:queue", NULL, MTX_DEF);
+ ctx->gc_consumer = cp;
+ ctx->gc_state = 0;
+
+ vd->vdev_tsd = ctx;
+
+ kthread_create(vdev_geom_worker, ctx, NULL, 0, 0, "vdev:worker %s",
+ pp->name);
+
+ return (0);
+}
+
+static void
+vdev_geom_close(vdev_t *vd)
+{
+ vdev_geom_ctx_t *ctx;
+ struct g_consumer *cp;
+
+ if ((ctx = vd->vdev_tsd) == NULL)
+ return;
+ if ((cp = ctx->gc_consumer) == NULL)
+ return;
+ vdev_geom_release(vd);
+ g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
+}
+
+static void
+vdev_geom_io_intr(struct bio *bp)
+{
+ vdev_geom_ctx_t *ctx;
+ zio_t *zio;
+
+ zio = bp->bio_caller1;
+ ctx = zio->io_vd->vdev_tsd;
+
+ mtx_lock(&ctx->gc_queue_mtx);
+ bioq_insert_tail(&ctx->gc_queue, bp);
+ wakeup_one(&ctx->gc_queue);
+ mtx_unlock(&ctx->gc_queue_mtx);
+}
+
+static void
+vdev_geom_io_start(zio_t *zio)
+{
+ vdev_t *vd;
+ vdev_geom_ctx_t *ctx;
+ struct g_consumer *cp;
+ struct bio *bp;
+ int error;
+
+ cp = NULL;
+
+ vd = zio->io_vd;
+ ctx = vd->vdev_tsd;
+ if (ctx != NULL)
+ cp = ctx->gc_consumer;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ zio_vdev_io_bypass(zio);
+
+ /* XXPOLICY */
+ if (vdev_is_dead(vd)) {
+ zio->io_error = ENXIO;
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+
+ case DKIOCFLUSHWRITECACHE:
+ if (vd->vdev_nowritecache) {
+ zio->io_error = ENOTSUP;
+ break;
+ }
+
+ goto sendreq;
+ default:
+ zio->io_error = ENOTSUP;
+ }
+
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+ return;
+
+ if ((zio = vdev_queue_io(zio)) == NULL)
+ return;
+
+sendreq:
+
+ error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+ if (error == 0 && cp == NULL)
+ error = ENXIO;
+ if (error) {
+ zio->io_error = error;
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ bp = g_alloc_bio();
+ bp->bio_caller1 = zio;
+ switch (zio->io_type) {
+ case ZIO_TYPE_READ:
+ case ZIO_TYPE_WRITE:
+ bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
+ bp->bio_data = zio->io_data;
+ bp->bio_offset = zio->io_offset;
+ bp->bio_length = zio->io_size;
+ break;
+ case ZIO_TYPE_IOCTL:
+ bp->bio_cmd = BIO_FLUSH;
+ bp->bio_data = NULL;
+ bp->bio_offset = cp->provider->mediasize;
+ bp->bio_length = 0;
+ break;
+ }
+ bp->bio_done = vdev_geom_io_intr;
+
+ g_io_request(bp, cp);
+}
+
+static void
+vdev_geom_io_done(zio_t *zio)
+{
+ vdev_queue_io_done(zio);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ vdev_cache_write(zio);
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
+ zio_next_stage(zio);
+}
+
+vdev_ops_t vdev_geom_ops = {
+ vdev_geom_open,
+ vdev_geom_close,
+ vdev_default_asize,
+ vdev_geom_io_start,
+ vdev_geom_io_done,
+ NULL,
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
new file mode 100644
index 000000000000..9d9f5556fa08
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -0,0 +1,1011 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Virtual Device Labels
+ * ---------------------
+ *
+ * The vdev label serves several distinct purposes:
+ *
+ * 1. Uniquely identify this device as part of a ZFS pool and confirm its
+ * identity within the pool.
+ *
+ * 2. Verify that all the devices given in a configuration are present
+ * within the pool.
+ *
+ * 3. Determine the uberblock for the pool.
+ *
+ * 4. In case of an import operation, determine the configuration of the
+ * toplevel vdev of which it is a part.
+ *
+ * 5. If an import operation cannot find all the devices in the pool,
+ * provide enough information to the administrator to determine which
+ * devices are missing.
+ *
+ * It is important to note that while the kernel is responsible for writing the
+ * label, it only consumes the information in the first three cases. The
+ * latter information is only consumed in userland when determining the
+ * configuration to import a pool.
+ *
+ *
+ * Label Organization
+ * ------------------
+ *
+ * Before describing the contents of the label, it's important to understand how
+ * the labels are written and updated with respect to the uberblock.
+ *
+ * When the pool configuration is altered, either because it was newly created
+ * or a device was added, we want to update all the labels such that we can deal
+ * with fatal failure at any point. To this end, each disk has two labels which
+ * are updated before and after the uberblock is synced. Assuming we have
+ * labels and an uberblock with the following transaction groups:
+ *
+ * L1 UB L2
+ * +------+ +------+ +------+
+ * | | | | | |
+ * | t10 | | t10 | | t10 |
+ * | | | | | |
+ * +------+ +------+ +------+
+ *
+ * In this stable state, the labels and the uberblock were all updated within
+ * the same transaction group (10). Each label is mirrored and checksummed, so
+ * that we can detect when we fail partway through writing the label.
+ *
+ * In order to identify which labels are valid, the labels are written in the
+ * following manner:
+ *
+ * 1. For each vdev, update 'L1' to the new label
+ * 2. Update the uberblock
+ * 3. For each vdev, update 'L2' to the new label
+ *
+ * Given arbitrary failure, we can determine the correct label to use based on
+ * the transaction group. If we fail after updating L1 but before updating the
+ * UB, we will notice that L1's transaction group is greater than the uberblock,
+ * so L2 must be valid. If we fail after writing the uberblock but before
+ * writing L2, we will notice that L2's transaction group is less than L1, and
+ * therefore L1 is valid.
+ *
+ * Another added complexity is that not every label is updated when the config
+ * is synced. If we add a single device, we do not want to have to re-write
+ * every label for every device in the pool. This means that both L1 and L2 may
+ * be older than the pool uberblock, because the necessary information is stored
+ * on another vdev.
+ *
+ *
+ * On-disk Format
+ * --------------
+ *
+ * The vdev label consists of two distinct parts, and is wrapped within the
+ * vdev_label_t structure. The label includes 8k of padding to permit legacy
+ * VTOC disk labels; the padding is otherwise ignored.
+ *
+ * The first half of the label is a packed nvlist which contains pool wide
+ * properties, per-vdev properties, and configuration information. It is
+ * described in more detail below.
+ *
+ * The latter half of the label consists of a redundant array of uberblocks.
+ * These uberblocks are updated whenever a transaction group is committed,
+ * or when the configuration is updated. When a pool is loaded, we scan each
+ * vdev for the 'best' uberblock.
+ *
+ *
+ * Configuration Information
+ * -------------------------
+ *
+ * The nvlist describing the pool and vdev contains the following elements:
+ *
+ * version ZFS on-disk version
+ * name Pool name
+ * state Pool state
+ * txg Transaction group in which this label was written
+ * pool_guid Unique identifier for this pool
+ * vdev_tree An nvlist describing vdev tree.
+ *
+ * Each leaf device label also contains the following:
+ *
+ * top_guid Unique ID for top-level vdev in which this is contained
+ * guid Unique ID for the leaf vdev
+ *
+ * The 'vs' configuration follows the format described in 'spa_config.c'.
+ */
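
The crash-recovery rule implied by the three-step update order described above reduces to two transaction-group comparisons. A small standalone sketch (illustrative only, not code from this commit):

/* Sketch of the "which label is valid" rule described in the comment above. */
#include <stdio.h>
#include <stdint.h>

static const char *
ex_valid_label(uint64_t l1_txg, uint64_t ub_txg, uint64_t l2_txg)
{
    if (l1_txg > ub_txg)
        return ("L2");      /* died after step 1: the uberblock still matches L2 */
    if (l2_txg < l1_txg)
        return ("L1");      /* died after step 2: L2 was never rewritten */
    return ("either");      /* stable state: all three carry the same txg */
}

int
main(void)
{
    printf("%s\n", ex_valid_label(11, 10, 10));     /* L2 */
    printf("%s\n", ex_valid_label(11, 11, 10));     /* L1 */
    printf("%s\n", ex_valid_label(10, 10, 10));     /* either */
    return (0);
}
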
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Basic routines to read and write from a vdev label.
+ * Used throughout the rest of this file.
+ */
+uint64_t
+vdev_label_offset(uint64_t psize, int l, uint64_t offset)
+{
+ ASSERT(offset < sizeof (vdev_label_t));
+
+ return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
+ 0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
+}
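+
+/*
+ * As a sketch of the layout this computes (assuming the usual 256KB
+ * vdev_label_t and VDEV_LABELS == 4): labels 0 and 1 sit at the front of
+ * the device and labels 2 and 3 at the end, so
+ *
+ *	vdev_label_offset(psize, 0, off) == off
+ *	vdev_label_offset(psize, 1, off) == off + 256K
+ *	vdev_label_offset(psize, 2, off) == off + psize - 512K
+ *	vdev_label_offset(psize, 3, off) == off + psize - 256K
+ *
+ * keeping redundant copies of every label at both ends of the device.
+ */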
+
+static void
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private)
+{
+ ASSERT(vd->vdev_children == 0);
+
+ zio_nowait(zio_read_phys(zio, vd,
+ vdev_label_offset(vd->vdev_psize, l, offset),
+ size, buf, ZIO_CHECKSUM_LABEL, done, private,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
+}
+
+static void
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private)
+{
+ ASSERT(vd->vdev_children == 0);
+
+ zio_nowait(zio_write_phys(zio, vd,
+ vdev_label_offset(vd->vdev_psize, l, offset),
+ size, buf, ZIO_CHECKSUM_LABEL, done, private,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL));
+}
+
+/*
+ * Generate the nvlist representing this vdev's config.
+ */
+nvlist_t *
+vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
+ boolean_t isspare)
+{
+ nvlist_t *nv = NULL;
+
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
+ vd->vdev_ops->vdev_op_type) == 0);
+ if (!isspare)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
+ == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
+
+ if (vd->vdev_path != NULL)
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
+ vd->vdev_path) == 0);
+
+ if (vd->vdev_devid != NULL)
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
+ vd->vdev_devid) == 0);
+
+ if (vd->vdev_nparity != 0) {
+ ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
+ VDEV_TYPE_RAIDZ) == 0);
+
+ /*
+ * Make sure someone hasn't managed to sneak a fancy new vdev
+ * into a crufty old storage pool.
+ */
+ ASSERT(vd->vdev_nparity == 1 ||
+ (vd->vdev_nparity == 2 &&
+ spa_version(spa) >= ZFS_VERSION_RAID6));
+
+ /*
+ * Note that we'll add the nparity tag even on storage pools
+ * that only support a single parity device -- older software
+ * will just ignore it.
+ */
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY,
+ vd->vdev_nparity) == 0);
+ }
+
+ if (vd->vdev_wholedisk != -1ULL)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ vd->vdev_wholedisk) == 0);
+
+ if (vd->vdev_not_present)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
+
+ if (vd->vdev_isspare)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
+
+ if (!isspare && vd == vd->vdev_top) {
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ vd->vdev_ms_array) == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ vd->vdev_ms_shift) == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+ vd->vdev_ashift) == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ vd->vdev_asize) == 0);
+ }
+
+ if (vd->vdev_dtl.smo_object != 0)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+ vd->vdev_dtl.smo_object) == 0);
+
+ if (getstats) {
+ vdev_stat_t vs;
+ vdev_get_stats(vd, &vs);
+ VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ nvlist_t **child;
+ int c;
+
+ child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
+ KM_SLEEP);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ child[c] = vdev_config_generate(spa, vd->vdev_child[c],
+ getstats, isspare);
+
+ VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ child, vd->vdev_children) == 0);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ nvlist_free(child[c]);
+
+ kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
+
+ } else {
+ if (vd->vdev_offline && !vd->vdev_tmpoffline)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
+ B_TRUE) == 0);
+ else
+ (void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE,
+ DATA_TYPE_UINT64);
+ }
+
+ return (nv);
+}
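+
+/*
+ * A minimal sketch of the nvlist generated above for a hypothetical two-way
+ * mirror top-level vdev (all values illustrative):
+ *
+ *	type="mirror" id=0 guid=<top-level guid>
+ *	    metaslab_array=<object> metaslab_shift=<shift> ashift=9 asize=<bytes>
+ *	    children[0]: type="disk" id=0 guid=<leaf guid> path="/dev/..."
+ *	    children[1]: type="disk" id=1 guid=<leaf guid> path="/dev/..."
+ *
+ * Leaf-only entries such as ZPOOL_CONFIG_OFFLINE or ZPOOL_CONFIG_NOT_PRESENT
+ * appear only when the corresponding condition holds.
+ */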
+
+nvlist_t *
+vdev_label_read_config(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *config = NULL;
+ vdev_phys_t *vp;
+ zio_t *zio;
+ int l;
+
+ ASSERT(spa_config_held(spa, RW_READER));
+
+ if (vdev_is_dead(vd))
+ return (NULL);
+
+ vp = zio_buf_alloc(sizeof (vdev_phys_t));
+
+ for (l = 0; l < VDEV_LABELS; l++) {
+
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD);
+
+ vdev_label_read(zio, vd, l, vp,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t), NULL, NULL);
+
+ if (zio_wait(zio) == 0 &&
+ nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
+ &config, 0) == 0)
+ break;
+
+ if (config != NULL) {
+ nvlist_free(config);
+ config = NULL;
+ }
+ }
+
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+
+ return (config);
+}
+
+/*
+ * Determine if a device is in use. The 'spare_guid' parameter will be filled
+ * in with the device guid if this spare is active elsewhere on the system.
+ */
+static boolean_t
+vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
+ uint64_t *spare_guid)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t state, pool_guid, device_guid, txg, spare_pool;
+ uint64_t vdtxg = 0;
+ nvlist_t *label;
+
+ if (spare_guid)
+ *spare_guid = 0ULL;
+
+ /*
+ * Read the label, if any, and perform some basic sanity checks.
+ */
+ if ((label = vdev_label_read_config(vd)) == NULL)
+ return (B_FALSE);
+
+ (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ &vdtxg);
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0 ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+ &device_guid) != 0) {
+ nvlist_free(label);
+ return (B_FALSE);
+ }
+
+ if (state != POOL_STATE_SPARE &&
+ (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &pool_guid) != 0 ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &txg) != 0)) {
+ nvlist_free(label);
+ return (B_FALSE);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * Check to see if this device indeed belongs to the pool it claims to
+ * be a part of. The only way this is allowed is if the device is a hot
+ * spare (which we check for later on).
+ */
+ if (state != POOL_STATE_SPARE &&
+ !spa_guid_exists(pool_guid, device_guid) &&
+ !spa_spare_exists(device_guid, NULL))
+ return (B_FALSE);
+
+ /*
+ * If the transaction group is zero, then this is an initialized (but
+ * unused) label. This is only an error if the create transaction
+ * on-disk is the same as the one we're using now, in which case the
+ * user has attempted to add the same vdev multiple times in the same
+ * transaction.
+ */
+ if (state != POOL_STATE_SPARE && txg == 0 && vdtxg == crtxg)
+ return (B_TRUE);
+
+ /*
+ * Check to see if this is a spare device. We do an explicit check for
+ * spa_has_spare() here because it may be on our pending list of spares
+ * to add.
+ */
+ if (spa_spare_exists(device_guid, &spare_pool) ||
+ spa_has_spare(spa, device_guid)) {
+ if (spare_guid)
+ *spare_guid = device_guid;
+
+ switch (reason) {
+ case VDEV_LABEL_CREATE:
+ return (B_TRUE);
+
+ case VDEV_LABEL_REPLACE:
+ return (!spa_has_spare(spa, device_guid) ||
+ spare_pool != 0ULL);
+
+ case VDEV_LABEL_SPARE:
+ return (spa_has_spare(spa, device_guid));
+ }
+ }
+
+ /*
+ * If the device is marked ACTIVE, then this device is in use by another
+ * pool on the system.
+ */
+ return (state == POOL_STATE_ACTIVE);
+}
+
+/*
+ * Initialize a vdev label. We check to make sure each leaf device is not in
+ * use and is writable.  We put down an initial label which we will later
+ * overwrite with a complete label. Note that it's important to do this
+ * sequentially, not in parallel, so that we catch cases of multiple use of the
+ * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
+ * itself.
+ */
+int
+vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ vdev_boot_header_t *vb;
+ uberblock_t *ub;
+ zio_t *zio;
+ int l, c, n;
+ char *buf;
+ size_t buflen;
+ int error;
+ uint64_t spare_guid;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if ((error = vdev_label_init(vd->vdev_child[c],
+ crtxg, reason)) != 0)
+ return (error);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (0);
+
+ /*
+ * Dead vdevs cannot be initialized.
+ */
+ if (vdev_is_dead(vd))
+ return (EIO);
+
+ /*
+ * Determine if the vdev is in use.
+ */
+ if (reason != VDEV_LABEL_REMOVE &&
+ vdev_inuse(vd, crtxg, reason, &spare_guid))
+ return (EBUSY);
+
+ ASSERT(reason != VDEV_LABEL_REMOVE ||
+ vdev_inuse(vd, crtxg, reason, NULL));
+
+ /*
+ * If this is a request to add or replace a spare that is in use
+ * elsewhere on the system, then we must update the guid (which was
+ * initialized to a random value) to reflect the actual GUID (which is
+ * shared between multiple pools).
+ */
+ if (reason != VDEV_LABEL_REMOVE && spare_guid != 0ULL) {
+ vdev_t *pvd = vd->vdev_parent;
+
+ for (; pvd != NULL; pvd = pvd->vdev_parent) {
+ pvd->vdev_guid_sum -= vd->vdev_guid;
+ pvd->vdev_guid_sum += spare_guid;
+ }
+
+ vd->vdev_guid = vd->vdev_guid_sum = spare_guid;
+
+ /*
+ * If this is a replacement, then we want to fall through to the
+ * rest of the code. If we're adding a spare, then it's already
+ * labelled appropriately and we can just return.
+ */
+ if (reason == VDEV_LABEL_SPARE)
+ return (0);
+ ASSERT(reason == VDEV_LABEL_REPLACE);
+ }
+
+ /*
+ * Initialize its label.
+ */
+ vp = zio_buf_alloc(sizeof (vdev_phys_t));
+ bzero(vp, sizeof (vdev_phys_t));
+
+ /*
+ * Generate a label describing the pool and our top-level vdev.
+ * We mark it as being from txg 0 to indicate that it's not
+ * really part of an active pool just yet. The labels will
+ * be written again with a meaningful txg by spa_sync().
+ */
+ if (reason == VDEV_LABEL_SPARE ||
+ (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
+ /*
+ * For inactive hot spares, we generate a special label that
+ * identifies it as a mutually shared hot spare.  We write the
+ * label if we are adding a hot spare, or if we are removing an
+ * active hot spare (in which case we want to revert the
+ * labels).
+ */
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ POOL_STATE_SPARE) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ } else {
+ label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
+
+ /*
+ * Add our creation time. This allows us to detect multiple
+ * vdev uses as described above, and it automatically expires if we
+ * fail.
+ */
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ crtxg) == 0);
+ }
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
+ error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
+ if (error != 0) {
+ nvlist_free(label);
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+ /* EFAULT means nvlist_pack ran out of room */
+ return (error == EFAULT ? ENAMETOOLONG : EINVAL);
+ }
+
+ /*
+ * Initialize boot block header.
+ */
+ vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
+ bzero(vb, sizeof (vdev_boot_header_t));
+ vb->vb_magic = VDEV_BOOT_MAGIC;
+ vb->vb_version = VDEV_BOOT_VERSION;
+ vb->vb_offset = VDEV_BOOT_OFFSET;
+ vb->vb_size = VDEV_BOOT_SIZE;
+
+ /*
+ * Initialize uberblock template.
+ */
+ ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
+ bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
+ *ub = spa->spa_uberblock;
+ ub->ub_txg = 0;
+
+ /*
+ * Write everything in parallel.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ for (l = 0; l < VDEV_LABELS; l++) {
+
+ vdev_label_write(zio, vd, l, vp,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t), NULL, NULL);
+
+ vdev_label_write(zio, vd, l, vb,
+ offsetof(vdev_label_t, vl_boot_header),
+ sizeof (vdev_boot_header_t), NULL, NULL);
+
+ for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ vdev_label_write(zio, vd, l, ub,
+ VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd), NULL, NULL);
+ }
+ }
+
+ error = zio_wait(zio);
+
+ nvlist_free(label);
+ zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
+ zio_buf_free(vb, sizeof (vdev_boot_header_t));
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+
+ /*
+ * If this vdev hasn't been previously identified as a spare, then we
+ * mark it as such only if a) we are labelling it as a spare, or b) it
+ * exists as a spare elsewhere in the system.
+ */
+ if (error == 0 && !vd->vdev_isspare &&
+ (reason == VDEV_LABEL_SPARE ||
+ spa_spare_exists(vd->vdev_guid, NULL)))
+ spa_spare_add(vd);
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * uberblock load/sync
+ * ==========================================================================
+ */
+
+/*
+ * Consider the following situation: txg is safely synced to disk. We've
+ * written the first uberblock for txg + 1, and then we lose power. When we
+ * come back up, we fail to see the uberblock for txg + 1 because, say,
+ * it was on a mirrored device and the replica to which we wrote txg + 1
+ * is now offline. If we then make some changes and sync txg + 1, and then
+ * the missing replica comes back, then for a few seconds we'll have two
+ * conflicting uberblocks on disk with the same txg. The solution is simple:
+ * among uberblocks with equal txg, choose the one with the latest timestamp.
+ */
+static int
+vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
+{
+ if (ub1->ub_txg < ub2->ub_txg)
+ return (-1);
+ if (ub1->ub_txg > ub2->ub_txg)
+ return (1);
+
+ if (ub1->ub_timestamp < ub2->ub_timestamp)
+ return (-1);
+ if (ub1->ub_timestamp > ub2->ub_timestamp)
+ return (1);
+
+ return (0);
+}
+
+static void
+vdev_uberblock_load_done(zio_t *zio)
+{
+ uberblock_t *ub = zio->io_data;
+ uberblock_t *ubbest = zio->io_private;
+ spa_t *spa = zio->io_spa;
+
+ ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
+
+ if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
+ mutex_enter(&spa->spa_uberblock_lock);
+ if (vdev_uberblock_compare(ub, ubbest) > 0)
+ *ubbest = *ub;
+ mutex_exit(&spa->spa_uberblock_lock);
+ }
+
+ zio_buf_free(zio->io_data, zio->io_size);
+}
+
+void
+vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
+{
+ int l, c, n;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
+ for (l = 0; l < VDEV_LABELS; l++) {
+ for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ vdev_label_read(zio, vd, l,
+ zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
+ VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd),
+ vdev_uberblock_load_done, ubbest);
+ }
+ }
+}
+
+/*
+ * Write the uberblock to all labels of all leaves of the specified vdev.
+ * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
+ */
+static void
+vdev_uberblock_sync_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_root->io_private;
+
+ if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
+ atomic_add_64(good_writes, 1);
+}
+
+static void
+vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, uint64_t txg)
+{
+ int l, c, n;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_sync(zio, ub, vd->vdev_child[c], txg);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
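+ /*
+ * The uberblocks within each label form a power-of-two sized ring;
+ * masking the txg below selects this txg's slot, so the copy written
+ * for txg N survives until txg N + VDEV_UBERBLOCK_COUNT(vd) reuses
+ * the same slot.
+ */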
+ n = txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
+
+ ASSERT(ub->ub_txg == txg);
+
+ for (l = 0; l < VDEV_LABELS; l++)
+ vdev_label_write(zio, vd, l, ub,
+ VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd),
+ vdev_uberblock_sync_done, NULL);
+
+ dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg);
+}
+
+static int
+vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *vd, uint64_t txg)
+{
+ uberblock_t *ubbuf;
+ size_t size = vd->vdev_top ? VDEV_UBERBLOCK_SIZE(vd) : SPA_MAXBLOCKSIZE;
+ uint64_t *good_writes;
+ zio_t *zio;
+ int error;
+
+ ubbuf = zio_buf_alloc(size);
+ bzero(ubbuf, size);
+ *ubbuf = *ub;
+
+ good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+
+ zio = zio_root(spa, NULL, good_writes,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ vdev_uberblock_sync(zio, ubbuf, vd, txg);
+
+ error = zio_wait(zio);
+
+ if (error && *good_writes != 0) {
+ dprintf("partial success: good_writes = %llu\n", *good_writes);
+ error = 0;
+ }
+
+ /*
+ * It's possible to have no good writes and no error if every vdev is in
+ * the CANT_OPEN state.
+ */
+ if (*good_writes == 0 && error == 0)
+ error = EIO;
+
+ kmem_free(good_writes, sizeof (uint64_t));
+ zio_buf_free(ubbuf, size);
+
+ return (error);
+}
+
+/*
+ * Sync out an individual vdev.
+ */
+static void
+vdev_sync_label_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_root->io_private;
+
+ if (zio->io_error == 0)
+ atomic_add_64(good_writes, 1);
+}
+
+static void
+vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg)
+{
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ char *buf;
+ size_t buflen;
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_sync_label(zio, vd->vdev_child[c], l, txg);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
+ /*
+ * Generate a label describing the top-level config to which we belong.
+ */
+ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
+
+ vp = zio_buf_alloc(sizeof (vdev_phys_t));
+ bzero(vp, sizeof (vdev_phys_t));
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
+ if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0)
+ vdev_label_write(zio, vd, l, vp,
+ offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
+ vdev_sync_label_done, NULL);
+
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+ nvlist_free(label);
+
+ dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg);
+}
+
+static int
+vdev_sync_labels(vdev_t *vd, int l, uint64_t txg)
+{
+ uint64_t *good_writes;
+ zio_t *zio;
+ int error;
+
+ ASSERT(vd == vd->vdev_top);
+
+ good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+
+ zio = zio_root(vd->vdev_spa, NULL, good_writes,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ /*
+ * Recursively kick off writes to all labels.
+ */
+ vdev_sync_label(zio, vd, l, txg);
+
+ error = zio_wait(zio);
+
+ if (error && *good_writes != 0) {
+ dprintf("partial success: good_writes = %llu\n", *good_writes);
+ error = 0;
+ }
+
+ if (*good_writes == 0 && error == 0)
+ error = ENODEV;
+
+ kmem_free(good_writes, sizeof (uint64_t));
+
+ return (error);
+}
+
+/*
+ * Sync the entire vdev configuration.
+ *
+ * The order of operations is carefully crafted to ensure that
+ * if the system panics or loses power at any time, the state on disk
+ * is still transactionally consistent. The in-line comments below
+ * describe the failure semantics at each stage.
+ *
+ * Moreover, it is designed to be idempotent: if vdev_config_sync() fails
+ * at any time, you can just call it again, and it will resume its work.
+ */
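+/*
+ * In outline, the steps below are (a summary of the code, not additional
+ * behavior):
+ *
+ *	flush the write caches of every vdev written in this txg
+ *	write the even labels (L0, L2) of every dirty vdev, then flush
+ *	write the new uberblock to every label of every leaf in the
+ *	    specified tree, then flush
+ *	write the odd labels (L1, L3) of every dirty vdev, then flush
+ */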
+int
+vdev_config_sync(vdev_t *uvd, uint64_t txg)
+{
+ spa_t *spa = uvd->vdev_spa;
+ uberblock_t *ub = &spa->spa_uberblock;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd;
+ zio_t *zio;
+ int l, error;
+
+ ASSERT(ub->ub_txg <= txg);
+
+ /*
+ * If this isn't a resync due to I/O errors, and nothing changed
+ * in this transaction group, and the vdev configuration hasn't changed,
+ * then there's nothing to do.
+ */
+ if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE &&
+ list_is_empty(&spa->spa_dirty_list)) {
+ dprintf("nothing to sync in %s in txg %llu\n",
+ spa_name(spa), txg);
+ return (0);
+ }
+
+ if (txg > spa_freeze_txg(spa))
+ return (0);
+
+ ASSERT(txg <= spa->spa_final_txg);
+
+ dprintf("syncing %s txg %llu\n", spa_name(spa), txg);
+
+ /*
+ * Flush the write cache of every disk that's been written to
+ * in this transaction group. This ensures that all blocks
+ * written in this txg will be committed to stable storage
+ * before any uberblock that references them.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+ for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
+ vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) {
+ zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ }
+ (void) zio_wait(zio);
+
+ /*
+ * Sync out the even labels (L0, L2) for every dirty vdev. If the
+ * system dies in the middle of this process, that's OK: all of the
+ * even labels that made it to disk will be newer than any uberblock,
+ * and will therefore be considered invalid. The odd labels (L1, L3),
+ * which have not yet been touched, will still be valid.
+ */
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if (l & 1)
+ continue;
+ if ((error = vdev_sync_labels(vd, l, txg)) != 0)
+ return (error);
+ }
+ }
+
+ /*
+ * Flush the new labels to disk. This ensures that all even-label
+ * updates are committed to stable storage before the uberblock update.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ }
+ (void) zio_wait(zio);
+
+ /*
+ * Sync the uberblocks to all vdevs in the tree specified by uvd.
+ * If the system dies in the middle of this step, there are two cases
+ * to consider, and the on-disk state is consistent either way:
+ *
+ * (1) If none of the new uberblocks made it to disk, then the
+ * previous uberblock will be the newest, and the odd labels
+ * (which had not yet been touched) will be valid with respect
+ * to that uberblock.
+ *
+ * (2) If one or more new uberblocks made it to disk, then they
+ * will be the newest, and the even labels (which had all
+ * been successfully committed) will be valid with respect
+ * to the new uberblocks.
+ */
+ if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0)
+ return (error);
+
+ /*
+ * Flush the uberblocks to disk. This ensures that the odd labels
+ * are no longer needed (because the new uberblocks and the even
+ * labels are safely on disk), so it is safe to overwrite them.
+ */
+ (void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+
+ /*
+ * Sync out odd labels for every dirty vdev. If the system dies
+ * in the middle of this process, the even labels and the new
+ * uberblocks will suffice to open the pool. The next time
+ * the pool is opened, the first thing we'll do -- before any
+ * user data is modified -- is mark every vdev dirty so that
+ * all labels will be brought up to date.
+ */
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if ((l & 1) == 0)
+ continue;
+ if ((error = vdev_sync_labels(vd, l, txg)) != 0)
+ return (error);
+ }
+ }
+
+ /*
+ * Flush the new labels to disk. This ensures that all odd-label
+ * updates are committed to stable storage before the next
+ * transaction group begins.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ }
+ (void) zio_wait(zio);
+
+ return (0);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
new file mode 100644
index 000000000000..73d1a83d9436
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
@@ -0,0 +1,495 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for mirroring.
+ */
+
+typedef struct mirror_child {
+ vdev_t *mc_vd;
+ uint64_t mc_offset;
+ int mc_error;
+ short mc_tried;
+ short mc_skipped;
+} mirror_child_t;
+
+typedef struct mirror_map {
+ int mm_children;
+ int mm_replacing;
+ int mm_preferred;
+ int mm_root;
+ mirror_child_t mm_child[1];
+} mirror_map_t;
+
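+/*
+ * With the default shift of 21 below, the preferred child for an ordinary
+ * (non-replacing) read rotates across the mirror children in 2MB bands of
+ * device offset, spreading read load while keeping long runs sequential on
+ * each child.
+ */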
+int vdev_mirror_shift = 21;
+
+static mirror_map_t *
+vdev_mirror_map_alloc(zio_t *zio)
+{
+ mirror_map_t *mm = NULL;
+ mirror_child_t *mc;
+ vdev_t *vd = zio->io_vd;
+ int c, d;
+
+ if (vd == NULL) {
+ dva_t *dva = zio->io_bp->blk_dva;
+ spa_t *spa = zio->io_spa;
+
+ c = BP_GET_NDVAS(zio->io_bp);
+
+ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+ mm->mm_children = c;
+ mm->mm_replacing = B_FALSE;
+ mm->mm_preferred = spa_get_random(c);
+ mm->mm_root = B_TRUE;
+
+ /*
+ * Check the other, lower-index DVAs to see if they're on
+ * the same vdev as the child we picked. If they are, use
+ * them since they are likely to have been allocated from
+ * the primary metaslab in use at the time, and hence are
+ * more likely to have locality with single-copy data.
+ */
+ for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
+ if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
+ mm->mm_preferred = d;
+ }
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+
+ mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
+ mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
+ }
+ } else {
+ c = vd->vdev_children;
+
+ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+ mm->mm_children = c;
+ mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+ mm->mm_preferred = mm->mm_replacing ? 0 :
+ (zio->io_offset >> vdev_mirror_shift) % c;
+ mm->mm_root = B_FALSE;
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ mc->mc_vd = vd->vdev_child[c];
+ mc->mc_offset = zio->io_offset;
+ }
+ }
+
+ zio->io_vsd = mm;
+ return (mm);
+}
+
+static void
+vdev_mirror_map_free(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+
+ kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+ zio->io_vsd = NULL;
+}
+
+static int
+vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+ vdev_t *cvd;
+ uint64_t c;
+ int numerrors = 0;
+ int ret, lasterror = 0;
+
+ if (vd->vdev_children == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ cvd = vd->vdev_child[c];
+
+ if ((ret = vdev_open(cvd)) != 0) {
+ lasterror = ret;
+ numerrors++;
+ continue;
+ }
+
+ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ *ashift = MAX(*ashift, cvd->vdev_ashift);
+ }
+
+ if (numerrors == vd->vdev_children) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ return (0);
+}
+
+static void
+vdev_mirror_close(vdev_t *vd)
+{
+ uint64_t c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+static void
+vdev_mirror_child_done(zio_t *zio)
+{
+ mirror_child_t *mc = zio->io_private;
+
+ mc->mc_error = zio->io_error;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 0;
+}
+
+static void
+vdev_mirror_scrub_done(zio_t *zio)
+{
+ mirror_child_t *mc = zio->io_private;
+
+ if (zio->io_error == 0) {
+ zio_t *pio = zio->io_parent;
+ mutex_enter(&pio->io_lock);
+ ASSERT3U(zio->io_size, >=, pio->io_size);
+ bcopy(zio->io_data, pio->io_data, pio->io_size);
+ mutex_exit(&pio->io_lock);
+ }
+
+ zio_buf_free(zio->io_data, zio->io_size);
+
+ mc->mc_error = zio->io_error;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 0;
+}
+
+static void
+vdev_mirror_repair_done(zio_t *zio)
+{
+ ASSERT(zio->io_private == zio->io_parent);
+ vdev_mirror_map_free(zio->io_private);
+}
+
+/*
+ * Try to find a child whose DTL doesn't contain the block we want to read.
+ * If we can't, try the read on any vdev we haven't already tried.
+ */
+static int
+vdev_mirror_child_select(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ mirror_child_t *mc;
+ uint64_t txg = zio->io_txg;
+ int i, c;
+
+ ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
+
+ /*
+ * Try to find a child whose DTL doesn't contain the block to read.
+ * If a child is known to be completely inaccessible (indicated by
+ * vdev_is_dead() returning B_TRUE), don't even try.
+ */
+ for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
+ if (c >= mm->mm_children)
+ c = 0;
+ mc = &mm->mm_child[c];
+ if (mc->mc_tried || mc->mc_skipped)
+ continue;
+ if (vdev_is_dead(mc->mc_vd)) {
+ mc->mc_error = ENXIO;
+ mc->mc_tried = 1; /* don't even try */
+ mc->mc_skipped = 1;
+ continue;
+ }
+ if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1))
+ return (c);
+ mc->mc_error = ESTALE;
+ mc->mc_skipped = 1;
+ }
+
+ /*
+ * Every device is either missing or has this txg in its DTL.
+ * Look for any child we haven't already tried before giving up.
+ */
+ for (c = 0; c < mm->mm_children; c++)
+ if (!mm->mm_child[c].mc_tried)
+ return (c);
+
+ /*
+ * Every child failed. There's no place left to look.
+ */
+ return (-1);
+}
+
+static void
+vdev_mirror_io_start(zio_t *zio)
+{
+ mirror_map_t *mm;
+ mirror_child_t *mc;
+ int c, children;
+
+ mm = vdev_mirror_map_alloc(zio);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
+ /*
+ * For scrubbing reads we need to allocate a read
+ * buffer for each child and issue reads to all
+ * children. If any child succeeds, it will copy its
+ * data into zio->io_data in vdev_mirror_scrub_done.
+ */
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset,
+ zio_buf_alloc(zio->io_size), zio->io_size,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_CANFAIL,
+ vdev_mirror_scrub_done, mc));
+ }
+ zio_wait_children_done(zio);
+ return;
+ }
+ /*
+ * For normal reads just pick one child.
+ */
+ c = vdev_mirror_child_select(zio);
+ children = (c >= 0);
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+ /*
+ * If this is a resilvering I/O to a replacing vdev,
+ * only the last child should be written -- unless the
+ * first child happens to have a DTL entry here as well.
+ * All other writes go to all children.
+ */
+ if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing &&
+ !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map,
+ zio->io_txg, 1)) {
+ c = mm->mm_children - 1;
+ children = 1;
+ } else {
+ c = 0;
+ children = mm->mm_children;
+ }
+ }
+
+ while (children--) {
+ mc = &mm->mm_child[c];
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset,
+ zio->io_data, zio->io_size, zio->io_type, zio->io_priority,
+ ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc));
+ c++;
+ }
+
+ zio_wait_children_done(zio);
+}
+
+static void
+vdev_mirror_io_done(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ mirror_child_t *mc;
+ int c;
+ int good_copies = 0;
+ int unexpected_errors = 0;
+
+ zio->io_error = 0;
+ zio->io_numerrors = 0;
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_tried && mc->mc_error == 0) {
+ good_copies++;
+ continue;
+ }
+
+ /*
+ * We preserve any EIOs because those may be worth retrying;
+ * whereas ECKSUM and ENXIO are more likely to be persistent.
+ */
+ if (mc->mc_error) {
+ if (zio->io_error != EIO)
+ zio->io_error = mc->mc_error;
+ if (!mc->mc_skipped)
+ unexpected_errors++;
+ zio->io_numerrors++;
+ }
+ }
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * XXX -- for now, treat partial writes as success.
+ * XXX -- For a replacing vdev, we need to make sure the
+ * new child succeeds.
+ */
+ /* XXPOLICY */
+ if (good_copies != 0)
+ zio->io_error = 0;
+ vdev_mirror_map_free(zio);
+ zio_next_stage(zio);
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ /*
+ * If we don't have a good copy yet, keep trying other children.
+ */
+ /* XXPOLICY */
+ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
+ ASSERT(c >= 0 && c < mm->mm_children);
+ mc = &mm->mm_child[c];
+ dprintf("retrying i/o (err=%d) on child %s\n",
+ zio->io_error, vdev_description(mc->mc_vd));
+ zio->io_error = 0;
+ zio_vdev_io_redone(zio);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
+ ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_mirror_child_done, mc));
+ zio_wait_children_done(zio);
+ return;
+ }
+
+ /* XXPOLICY */
+ if (good_copies)
+ zio->io_error = 0;
+ else
+ ASSERT(zio->io_error != 0);
+
+ if (good_copies && (spa_mode & FWRITE) &&
+ (unexpected_errors ||
+ (zio->io_flags & ZIO_FLAG_RESILVER) ||
+ ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
+ zio_t *rio;
+
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ *
+ * We issue all repair I/Os as children of 'rio' to arrange
+ * that vdev_mirror_map_free(zio) will be invoked after all
+ * repairs complete, but before we advance to the next stage.
+ */
+ rio = zio_null(zio, zio->io_spa,
+ vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL);
+
+ for (c = 0; c < mm->mm_children; c++) {
+ /*
+ * Don't rewrite known good children.
+ * Not only is it unnecessary, it could
+ * actually be harmful: if the system lost
+ * power while rewriting the only good copy,
+ * there would be no good copies left!
+ */
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_error == 0) {
+ if (mc->mc_tried)
+ continue;
+ if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+ !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
+ zio->io_txg, 1))
+ continue;
+ mc->mc_error = ESTALE;
+ }
+
+ dprintf("resilvered %s @ 0x%llx error %d\n",
+ vdev_description(mc->mc_vd), mc->mc_offset,
+ mc->mc_error);
+
+ zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd,
+ mc->mc_offset, zio->io_data, zio->io_size,
+ ZIO_TYPE_WRITE, zio->io_priority,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
+ }
+
+ zio_nowait(rio);
+ zio_wait_children_done(zio);
+ return;
+ }
+
+ vdev_mirror_map_free(zio);
+ zio_next_stage(zio);
+}
+
+static void
+vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (faulted == vd->vdev_children)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_mirror_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ VDEV_TYPE_MIRROR, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
+
+vdev_ops_t vdev_replacing_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ VDEV_TYPE_REPLACING, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
+
+vdev_ops_t vdev_spare_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ VDEV_TYPE_SPARE, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
new file mode 100644
index 000000000000..b35f4a5bcd03
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * The 'missing' vdev is a special vdev type used only during import. It
+ * signifies a placeholder in the root vdev for some vdev that we know is
+ * missing. We pass it down to the kernel to allow the rest of the
+ * configuration to be parsed and an attempt made to open all available devices.
+ * Because its GUID is always 0, we know that the guid sum will mismatch and we
+ * won't be able to open the pool anyway.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+/* ARGSUSED */
+static int
+vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ /*
+ * Really this should just fail. But then the root vdev will be in the
+ * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is
+ * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
+ * will fail the GUID sum check before ever trying to open the pool.
+ */
+ *psize = SPA_MINDEVSIZE;
+ *ashift = SPA_MINBLOCKSHIFT;
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_close(vdev_t *vd)
+{
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_io_start(zio_t *zio)
+{
+ zio->io_error = ENOTSUP;
+ zio_next_stage_async(zio);
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_io_done(zio_t *zio)
+{
+ zio_next_stage(zio);
+}
+
+vdev_ops_t vdev_missing_ops = {
+ vdev_missing_open,
+ vdev_missing_close,
+ vdev_default_asize,
+ vdev_missing_io_start,
+ vdev_missing_io_done,
+ NULL,
+ VDEV_TYPE_MISSING, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
new file mode 100644
index 000000000000..7e99c1fd5b82
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -0,0 +1,323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+/*
+ * These tunables are for performance analysis.
+ */
+/*
+ * zfs_vdev_max_pending is the maximum number of i/os concurrently
+ * pending to each device. zfs_vdev_min_pending is the initial number
+ * of i/os pending to each device (before it starts ramping up to
+ * max_pending).
+ */
+int zfs_vdev_max_pending = 35;
+int zfs_vdev_min_pending = 4;
+
+/* deadline = pri + (lbolt >> time_shift) */
+int zfs_vdev_time_shift = 6;
+
+/* exponential I/O issue ramp-up rate */
+int zfs_vdev_ramp_rate = 2;
+
+/*
+ * i/os will be aggregated into a single large i/o up to
+ * zfs_vdev_aggregation_limit bytes long.
+ */
+int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
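+
+/*
+ * To sketch how these tunables interact (illustrative numbers only): an I/O
+ * queued at timestamp t with priority 4 gets deadline (t >> 6) + 4, so old
+ * low-priority work eventually sorts ahead of newer high-priority work;
+ * within equal deadlines I/Os issue in offset order, and adjacent queued
+ * I/Os are merged into a single I/O of up to zfs_vdev_aggregation_limit
+ * bytes.
+ */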
+
+/*
+ * Virtual device vector for disk I/O scheduling.
+ */
+int
+vdev_queue_deadline_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_deadline < z2->io_deadline)
+ return (-1);
+ if (z1->io_deadline > z2->io_deadline)
+ return (1);
+
+ if (z1->io_offset < z2->io_offset)
+ return (-1);
+ if (z1->io_offset > z2->io_offset)
+ return (1);
+
+ if (z1 < z2)
+ return (-1);
+ if (z1 > z2)
+ return (1);
+
+ return (0);
+}
+
+int
+vdev_queue_offset_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_offset < z2->io_offset)
+ return (-1);
+ if (z1->io_offset > z2->io_offset)
+ return (1);
+
+ if (z1 < z2)
+ return (-1);
+ if (z1 > z2)
+ return (1);
+
+ return (0);
+}
+
+void
+vdev_queue_init(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
+ sizeof (zio_t), offsetof(struct zio, io_deadline_node));
+
+ avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+ avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+ avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_offset_node));
+}
+
+void
+vdev_queue_fini(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ avl_destroy(&vq->vq_deadline_tree);
+ avl_destroy(&vq->vq_read_tree);
+ avl_destroy(&vq->vq_write_tree);
+ avl_destroy(&vq->vq_pending_tree);
+
+ mutex_destroy(&vq->vq_lock);
+}
+
+static void
+vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
+{
+ avl_add(&vq->vq_deadline_tree, zio);
+ avl_add(zio->io_vdev_tree, zio);
+}
+
+static void
+vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ avl_remove(&vq->vq_deadline_tree, zio);
+ avl_remove(zio->io_vdev_tree, zio);
+}
+
+static void
+vdev_queue_agg_io_done(zio_t *aio)
+{
+ zio_t *dio;
+ uint64_t offset = 0;
+
+ while ((dio = aio->io_delegate_list) != NULL) {
+ if (aio->io_type == ZIO_TYPE_READ)
+ bcopy((char *)aio->io_data + offset, dio->io_data,
+ dio->io_size);
+ offset += dio->io_size;
+ aio->io_delegate_list = dio->io_delegate_next;
+ dio->io_delegate_next = NULL;
+ dio->io_error = aio->io_error;
+ zio_next_stage(dio);
+ }
+ ASSERT3U(offset, ==, aio->io_size);
+
+ zio_buf_free(aio->io_data, aio->io_size);
+}
+
+#define IS_ADJACENT(io, nio) \
+ ((io)->io_offset + (io)->io_size == (nio)->io_offset)
+
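+/*
+ * Aggregation example (offsets and sizes illustrative): three queued 16KB
+ * writes at offsets 0x0000, 0x4000 and 0x8000 satisfy IS_ADJACENT pairwise,
+ * so vdev_queue_io_to_issue() below can issue them as one 48KB write; the
+ * delegated child I/Os are bypassed and their pipelines resumed from
+ * vdev_queue_agg_io_done() once the aggregate completes.
+ */
+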
+typedef void zio_issue_func_t(zio_t *);
+
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
+ zio_issue_func_t **funcp)
+{
+ zio_t *fio, *lio, *aio, *dio;
+ avl_tree_t *tree;
+ uint64_t size;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ *funcp = NULL;
+
+ if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
+ avl_numnodes(&vq->vq_deadline_tree) == 0)
+ return (NULL);
+
+ fio = lio = avl_first(&vq->vq_deadline_tree);
+
+ tree = fio->io_vdev_tree;
+ size = fio->io_size;
+
+ while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
+ size + dio->io_size <= zfs_vdev_aggregation_limit) {
+ dio->io_delegate_next = fio;
+ fio = dio;
+ size += dio->io_size;
+ }
+
+ while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
+ size + dio->io_size <= zfs_vdev_aggregation_limit) {
+ lio->io_delegate_next = dio;
+ lio = dio;
+ size += dio->io_size;
+ }
+
+ if (fio != lio) {
+ char *buf = zio_buf_alloc(size);
+ uint64_t offset = 0;
+ int nagg = 0;
+
+ ASSERT(size <= zfs_vdev_aggregation_limit);
+
+ aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
+ fio->io_offset, buf, size, fio->io_type,
+ ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_NOBOOKMARK,
+ vdev_queue_agg_io_done, NULL);
+
+ aio->io_delegate_list = fio;
+
+ for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
+ ASSERT(dio->io_type == aio->io_type);
+ ASSERT(dio->io_vdev_tree == tree);
+ if (dio->io_type == ZIO_TYPE_WRITE)
+ bcopy(dio->io_data, buf + offset, dio->io_size);
+ offset += dio->io_size;
+ vdev_queue_io_remove(vq, dio);
+ zio_vdev_io_bypass(dio);
+ nagg++;
+ }
+
+ ASSERT(offset == size);
+
+ dprintf("%5s T=%llu off=%8llx agg=%3d "
+ "old=%5llx new=%5llx\n",
+ zio_type_name[fio->io_type],
+ fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);
+
+ avl_add(&vq->vq_pending_tree, aio);
+
+ *funcp = zio_nowait;
+ return (aio);
+ }
+
+ ASSERT(fio->io_vdev_tree == tree);
+ vdev_queue_io_remove(vq, fio);
+
+ avl_add(&vq->vq_pending_tree, fio);
+
+ *funcp = zio_next_stage;
+
+ return (fio);
+}
+
+zio_t *
+vdev_queue_io(zio_t *zio)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
+ zio_issue_func_t *func;
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
+ return (zio);
+
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ zio->io_vdev_tree = &vq->vq_read_tree;
+ else
+ zio->io_vdev_tree = &vq->vq_write_tree;
+
+ mutex_enter(&vq->vq_lock);
+
+ zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
+ zio->io_priority;
+
+ vdev_queue_io_add(vq, zio);
+
+ nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func);
+
+ mutex_exit(&vq->vq_lock);
+
+ if (nio == NULL || func != zio_nowait)
+ return (nio);
+
+ func(nio);
+ return (NULL);
+}
+
+void
+vdev_queue_io_done(zio_t *zio)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
+ zio_issue_func_t *func;
+ int i;
+
+ mutex_enter(&vq->vq_lock);
+
+ avl_remove(&vq->vq_pending_tree, zio);
+
+ for (i = 0; i < zfs_vdev_ramp_rate; i++) {
+ nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func);
+ if (nio == NULL)
+ break;
+ mutex_exit(&vq->vq_lock);
+ if (func == zio_next_stage)
+ zio_vdev_io_reissue(nio);
+ func(nio);
+ mutex_enter(&vq->vq_lock);
+ }
+
+ mutex_exit(&vq->vq_lock);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
new file mode 100644
index 000000000000..08df7e09ba08
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -0,0 +1,1223 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+
+/*
+ * Virtual device vector for RAID-Z.
+ *
+ * This vdev supports both single and double parity. For single parity, we
+ * use a simple XOR of all the data columns. For double parity, we use both
+ * the simple XOR as well as a technique described in "The mathematics of
+ * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
+ * over the integers expressible in a single byte.  Briefly, the operations on
+ * the field are defined as follows:
+ *
+ * o addition (+) is represented by a bitwise XOR
+ * o subtraction (-) is therefore identical to addition: A + B = A - B
+ * o multiplication of A by 2 is defined by the following bitwise expression:
+ * (A * 2)_7 = A_6
+ * (A * 2)_6 = A_5
+ * (A * 2)_5 = A_4
+ * (A * 2)_4 = A_3 + A_7
+ * (A * 2)_3 = A_2 + A_7
+ * (A * 2)_2 = A_1 + A_7
+ * (A * 2)_1 = A_0
+ * (A * 2)_0 = A_7
+ *
+ * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ *
+ * Observe that any number in the field (except for 0) can be expressed as a
+ * power of 2 -- a generator for the field. We store a table of the powers of
+ * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
+ * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
+ * than field addition). The inverse of a field element A (A^-1) is A^254.
+ *
+ * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
+ * can be expressed by field operations:
+ *
+ * P = D_0 + D_1 + ... + D_n-2 + D_n-1
+ * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
+ * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ *
+ * See the reconstruction code below for how P and Q can be used individually or
+ * in concert to recover missing data columns.
+ */
+
+typedef struct raidz_col {
+ uint64_t rc_devidx; /* child device index for I/O */
+ uint64_t rc_offset; /* device offset */
+ uint64_t rc_size; /* I/O size */
+ void *rc_data; /* I/O data */
+ int rc_error; /* I/O error for this device */
+ uint8_t rc_tried; /* Did we attempt this I/O column? */
+ uint8_t rc_skipped; /* Did we skip this I/O column? */
+} raidz_col_t;
+
+typedef struct raidz_map {
+ uint64_t rm_cols; /* Column count */
+ uint64_t rm_bigcols; /* Number of oversized columns */
+ uint64_t rm_asize; /* Actual total I/O size */
+ uint64_t rm_missingdata; /* Count of missing data devices */
+ uint64_t rm_missingparity; /* Count of missing parity devices */
+ uint64_t rm_firstdatacol; /* First data column/parity count */
+ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
+} raidz_map_t;
+
+#define VDEV_RAIDZ_P 0
+#define VDEV_RAIDZ_Q 1
+
+#define VDEV_RAIDZ_MAXPARITY 2
+
+#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
+
+/*
+ * These two tables represent powers and logs of 2 in the Galois field defined
+ * above. These values were computed by repeatedly multiplying by 2 as above.
+ */
+static const uint8_t vdev_raidz_pow2[256] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
+ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
+ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
+ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
+ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
+ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
+ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
+ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
+ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
+ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
+ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
+ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
+ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
+ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
+ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
+ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
+ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
+ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
+ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
+ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
+ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
+ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
+ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
+ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
+ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
+ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
+ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
+ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
+ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
+ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
+ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
+};
+static const uint8_t vdev_raidz_log2[256] = {
+ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
+ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
+ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
+ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
+ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
+ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
+ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
+ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
+ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
+ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
+ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
+ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
+ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
+ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
+ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
+ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
+ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
+ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
+ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
+ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
+ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
+ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
+ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
+ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
+ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
+ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
+ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
+ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
+ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
+ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
+ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
+ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
+};
+
+/*
+ * Multiply a given number by 2 raised to the given power.
+ */
+static uint8_t
+vdev_raidz_exp2(uint_t a, int exp)
+{
+ if (a == 0)
+ return (0);
+
+ ASSERT(exp >= 0);
+ ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
+
+ exp += vdev_raidz_log2[a];
+ if (exp > 255)
+ exp -= 255;
+
+ return (vdev_raidz_pow2[exp]);
+}
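+
+/*
+ * A hypothetical general multiply over this field, using the same two
+ * tables (sketch only; assumes both operands are non-zero):
+ *
+ *	gf_mul(a, b) = vdev_raidz_pow2[(vdev_raidz_log2[a] +
+ *	    vdev_raidz_log2[b]) % 255]
+ *
+ * vdev_raidz_exp2() above is this computation specialized to b = 2^exp,
+ * which is the form the parity and reconstruction code below relies on.
+ */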
+
+static raidz_map_t *
+vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
+ uint64_t nparity)
+{
+ raidz_map_t *rm;
+ uint64_t b = zio->io_offset >> unit_shift;
+ uint64_t s = zio->io_size >> unit_shift;
+ uint64_t f = b % dcols;
+ uint64_t o = (b / dcols) << unit_shift;
+ uint64_t q, r, c, bc, col, acols, coff, devidx;
+
+ q = s / (dcols - nparity);
+ r = s - q * (dcols - nparity);
+ bc = (r == 0 ? 0 : r + nparity);
+
+ acols = (q == 0 ? bc : dcols);
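+
+ /*
+ * Geometry sketch (illustrative values): for a 5-wide single-parity
+ * group with 512-byte units (unit_shift 9), a 3-sector write gives
+ * s = 3, q = 0, r = 3, hence bc = 4 and only acols = 4 columns are
+ * touched: one parity sector plus three single-sector data columns.
+ * An 8-sector write gives q = 2, r = 0, so all 5 columns are used,
+ * each two sectors long.
+ */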
+
+ rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+
+ rm->rm_cols = acols;
+ rm->rm_bigcols = bc;
+ rm->rm_asize = 0;
+ rm->rm_missingdata = 0;
+ rm->rm_missingparity = 0;
+ rm->rm_firstdatacol = nparity;
+
+ for (c = 0; c < acols; c++) {
+ col = f + c;
+ coff = o;
+ if (col >= dcols) {
+ col -= dcols;
+ coff += 1ULL << unit_shift;
+ }
+ rm->rm_col[c].rc_devidx = col;
+ rm->rm_col[c].rc_offset = coff;
+ rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
+ rm->rm_col[c].rc_data = NULL;
+ rm->rm_col[c].rc_error = 0;
+ rm->rm_col[c].rc_tried = 0;
+ rm->rm_col[c].rc_skipped = 0;
+ rm->rm_asize += rm->rm_col[c].rc_size;
+ }
+
+ rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
+
+ for (c = 0; c < rm->rm_firstdatacol; c++)
+ rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+
+ rm->rm_col[c].rc_data = zio->io_data;
+
+ for (c = c + 1; c < acols; c++)
+ rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
+ rm->rm_col[c - 1].rc_size;
+
+ /*
+ * If all data stored spans all columns, there's a danger that parity
+ * will always be on the same device and, since parity isn't read
+ * during normal operation, that that device's I/O bandwidth won't be
+ * used effectively. We therefore switch the parity every 1MB.
+ *
+ * ... at least that was, ostensibly, the theory. As a practical
+ * matter unless we juggle the parity between all devices evenly, we
+ * won't see any benefit. Further, occasional writes that aren't a
+ * multiple of the LCM of the number of children and the minimum
+ * stripe width are sufficient to avoid pessimal behavior.
+ * Unfortunately, this decision created an implicit on-disk format
+ * requirement that we need to support for all eternity, but only
+ * for single-parity RAID-Z.
+ */
+ ASSERT(rm->rm_cols >= 2);
+ ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
+
+ if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+ devidx = rm->rm_col[0].rc_devidx;
+ o = rm->rm_col[0].rc_offset;
+ rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
+ rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
+ rm->rm_col[1].rc_devidx = devidx;
+ rm->rm_col[1].rc_offset = o;
+ }
+
+ zio->io_vsd = rm;
+ return (rm);
+}
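As a purely illustrative example of the geometry computed above: a 3KB write to
a 5-disk raidz1 with 512-byte sectors gives unit_shift = 9, dcols = 5,
nparity = 1 and s = 6, so q = 6 / 4 = 1, r = 2, bc = 3 and acols = 5. Columns
0 through 2 each get q + 1 = 2 sectors and columns 3 and 4 get 1 sector;
column 0 holds the parity and columns 1 through 4 hold the six data sectors,
for an rm_asize of 4KB once rounded up to a multiple of (nparity + 1) sectors.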
+
+static void
+vdev_raidz_map_free(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+ int c;
+
+ for (c = 0; c < rm->rm_firstdatacol; c++)
+ zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
+
+ kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
+ zio->io_vsd = NULL;
+}
+
+static void
+vdev_raidz_generate_parity_p(raidz_map_t *rm)
+{
+ uint64_t *p, *src, pcount, ccount, i;
+ int c;
+
+ pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ if (c == rm->rm_firstdatacol) {
+ ASSERT(ccount == pcount);
+ for (i = 0; i < ccount; i++, p++, src++) {
+ *p = *src;
+ }
+ } else {
+ ASSERT(ccount <= pcount);
+ for (i = 0; i < ccount; i++, p++, src++) {
+ *p ^= *src;
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_generate_parity_pq(raidz_map_t *rm)
+{
+ uint64_t *q, *p, *src, pcount, ccount, mask, i;
+ int c;
+
+ pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ if (c == rm->rm_firstdatacol) {
+ ASSERT(ccount == pcount || ccount == 0);
+ for (i = 0; i < ccount; i++, p++, q++, src++) {
+ *q = *src;
+ *p = *src;
+ }
+ for (; i < pcount; i++, p++, q++, src++) {
+ *q = 0;
+ *p = 0;
+ }
+ } else {
+ ASSERT(ccount <= pcount);
+
+ /*
+ * Rather than multiplying each byte individually (as
+ * described above), we are able to handle 8 at once
+ * by generating a mask based on the high bit in each
+ * byte and using that to conditionally XOR in 0x1d.
+ */
+ for (i = 0; i < ccount; i++, p++, q++, src++) {
+ mask = *q & 0x8080808080808080ULL;
+ mask = (mask << 1) - (mask >> 7);
+ *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
+ (mask & 0x1d1d1d1d1d1d1d1dULL);
+ *q ^= *src;
+ *p ^= *src;
+ }
+
+ /*
+ * Treat short columns as though they are full of 0s.
+ */
+ for (; i < pcount; i++, q++) {
+ mask = *q & 0x8080808080808080ULL;
+ mask = (mask << 1) - (mask >> 7);
+ *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
+ (mask & 0x1d1d1d1d1d1d1d1dULL);
+ }
+ }
+ }
+}
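The masked shift in both loops above is the whole of the "8 bytes at once"
multiply-by-2; pulled out on its own it would look roughly like this
(gf_mul2_64() is a hypothetical name used only for illustration):

static uint64_t
gf_mul2_64(uint64_t q)
{
	uint64_t mask;

	/* the high bit of each byte marks the bytes that will overflow */
	mask = q & 0x8080808080808080ULL;
	/* expand every marked 0x80 byte into 0xff, leaving the rest 0x00 */
	mask = (mask << 1) - (mask >> 7);
	/* shift each byte left by one, then reduce the overflowed bytes by 0x1d */
	return (((q << 1) & 0xfefefefefefefefeULL) ^
	    (mask & 0x1d1d1d1d1d1d1d1dULL));
}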
+
+static void
+vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
+{
+ uint64_t *dst, *src, xcount, ccount, count, i;
+ int c;
+
+ xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
+ ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
+ ASSERT(xcount > 0);
+
+ src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ dst = rm->rm_col[x].rc_data;
+ for (i = 0; i < xcount; i++, dst++, src++) {
+ *dst = *src;
+ }
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ dst = rm->rm_col[x].rc_data;
+
+ if (c == x)
+ continue;
+
+ ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+ count = MIN(ccount, xcount);
+
+ for (i = 0; i < count; i++, dst++, src++) {
+ *dst ^= *src;
+ }
+ }
+}
+
+static void
+vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
+{
+ uint64_t *dst, *src, xcount, ccount, count, mask, i;
+ uint8_t *b;
+ int c, j, exp;
+
+ xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
+ ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ dst = rm->rm_col[x].rc_data;
+
+ if (c == x)
+ ccount = 0;
+ else
+ ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ count = MIN(ccount, xcount);
+
+ if (c == rm->rm_firstdatacol) {
+ for (i = 0; i < count; i++, dst++, src++) {
+ *dst = *src;
+ }
+ for (; i < xcount; i++, dst++) {
+ *dst = 0;
+ }
+
+ } else {
+ /*
+ * For an explanation of this, see the comment in
+ * vdev_raidz_generate_parity_pq() above.
+ */
+ for (i = 0; i < count; i++, dst++, src++) {
+ mask = *dst & 0x8080808080808080ULL;
+ mask = (mask << 1) - (mask >> 7);
+ *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
+ (mask & 0x1d1d1d1d1d1d1d1dULL);
+ *dst ^= *src;
+ }
+
+ for (; i < xcount; i++, dst++) {
+ mask = *dst & 0x8080808080808080ULL;
+ mask = (mask << 1) - (mask >> 7);
+ *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
+ (mask & 0x1d1d1d1d1d1d1d1dULL);
+ }
+ }
+ }
+
+ src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ dst = rm->rm_col[x].rc_data;
+ exp = 255 - (rm->rm_cols - 1 - x);
+
+ for (i = 0; i < xcount; i++, dst++, src++) {
+ *dst ^= *src;
+ for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
+ *b = vdev_raidz_exp2(*b, exp);
+ }
+ }
+}
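The loops above recompute Q as though column x were all zeros, and the final
loop XORs in the stored Q, leaving 2^(rm_cols - 1 - x) * D_x in the buffer;
multiplying every byte by 2^(255 - (rm_cols - 1 - x)) then strips that
coefficient. Since 2^255 = 1 in GF(2^8), that multiplication is division by
2^(rm_cols - 1 - x): for example, if the missing column sits three positions
before the last one, each byte is multiplied by 2^252, because
2^3 * 2^252 = 2^255 = 1.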
+
+static void
+vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
+{
+ uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
+ void *pdata, *qdata;
+ uint64_t xsize, ysize, i;
+
+ ASSERT(x < y);
+ ASSERT(x >= rm->rm_firstdatacol);
+ ASSERT(y < rm->rm_cols);
+
+ ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
+
+ /*
+ * Move the parity data aside -- we're going to compute parity as
+ * though columns x and y were full of zeros -- Pxy and Qxy. We want to
+ * reuse the parity generation mechanism without trashing the actual
+ * parity so we make those columns appear to be full of zeros by
+ * setting their lengths to zero.
+ */
+ pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ xsize = rm->rm_col[x].rc_size;
+ ysize = rm->rm_col[y].rc_size;
+
+ rm->rm_col[VDEV_RAIDZ_P].rc_data =
+ zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
+ rm->rm_col[VDEV_RAIDZ_Q].rc_data =
+ zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ rm->rm_col[x].rc_size = 0;
+ rm->rm_col[y].rc_size = 0;
+
+ vdev_raidz_generate_parity_pq(rm);
+
+ rm->rm_col[x].rc_size = xsize;
+ rm->rm_col[y].rc_size = ysize;
+
+ p = pdata;
+ q = qdata;
+ pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ xd = rm->rm_col[x].rc_data;
+ yd = rm->rm_col[y].rc_data;
+
+ /*
+ * We now have:
+ * Pxy = P + D_x + D_y
+ * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
+ *
+ * We can then solve for D_x:
+ * D_x = A * (P + Pxy) + B * (Q + Qxy)
+ * where
+ * A = 2^(x - y) * (2^(x - y) + 1)^-1
+ * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
+ *
+ * With D_x in hand, we can easily solve for D_y:
+ * D_y = P + Pxy + D_x
+ */
+
+ a = vdev_raidz_pow2[255 + x - y];
+ b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
+ tmp = 255 - vdev_raidz_log2[a ^ 1];
+
+ aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
+ bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
+
+ for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
+ *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
+ vdev_raidz_exp2(*q ^ *qxy, bexp);
+
+ if (i < ysize)
+ *yd = *p ^ *pxy ^ *xd;
+ }
+
+ zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
+ rm->rm_col[VDEV_RAIDZ_P].rc_size);
+ zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+
+ /*
+ * Restore the saved parity data.
+ */
+ rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
+ rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+}
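For reference, the coefficients come straight from the two equations in the
comment above (all arithmetic is in GF(2^8), where addition and subtraction
are both XOR). XOR-ing the saved parity with the zeroed-column parity isolates
the missing columns:

	P ^ Pxy = D_x ^ D_y
	Q ^ Qxy = 2^(ndevs - 1 - x) * D_x  ^  2^(ndevs - 1 - y) * D_y

Multiplying the first equation by 2^(ndevs - 1 - y) and XOR-ing it into the
second eliminates D_y:

	(2^(ndevs - 1 - x) ^ 2^(ndevs - 1 - y)) * D_x =
	    2^(ndevs - 1 - y) * (P ^ Pxy) ^ (Q ^ Qxy)

Dividing through by the left-hand coefficient expresses D_x as an XOR of two
table-multiplied terms; the code folds the two multipliers into the logarithms
aexp and bexp so the per-byte loop needs only lookups and XORs, and D_y then
falls out of the first equation.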
+
+
+static int
+vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+ vdev_t *cvd;
+ uint64_t nparity = vd->vdev_nparity;
+ int c, error;
+ int lasterror = 0;
+ int numerrors = 0;
+
+ ASSERT(nparity > 0);
+
+ if (nparity > VDEV_RAIDZ_MAXPARITY ||
+ vd->vdev_children < nparity + 1) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ cvd = vd->vdev_child[c];
+
+ if ((error = vdev_open(cvd)) != 0) {
+ lasterror = error;
+ numerrors++;
+ continue;
+ }
+
+ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ *ashift = MAX(*ashift, cvd->vdev_ashift);
+ }
+
+ *asize *= vd->vdev_children;
+
+ if (numerrors > nparity) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ return (0);
+}
+
+static void
+vdev_raidz_close(vdev_t *vd)
+{
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+static uint64_t
+vdev_raidz_asize(vdev_t *vd, uint64_t psize)
+{
+ uint64_t asize;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ uint64_t cols = vd->vdev_children;
+ uint64_t nparity = vd->vdev_nparity;
+
+ asize = ((psize - 1) >> ashift) + 1;
+ asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
+ asize = roundup(asize, nparity + 1) << ashift;
+
+ return (asize);
+}
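A small worked case (illustrative numbers): on a 5-disk raidz1 with ashift = 9,
a 2KB psize gives asize = 4 data sectors, plus nparity * ceil(4 / 4) = 1 parity
sector, rounded up to a multiple of nparity + 1 = 2 sectors -- so 6 sectors
(3KB) of raw space are charged for 2KB of data: four data sectors, one parity
sector and one pad sector.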
+
+static void
+vdev_raidz_child_done(zio_t *zio)
+{
+ raidz_col_t *rc = zio->io_private;
+
+ rc->rc_error = zio->io_error;
+ rc->rc_tried = 1;
+ rc->rc_skipped = 0;
+}
+
+static void
+vdev_raidz_repair_done(zio_t *zio)
+{
+ ASSERT(zio->io_private == zio->io_parent);
+ vdev_raidz_map_free(zio->io_private);
+}
+
+static void
+vdev_raidz_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
+ vdev_t *cvd;
+ blkptr_t *bp = zio->io_bp;
+ raidz_map_t *rm;
+ raidz_col_t *rc;
+ int c;
+
+ rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
+ vd->vdev_nparity);
+
+ ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * Generate RAID parity in the first virtual columns.
+ */
+ if (rm->rm_firstdatacol == 1)
+ vdev_raidz_generate_parity_p(rm);
+ else
+ vdev_raidz_generate_parity_pq(rm);
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_raidz_child_done, rc));
+ }
+ zio_wait_children_done(zio);
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ /*
+ * Iterate over the columns in reverse order so that we hit the parity
+ * last -- any errors along the way will force us to read the parity
+ * data.
+ */
+ for (c = rm->rm_cols - 1; c >= 0; c--) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ if (vdev_is_dead(cvd)) {
+ if (c >= rm->rm_firstdatacol)
+ rm->rm_missingdata++;
+ else
+ rm->rm_missingparity++;
+ rc->rc_error = ENXIO;
+ rc->rc_tried = 1; /* don't even try */
+ rc->rc_skipped = 1;
+ continue;
+ }
+ if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
+ if (c >= rm->rm_firstdatacol)
+ rm->rm_missingdata++;
+ else
+ rm->rm_missingparity++;
+ rc->rc_error = ESTALE;
+ rc->rc_skipped = 1;
+ continue;
+ }
+ if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
+ (zio->io_flags & ZIO_FLAG_SCRUB)) {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_raidz_child_done, rc));
+ }
+ }
+
+ zio_wait_children_done(zio);
+}
+
+/*
+ * Report a checksum error for a child of a RAID-Z device.
+ */
+static void
+raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
+{
+ vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
+ dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
+ vdev_description(vd));
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
+}
+
+/*
+ * Generate the parity from the data columns. If we tried and were able to
+ * read the parity without error, verify that the generated parity matches the
+ * data we read. If it doesn't, we fire off a checksum error. Return the
+ * number of such failures.
+ */
+static int
+raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
+{
+ void *orig[VDEV_RAIDZ_MAXPARITY];
+ int c, ret = 0;
+ raidz_col_t *rc;
+
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ rc = &rm->rm_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+ orig[c] = zio_buf_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig[c], rc->rc_size);
+ }
+
+ if (rm->rm_firstdatacol == 1)
+ vdev_raidz_generate_parity_p(rm);
+ else
+ vdev_raidz_generate_parity_pq(rm);
+
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ rc = &rm->rm_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+ if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
+ raidz_checksum_error(zio, rc);
+ rc->rc_error = ECKSUM;
+ ret++;
+ }
+ zio_buf_free(orig[c], rc->rc_size);
+ }
+
+ return (ret);
+}
+
+static uint64_t raidz_corrected_p;
+static uint64_t raidz_corrected_q;
+static uint64_t raidz_corrected_pq;
+
+static void
+vdev_raidz_io_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *cvd;
+ raidz_map_t *rm = zio->io_vsd;
+ raidz_col_t *rc, *rc1;
+ int unexpected_errors = 0;
+ int parity_errors = 0;
+ int parity_untried = 0;
+ int data_errors = 0;
+ int n, c, c1;
+
+ ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
+
+ zio->io_error = 0;
+ zio->io_numerrors = 0;
+
+ ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
+ ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+
+ /*
+ * We preserve any EIOs because those may be worth retrying;
+ * whereas ECKSUM and ENXIO are more likely to be persistent.
+ */
+ if (rc->rc_error) {
+ if (zio->io_error != EIO)
+ zio->io_error = rc->rc_error;
+
+ if (c < rm->rm_firstdatacol)
+ parity_errors++;
+ else
+ data_errors++;
+
+ if (!rc->rc_skipped)
+ unexpected_errors++;
+
+ zio->io_numerrors++;
+ } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
+ parity_untried++;
+ }
+ }
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * If this is not a failfast write, and we were able to
+ * write enough columns to reconstruct the data, good enough.
+ */
+ /* XXPOLICY */
+ if (zio->io_numerrors <= rm->rm_firstdatacol &&
+ !(zio->io_flags & ZIO_FLAG_FAILFAST))
+ zio->io_error = 0;
+
+ vdev_raidz_map_free(zio);
+ zio_next_stage(zio);
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ /*
+ * There are three potential phases for a read:
+ * 1. produce valid data from the columns read
+ * 2. read all disks and try again
+ * 3. perform combinatorial reconstruction
+ *
+ * Each phase is progressively both more expensive and less likely to
+ * occur. If we encounter more errors than we can repair or all phases
+ * fail, we have no choice but to return an error.
+ */
+
+ /*
+ * If the number of errors we saw was correctable -- less than or equal
+ * to the number of parity disks read -- attempt to produce data that
+ * has a valid checksum. Naturally, this case applies in the absence of
+ * any errors.
+ */
+ if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
+ switch (data_errors) {
+ case 0:
+ if (zio_checksum_error(zio) == 0) {
+ zio->io_error = 0;
+ if (parity_errors + parity_untried <
+ rm->rm_firstdatacol) {
+ n = raidz_parity_verify(zio, rm);
+ unexpected_errors += n;
+ ASSERT(parity_errors + n <=
+ rm->rm_firstdatacol);
+ }
+ goto done;
+ }
+ break;
+
+ case 1:
+ /*
+ * We either attempt to read all the parity columns or
+ * none of them. If we didn't try to read parity, we
+ * wouldn't be here in the correctable case. There must
+ * also have been fewer parity errors than parity
+ * columns or, again, we wouldn't be in this code path.
+ */
+ ASSERT(parity_untried == 0);
+ ASSERT(parity_errors < rm->rm_firstdatacol);
+
+ /*
+ * Find the column that reported the error.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error != 0)
+ break;
+ }
+ ASSERT(c != rm->rm_cols);
+ ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
+ rc->rc_error == ESTALE);
+
+ if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
+ vdev_raidz_reconstruct_p(rm, c);
+ } else {
+ ASSERT(rm->rm_firstdatacol > 1);
+ vdev_raidz_reconstruct_q(rm, c);
+ }
+
+ if (zio_checksum_error(zio) == 0) {
+ zio->io_error = 0;
+ if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
+ atomic_inc_64(&raidz_corrected_p);
+ else
+ atomic_inc_64(&raidz_corrected_q);
+
+ /*
+ * If there's more than one parity disk that
+ * was successfully read, confirm that the
+ * other parity disk produced the correct data.
+ * This routine is suboptimal in that it
+ * regenerates both the parity we wish to test
+ * as well as the parity we just used to
+ * perform the reconstruction, but this should
+ * be a relatively uncommon case, and can be
+ * optimized if it becomes a problem.
+ */
+ if (parity_errors < rm->rm_firstdatacol - 1) {
+ n = raidz_parity_verify(zio, rm);
+ unexpected_errors += n;
+ ASSERT(parity_errors + n <=
+ rm->rm_firstdatacol);
+ }
+
+ goto done;
+ }
+ break;
+
+ case 2:
+ /*
+ * Two data column errors require double parity.
+ */
+ ASSERT(rm->rm_firstdatacol == 2);
+
+ /*
+ * Find the two columns that reported errors.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error != 0)
+ break;
+ }
+ ASSERT(c != rm->rm_cols);
+ ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
+ rc->rc_error == ESTALE);
+
+ for (c1 = c++; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error != 0)
+ break;
+ }
+ ASSERT(c != rm->rm_cols);
+ ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
+ rc->rc_error == ESTALE);
+
+ vdev_raidz_reconstruct_pq(rm, c1, c);
+
+ if (zio_checksum_error(zio) == 0) {
+ zio->io_error = 0;
+ atomic_inc_64(&raidz_corrected_pq);
+
+ goto done;
+ }
+ break;
+
+ default:
+ ASSERT(rm->rm_firstdatacol <= 2);
+ ASSERT(0);
+ }
+ }
+
+ /*
+ * This isn't a typical situation -- either we got a read error or
+ * a child silently returned bad data. Read every block so we can
+ * try again with as much data and parity as we can track down. If
+ * we've already been through once before, all children will be marked
+ * as tried so we'll proceed to combinatorial reconstruction.
+ */
+ unexpected_errors = 1;
+ rm->rm_missingdata = 0;
+ rm->rm_missingparity = 0;
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ if (rm->rm_col[c].rc_tried)
+ continue;
+
+ zio->io_error = 0;
+ zio_vdev_io_redone(zio);
+ do {
+ rc = &rm->rm_col[c];
+ if (rc->rc_tried)
+ continue;
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx],
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_raidz_child_done, rc));
+ } while (++c < rm->rm_cols);
+ dprintf("rereading\n");
+ zio_wait_children_done(zio);
+ return;
+ }
+
+ /*
+ * At this point we've attempted to reconstruct the data given the
+ * errors we detected, and we've attempted to read all columns. There
+ * must, therefore, be one or more additional problems -- silent errors
+ * resulting in invalid data rather than explicit I/O errors resulting
+ * in absent data. Before we attempt combinatorial reconstruction, make
+ * sure we have a chance of coming up with the right answer.
+ */
+ if (zio->io_numerrors >= rm->rm_firstdatacol) {
+ ASSERT(zio->io_error != 0);
+ goto done;
+ }
+
+ if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
+ /*
+ * Attempt to reconstruct the data from parity P.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ void *orig;
+ rc = &rm->rm_col[c];
+
+ orig = zio_buf_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig, rc->rc_size);
+ vdev_raidz_reconstruct_p(rm, c);
+
+ if (zio_checksum_error(zio) == 0) {
+ zio_buf_free(orig, rc->rc_size);
+ zio->io_error = 0;
+ atomic_inc_64(&raidz_corrected_p);
+
+ /*
+ * If this child didn't know that it returned
+ * bad data, inform it.
+ */
+ if (rc->rc_tried && rc->rc_error == 0)
+ raidz_checksum_error(zio, rc);
+ rc->rc_error = ECKSUM;
+ goto done;
+ }
+
+ bcopy(orig, rc->rc_data, rc->rc_size);
+ zio_buf_free(orig, rc->rc_size);
+ }
+ }
+
+ if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+ /*
+ * Attempt to reconstruct the data from parity Q.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ void *orig;
+ rc = &rm->rm_col[c];
+
+ orig = zio_buf_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig, rc->rc_size);
+ vdev_raidz_reconstruct_q(rm, c);
+
+ if (zio_checksum_error(zio) == 0) {
+ zio_buf_free(orig, rc->rc_size);
+ zio->io_error = 0;
+ atomic_inc_64(&raidz_corrected_q);
+
+ /*
+ * If this child didn't know that it returned
+ * bad data, inform it.
+ */
+ if (rc->rc_tried && rc->rc_error == 0)
+ raidz_checksum_error(zio, rc);
+ rc->rc_error = ECKSUM;
+ goto done;
+ }
+
+ bcopy(orig, rc->rc_data, rc->rc_size);
+ zio_buf_free(orig, rc->rc_size);
+ }
+ }
+
+ if (rm->rm_firstdatacol > 1 &&
+ rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
+ rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+ /*
+ * Attempt to reconstruct the data from both P and Q.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
+ void *orig, *orig1;
+ rc = &rm->rm_col[c];
+
+ orig = zio_buf_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig, rc->rc_size);
+
+ for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
+ rc1 = &rm->rm_col[c1];
+
+ orig1 = zio_buf_alloc(rc1->rc_size);
+ bcopy(rc1->rc_data, orig1, rc1->rc_size);
+
+ vdev_raidz_reconstruct_pq(rm, c, c1);
+
+ if (zio_checksum_error(zio) == 0) {
+ zio_buf_free(orig, rc->rc_size);
+ zio_buf_free(orig1, rc1->rc_size);
+ zio->io_error = 0;
+ atomic_inc_64(&raidz_corrected_pq);
+
+ /*
+ * If these children didn't know they
+ * returned bad data, inform them.
+ */
+ if (rc->rc_tried && rc->rc_error == 0)
+ raidz_checksum_error(zio, rc);
+ if (rc1->rc_tried && rc1->rc_error == 0)
+ raidz_checksum_error(zio, rc1);
+
+ rc->rc_error = ECKSUM;
+ rc1->rc_error = ECKSUM;
+
+ goto done;
+ }
+
+ bcopy(orig1, rc1->rc_data, rc1->rc_size);
+ zio_buf_free(orig1, rc1->rc_size);
+ }
+
+ bcopy(orig, rc->rc_data, rc->rc_size);
+ zio_buf_free(orig, rc->rc_size);
+ }
+ }
+
+ /*
+ * All combinations failed to checksum. Generate checksum ereports for
+ * all children.
+ */
+ zio->io_error = ECKSUM;
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
+ rc->rc_offset, rc->rc_size);
+ }
+ }
+
+done:
+ zio_checksum_verified(zio);
+
+ if (zio->io_error == 0 && (spa_mode & FWRITE) &&
+ (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+ zio_t *rio;
+
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ *
+ * We issue all repair I/Os as children of 'rio' to arrange
+ * that vdev_raidz_map_free(zio) will be invoked after all
+ * repairs complete, but before we advance to the next stage.
+ */
+ rio = zio_null(zio, zio->io_spa,
+ vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error == 0)
+ continue;
+
+ dprintf("%s resilvered %s @ 0x%llx error %d\n",
+ vdev_description(vd),
+ vdev_description(cvd),
+ zio->io_offset, rc->rc_error);
+
+ zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ ZIO_TYPE_WRITE, zio->io_priority,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_CANFAIL, NULL, NULL));
+ }
+
+ zio_nowait(rio);
+ zio_wait_children_done(zio);
+ return;
+ }
+
+ vdev_raidz_map_free(zio);
+ zio_next_stage(zio);
+}
+
+static void
+vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (faulted > vd->vdev_nparity)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_raidz_ops = {
+ vdev_raidz_open,
+ vdev_raidz_close,
+ vdev_raidz_asize,
+ vdev_raidz_io_start,
+ vdev_raidz_io_done,
+ vdev_raidz_state_change,
+ VDEV_TYPE_RAIDZ, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
new file mode 100644
index 000000000000..0e8752c6ce83
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for the pool's root vdev.
+ */
+
+/*
+ * We should be able to tolerate one failure with absolutely no damage
+ * to our metadata. Two failures will take out space maps, a bunch of
+ * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy
+ * place to live. When we get smarter, we can liberalize this policy.
+ * e.g., if we haven't lost two consecutive top-level vdevs, then we are
+ * probably fine. Adding bean counters during alloc/free can make this
+ * future guesswork more accurate.
+ */
+/*ARGSUSED*/
+static int
+too_many_errors(vdev_t *vd, int numerrors)
+{
+ return (numerrors > 0);
+}
+
+static int
+vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+ vdev_t *cvd;
+ int c, error;
+ int lasterror = 0;
+ int numerrors = 0;
+
+ if (vd->vdev_children == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ cvd = vd->vdev_child[c];
+
+ if ((error = vdev_open(cvd)) != 0) {
+ lasterror = error;
+ numerrors++;
+ continue;
+ }
+ }
+
+ if (too_many_errors(vd, numerrors)) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ *asize = 0;
+ *ashift = 0;
+
+ return (0);
+}
+
+static void
+vdev_root_close(vdev_t *vd)
+{
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+static void
+vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (too_many_errors(vd, faulted))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_root_ops = {
+ vdev_root_open,
+ vdev_root_close,
+ vdev_default_asize,
+ NULL, /* io_start - not applicable to the root */
+ NULL, /* io_done - not applicable to the root */
+ vdev_root_state_change,
+ VDEV_TYPE_ROOT, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c
new file mode 100644
index 000000000000..533431f13314
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c
@@ -0,0 +1,1070 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+
+/*
+ * This file contains the top half of the zfs directory structure
+ * implementation. The bottom half is in zap_leaf.c.
+ *
+ * The zdir is an extendable hash data structure. There is a table of
+ * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
+ * each a constant size and hold a variable number of directory entries.
+ * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
+ *
+ * The pointer table holds a power of 2 number of pointers.
+ * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
+ * by the pointer at index i in the table holds entries whose hash value
+ * has a zd_prefix_len-bit prefix.
+ */
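A rough sketch of the indexing this describes (ptrtbl_bucket() is a
hypothetical standalone helper; the real code goes through ZAP_HASH_IDX() and
zap_idx_to_blk() below): with 1 << zt_shift pointers in the table, a bucket is
selected by the top zt_shift bits of the 64-bit hash, so doubling the table
simply adds one more hash bit to the prefix.

static uint64_t
ptrtbl_bucket(uint64_t hash, int zt_shift)
{
	/* the top zt_shift bits of the hash pick the pointer-table slot */
	return (zt_shift == 0 ? 0 : hash >> (64 - zt_shift));
}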
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/refcount.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+
+int fzap_default_block_shift = 14; /* 16k blocksize */
+
+static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
+static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+
+
+void
+fzap_byteswap(void *vbuf, size_t size)
+{
+ uint64_t block_type;
+
+ block_type = *(uint64_t *)vbuf;
+
+ if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
+ zap_leaf_byteswap(vbuf, size);
+ else {
+ /* it's a ptrtbl block */
+ byteswap_uint64_array(vbuf, size);
+ }
+}
+
+void
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ zap_leaf_t *l;
+ int i;
+ zap_phys_t *zp;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ zap->zap_ismicro = FALSE;
+
+ (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
+ &zap->zap_f.zap_phys, zap_evict);
+
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, MUTEX_DEFAULT, 0);
+ zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
+
+ zp = zap->zap_f.zap_phys;
+ /*
+ * explicitly zero it since it might be coming from an
+ * initialized microzap
+ */
+ bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
+ zp->zap_block_type = ZBT_HEADER;
+ zp->zap_magic = ZAP_MAGIC;
+
+ zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
+
+ zp->zap_freeblk = 2; /* block 1 will be the first leaf */
+ zp->zap_num_leafs = 1;
+ zp->zap_num_entries = 0;
+ zp->zap_salt = zap->zap_salt;
+
+ /* block 1 will be the first leaf */
+ for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
+ ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
+
+ /*
+ * set up block 1 - the first leaf
+ */
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+
+ l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ l->l_dbuf = db;
+ l->l_phys = db->db_data;
+
+ zap_leaf_init(l);
+
+ kmem_free(l, sizeof (zap_leaf_t));
+ dmu_buf_rele(db, FTAG);
+}
+
+static int
+zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
+{
+ if (RW_WRITE_HELD(&zap->zap_rwlock))
+ return (1);
+ if (rw_tryupgrade(&zap->zap_rwlock)) {
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Generic routines for dealing with the pointer & cookie tables.
+ */
+
+static int
+zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
+ void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
+ dmu_tx_t *tx)
+{
+ uint64_t b, newblk;
+ dmu_buf_t *db_old, *db_new;
+ int err;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ int hepb = 1<<(bs-4);
+ /* hepb = half the number of entries in a block */
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT(tbl->zt_blk != 0);
+ ASSERT(tbl->zt_numblks > 0);
+
+ if (tbl->zt_nextblk != 0) {
+ newblk = tbl->zt_nextblk;
+ } else {
+ newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
+ tbl->zt_nextblk = newblk;
+ ASSERT3U(tbl->zt_blks_copied, ==, 0);
+ dmu_prefetch(zap->zap_objset, zap->zap_object,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs);
+ }
+
+ /*
+ * Copy the ptrtbl from the old to new location.
+ */
+
+ b = tbl->zt_blks_copied;
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + b) << bs, FTAG, &db_old);
+ if (err)
+ return (err);
+
+ /* first half of entries in old[b] go to new[2*b+0] */
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+0) << bs, FTAG, &db_new));
+ dmu_buf_will_dirty(db_new, tx);
+ transfer_func(db_old->db_data, db_new->db_data, hepb);
+ dmu_buf_rele(db_new, FTAG);
+
+ /* second half of entries in old[b] go to new[2*b+1] */
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+1) << bs, FTAG, &db_new));
+ dmu_buf_will_dirty(db_new, tx);
+ transfer_func((uint64_t *)db_old->db_data + hepb,
+ db_new->db_data, hepb);
+ dmu_buf_rele(db_new, FTAG);
+
+ dmu_buf_rele(db_old, FTAG);
+
+ tbl->zt_blks_copied++;
+
+ dprintf("copied block %llu of %llu\n",
+ tbl->zt_blks_copied, tbl->zt_numblks);
+
+ if (tbl->zt_blks_copied == tbl->zt_numblks) {
+ (void) dmu_free_range(zap->zap_objset, zap->zap_object,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
+
+ tbl->zt_blk = newblk;
+ tbl->zt_numblks *= 2;
+ tbl->zt_shift++;
+ tbl->zt_nextblk = 0;
+ tbl->zt_blks_copied = 0;
+
+ dprintf("finished; numblocks now %llu (%lluk entries)\n",
+ tbl->zt_numblks, 1<<(tbl->zt_shift-10));
+ }
+
+ return (0);
+}
+
+static int
+zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
+ dmu_tx_t *tx)
+{
+ int err;
+ uint64_t blk, off;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ dmu_buf_t *db;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(tbl->zt_blk != 0);
+
+ dprintf("storing %llx at index %llx\n", val, idx);
+
+ blk = idx >> (bs-3);
+ off = idx & ((1<<(bs-3))-1);
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + blk) << bs, FTAG, &db);
+ if (err)
+ return (err);
+ dmu_buf_will_dirty(db, tx);
+
+ if (tbl->zt_nextblk != 0) {
+ uint64_t idx2 = idx * 2;
+ uint64_t blk2 = idx2 >> (bs-3);
+ uint64_t off2 = idx2 & ((1<<(bs-3))-1);
+ dmu_buf_t *db2;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_nextblk + blk2) << bs, FTAG, &db2);
+ if (err) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+ dmu_buf_will_dirty(db2, tx);
+ ((uint64_t *)db2->db_data)[off2] = val;
+ ((uint64_t *)db2->db_data)[off2+1] = val;
+ dmu_buf_rele(db2, FTAG);
+ }
+
+ ((uint64_t *)db->db_data)[off] = val;
+ dmu_buf_rele(db, FTAG);
+
+ return (0);
+}
+
+static int
+zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
+{
+ uint64_t blk, off;
+ int err;
+ dmu_buf_t *db;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ blk = idx >> (bs-3);
+ off = idx & ((1<<(bs-3))-1);
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + blk) << bs, FTAG, &db);
+ if (err)
+ return (err);
+ *valp = ((uint64_t *)db->db_data)[off];
+ dmu_buf_rele(db, FTAG);
+
+ if (tbl->zt_nextblk != 0) {
+ /*
+ * read the nextblk for the sake of i/o error checking,
+ * so that zap_table_load() will catch errors for
+ * zap_table_store.
+ */
+ blk = (idx*2) >> (bs-3);
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_nextblk + blk) << bs, FTAG, &db);
+ dmu_buf_rele(db, FTAG);
+ }
+ return (err);
+}
+
+/*
+ * Routines for growing the ptrtbl.
+ */
+
+static void
+zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
+{
+ int i;
+ for (i = 0; i < n; i++) {
+ uint64_t lb = src[i];
+ dst[2*i+0] = lb;
+ dst[2*i+1] = lb;
+ }
+}
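For example (illustrative values), growing a four-entry table {5, 5, 7, 9} with
this transfer function yields {5, 5, 5, 5, 7, 7, 9, 9}: after the doubling,
each leaf is simply reachable from twice as many pointer-table slots until it
actually splits.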
+
+static int
+zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
+{
+ /* In case things go horribly wrong. */
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2)
+ return (ENOSPC);
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ /*
+ * We are outgrowing the "embedded" ptrtbl (the one
+ * stored in the header block). Give it its own entire
+ * block, which will double the size of the ptrtbl.
+ */
+ uint64_t newblk;
+ dmu_buf_t *db_new;
+ int err;
+
+ ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+ ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
+
+ newblk = zap_allocate_blocks(zap, 1);
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new);
+ if (err)
+ return (err);
+ dmu_buf_will_dirty(db_new, tx);
+ zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+ db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ dmu_buf_rele(db_new, FTAG);
+
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
+
+ ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+ (FZAP_BLOCK_SHIFT(zap)-3));
+
+ return (0);
+ } else {
+ return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ zap_ptrtbl_transfer, tx));
+ }
+}
+
+static void
+zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx);
+ ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
+ zap->zap_f.zap_phys->zap_num_entries += delta;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+}
+
+static uint64_t
+zap_allocate_blocks(zap_t *zap, int nblocks)
+{
+ uint64_t newblk;
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ newblk = zap->zap_f.zap_phys->zap_freeblk;
+ zap->zap_f.zap_phys->zap_freeblk += nblocks;
+ return (newblk);
+}
+
+static zap_leaf_t *
+zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
+{
+ void *winner;
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = zap_allocate_blocks(zap, 1);
+ l->l_dbuf = NULL;
+ l->l_phys = NULL;
+
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
+ winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
+ ASSERT(winner == NULL);
+ dmu_buf_will_dirty(l->l_dbuf, tx);
+
+ zap_leaf_init(l);
+
+ zap->zap_f.zap_phys->zap_num_leafs++;
+
+ return (l);
+}
+
+int
+fzap_count(zap_t *zap, uint64_t *count)
+{
+ ASSERT(!zap->zap_ismicro);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
+ *count = zap->zap_f.zap_phys->zap_num_entries;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+ return (0);
+}
+
+/*
+ * Routines for obtaining zap_leaf_t's
+ */
+
+void
+zap_put_leaf(zap_leaf_t *l)
+{
+ rw_exit(&l->l_rwlock);
+ dmu_buf_rele(l->l_dbuf, NULL);
+}
+
+_NOTE(ARGSUSED(0))
+static void
+zap_leaf_pageout(dmu_buf_t *db, void *vl)
+{
+ zap_leaf_t *l = vl;
+
+ rw_destroy(&l->l_rwlock);
+ kmem_free(l, sizeof (zap_leaf_t));
+}
+
+static zap_leaf_t *
+zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
+{
+ zap_leaf_t *l, *winner;
+
+ ASSERT(blkid != 0);
+
+ l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = blkid;
+ l->l_bs = highbit(db->db_size)-1;
+ l->l_dbuf = db;
+ l->l_phys = NULL;
+
+ winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
+
+ rw_exit(&l->l_rwlock);
+ if (winner != NULL) {
+ /* someone else set it first */
+ zap_leaf_pageout(NULL, l);
+ l = winner;
+ }
+
+ /*
+ * lhr_pad was previously used for the next leaf in the leaf
+ * chain. There should be no chained leaves (as we have removed
+ * support for them).
+ */
+ ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0);
+
+ /*
+ * There should be more hash entries than there can be
+ * chunks to put in the hash table
+ */
+ ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
+
+ /* The chunks should begin at the end of the hash table */
+ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
+ &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
+
+ /* The chunks should end at the end of the block */
+ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
+ (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size);
+
+ return (l);
+}
+
+static int
+zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
+ zap_leaf_t **lp)
+{
+ dmu_buf_t *db;
+ zap_leaf_t *l;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ int err;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ blkid << bs, NULL, &db);
+ if (err)
+ return (err);
+
+ ASSERT3U(db->db_object, ==, zap->zap_object);
+ ASSERT3U(db->db_offset, ==, blkid << bs);
+ ASSERT3U(db->db_size, ==, 1 << bs);
+ ASSERT(blkid != 0);
+
+ l = dmu_buf_get_user(db);
+
+ if (l == NULL)
+ l = zap_open_leaf(blkid, db);
+
+ rw_enter(&l->l_rwlock, lt);
+ /*
+ * Must lock before dirtying, otherwise l->l_phys could change,
+ * causing ASSERT below to fail.
+ */
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(l->l_blkid, ==, blkid);
+ ASSERT3P(l->l_dbuf, ==, db);
+ ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
+ ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF);
+ ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ *lp = l;
+ return (0);
+}
+
+static int
+zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
+{
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ ASSERT3U(idx, <,
+ (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
+ *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
+ return (0);
+ } else {
+ return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ idx, valp));
+ }
+}
+
+static int
+zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
+{
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
+ ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
+ return (0);
+ } else {
+ return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ idx, blk, tx));
+ }
+}
+
+static int
+zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
+{
+ uint64_t idx, blk;
+ int err;
+
+ ASSERT(zap->zap_dbuf == NULL ||
+ zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
+ ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
+ idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ err = zap_idx_to_blk(zap, idx, &blk);
+ if (err != 0)
+ return (err);
+ err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
+
+ ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) ==
+ (*lp)->l_phys->l_hdr.lh_prefix);
+ return (err);
+}
+
+static int
+zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx,
+ zap_leaf_t **lp)
+{
+ zap_leaf_t *nl;
+ int prefix_diff, i, err;
+ uint64_t sibling;
+ int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+
+ ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+ l->l_phys->l_hdr.lh_prefix);
+
+ if (zap_tryupgradedir(zap, tx) == 0 ||
+ old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+ /* We failed to upgrade, or need to grow the pointer table */
+ objset_t *os = zap->zap_objset;
+ uint64_t object = zap->zap_object;
+
+ zap_put_leaf(l);
+ zap_unlockdir(zap);
+ err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
+ if (err)
+ return (err);
+ ASSERT(!zap->zap_ismicro);
+
+ while (old_prefix_len ==
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+ err = zap_grow_ptrtbl(zap, tx);
+ if (err)
+ return (err);
+ }
+
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err)
+ return (err);
+
+ if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) {
+ /* it split while our locks were down */
+ *lp = l;
+ return (0);
+ }
+ }
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+ l->l_phys->l_hdr.lh_prefix);
+
+ prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
+ (old_prefix_len + 1);
+ sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
+
+ /* check for i/o errors before doing zap_leaf_split */
+ for (i = 0; i < (1ULL<<prefix_diff); i++) {
+ uint64_t blk;
+ err = zap_idx_to_blk(zap, sibling+i, &blk);
+ if (err)
+ return (err);
+ ASSERT3U(blk, ==, l->l_blkid);
+ }
+
+ nl = zap_create_leaf(zap, tx);
+ zap_leaf_split(l, nl);
+
+ /* set sibling pointers */
+ for (i = 0; i < (1ULL<<prefix_diff); i++) {
+ err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
+ ASSERT3U(err, ==, 0); /* we checked for i/o errors above */
+ }
+
+ if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) {
+ /* we want the sibling */
+ zap_put_leaf(l);
+ *lp = nl;
+ } else {
+ zap_put_leaf(nl);
+ *lp = l;
+ }
+
+ return (0);
+}
+
+static void
+zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
+{
+ int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+ int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
+ l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
+
+ zap_put_leaf(l);
+
+ if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
+ int err;
+
+ /*
+ * We are in the middle of growing the pointer table, or
+ * this leaf will soon make us grow it.
+ */
+ if (zap_tryupgradedir(zap, tx) == 0) {
+ objset_t *os = zap->zap_objset;
+ uint64_t zapobj = zap->zap_object;
+
+ zap_unlockdir(zap);
+ err = zap_lockdir(os, zapobj, tx,
+ RW_WRITER, FALSE, &zap);
+ if (err)
+ return;
+ }
+
+ /* could have finished growing while our locks were down */
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
+ (void) zap_grow_ptrtbl(zap, tx);
+ }
+}
+
+
+static int
+fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
+{
+ if (name && strlen(name) > ZAP_MAXNAMELEN)
+ return (E2BIG);
+
+ /* Only integer sizes supported by C */
+ switch (integer_size) {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ if (integer_size * num_integers > ZAP_MAXVALUELEN)
+ return (E2BIG);
+
+ return (0);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+int
+fzap_lookup(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_leaf_t *l;
+ int err;
+ uint64_t hash;
+ zap_entry_handle_t zeh;
+
+ err = fzap_checksize(name, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ hash = zap_hash(zap, name);
+ err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err == 0)
+ err = zap_entry_read(&zeh, integer_size, num_integers, buf);
+
+ zap_put_leaf(l);
+ return (err);
+}
+
+int
+fzap_add_cd(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ uint64_t hash;
+ int err;
+ zap_entry_handle_t zeh;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(!zap->zap_ismicro);
+ ASSERT(fzap_checksize(name, integer_size, num_integers) == 0);
+
+ hash = zap_hash(zap, name);
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+retry:
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err == 0) {
+ err = EEXIST;
+ goto out;
+ }
+ if (err != ENOENT)
+ goto out;
+
+ err = zap_entry_create(l, name, hash, cd,
+ integer_size, num_integers, val, &zeh);
+
+ if (err == 0) {
+ zap_increment_num_entries(zap, 1, tx);
+ } else if (err == EAGAIN) {
+ err = zap_expand_leaf(zap, l, hash, tx, &l);
+ if (err == 0)
+ goto retry;
+ }
+
+out:
+ zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
+ return (err);
+}
+
+int
+fzap_add(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ int err = fzap_checksize(name, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ return (fzap_add_cd(zap, name, integer_size, num_integers,
+ val, ZAP_MAXCD, tx));
+}
+
+int
+fzap_update(zap_t *zap, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ uint64_t hash;
+ int err, create;
+ zap_entry_handle_t zeh;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ err = fzap_checksize(name, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ hash = zap_hash(zap, name);
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+retry:
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ create = (err == ENOENT);
+ ASSERT(err == 0 || err == ENOENT);
+
+ /* XXX If this leaf is chained, split it if we can. */
+
+ if (create) {
+ err = zap_entry_create(l, name, hash, ZAP_MAXCD,
+ integer_size, num_integers, val, &zeh);
+ if (err == 0)
+ zap_increment_num_entries(zap, 1, tx);
+ } else {
+ err = zap_entry_update(&zeh, integer_size, num_integers, val);
+ }
+
+ if (err == EAGAIN) {
+ err = zap_expand_leaf(zap, l, hash, tx, &l);
+ if (err == 0)
+ goto retry;
+ }
+
+ zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
+ return (err);
+}
+
+int
+fzap_length(zap_t *zap, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_leaf_t *l;
+ int err;
+ uint64_t hash;
+ zap_entry_handle_t zeh;
+
+ hash = zap_hash(zap, name);
+ err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err != 0)
+ goto out;
+
+ if (integer_size)
+ *integer_size = zeh.zeh_integer_size;
+ if (num_integers)
+ *num_integers = zeh.zeh_num_integers;
+out:
+ zap_put_leaf(l);
+ return (err);
+}
+
+int
+fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ uint64_t hash;
+ int err;
+ zap_entry_handle_t zeh;
+
+ hash = zap_hash(zap, name);
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err == 0) {
+ zap_entry_remove(&zeh);
+ zap_increment_num_entries(zap, -1, tx);
+ }
+ zap_put_leaf(l);
+ dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n",
+ zap->zap_objset, zap->zap_object, name, err);
+ return (err);
+}
+
+int
+zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name)
+{
+ zap_cursor_t zc;
+ zap_attribute_t *za;
+ int err;
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, zapobj);
+ (err = zap_cursor_retrieve(&zc, za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if (za->za_first_integer == value) {
+ (void) strcpy(name, za->za_name);
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (zap_attribute_t));
+ return (err);
+}
+
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+int
+fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err = ENOENT;
+ zap_entry_handle_t zeh;
+ zap_leaf_t *l;
+
+ /* retrieve the next entry at or after zc_hash/zc_cd */
+ /* if no entry, return ENOENT */
+
+ if (zc->zc_leaf &&
+ (ZAP_HASH_IDX(zc->zc_hash,
+ zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) !=
+ zc->zc_leaf->l_phys->l_hdr.lh_prefix)) {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
+
+again:
+ if (zc->zc_leaf == NULL) {
+ err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
+ &zc->zc_leaf);
+ if (err != 0)
+ return (err);
+ } else {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ }
+ l = zc->zc_leaf;
+
+ err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
+
+ if (err == ENOENT) {
+ uint64_t nocare =
+ (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1;
+ zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
+ zc->zc_cd = 0;
+ if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) {
+ zc->zc_hash = -1ULL;
+ } else {
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ goto again;
+ }
+ }
+
+ if (err == 0) {
+ zc->zc_hash = zeh.zeh_hash;
+ zc->zc_cd = zeh.zeh_cd;
+ za->za_integer_length = zeh.zeh_integer_size;
+ za->za_num_integers = zeh.zeh_num_integers;
+ if (zeh.zeh_num_integers == 0) {
+ za->za_first_integer = 0;
+ } else {
+ err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
+ ASSERT(err == 0 || err == EOVERFLOW);
+ }
+ err = zap_entry_read_name(&zeh,
+ sizeof (za->za_name), za->za_name);
+ ASSERT(err == 0);
+ }
+ rw_exit(&zc->zc_leaf->l_rwlock);
+ return (err);
+}
+
+
+static void
+zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
+{
+ int i, err;
+ uint64_t lastblk = 0;
+
+ /*
+ * NB: if a leaf has more pointers than an entire ptrtbl block
+ * can hold, then it'll be accounted for more than once, since
+ * we won't have lastblk.
+ */
+ for (i = 0; i < len; i++) {
+ zap_leaf_t *l;
+
+ if (tbl[i] == lastblk)
+ continue;
+ lastblk = tbl[i];
+
+ err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
+ if (err == 0) {
+ zap_leaf_stats(zap, l, zs);
+ zap_put_leaf(l);
+ }
+ }
+}
+
+void
+fzap_get_stats(zap_t *zap, zap_stats_t *zs)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ zs->zs_blocksize = 1ULL << bs;
+
+ /*
+ * Set zap_phys_t fields
+ */
+ zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
+ zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
+ zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
+ zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type;
+ zs->zs_magic = zap->zap_f.zap_phys->zap_magic;
+ zs->zs_salt = zap->zap_f.zap_phys->zap_salt;
+
+ /*
+ * Set zap_ptrtbl fields
+ */
+ zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+ zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk;
+ zs->zs_ptrtbl_blks_copied =
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied;
+ zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk;
+ zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+ zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ /* the ptrtbl is entirely in the header block. */
+ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+ 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
+ } else {
+ int b;
+
+ dmu_prefetch(zap->zap_objset, zap->zap_object,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);
+
+ for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+ b++) {
+ dmu_buf_t *db;
+ int err;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
+ FTAG, &db);
+ if (err == 0) {
+ zap_stats_ptrtbl(zap, db->db_data,
+ 1<<(bs-3), zs);
+ dmu_buf_rele(db, FTAG);
+ }
+ }
+ }
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
new file mode 100644
index 000000000000..5dff5145308a
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
@@ -0,0 +1,741 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * The 512-byte leaf is broken into 32 16-byte chunks.
+ * Chunk number n means l_chunk[n], even though the header precedes it.
+ * The names are stored null-terminated.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+
+#define CHAIN_END 0xffff /* end of the chunk chain */
+
+/* half the (current) minimum block size */
+#define MAX_ARRAY_BYTES (8<<10)
+
+#define LEAF_HASH(l, h) \
+ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
+ ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
+
+#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
+
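+/*
+ * LEAF_HASH() uses the ZAP_LEAF_HASH_SHIFT(l) hash bits just below the
+ * lh_prefix_len bits that identify the leaf itself.  For example (with
+ * hypothetical sizes), a prefix length of 10 and a hash-table shift of
+ * 5 means bits 53..49 of the 64-bit hash index the 32-entry l_hash[]
+ * table.
+ */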
+
+static void
+zap_memset(void *a, int c, size_t n)
+{
+ char *cp = a;
+ char *cpend = cp + n;
+
+ while (cp < cpend)
+ *cp++ = c;
+}
+
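+/*
+ * stv() and ldv() store and load a native-endian integer of width 1,
+ * 2, 4 or 8 bytes; e.g. ldv(2, p) reads a uint16_t from p and widens
+ * it to a uint64_t.
+ */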
+static void
+stv(int len, void *addr, uint64_t value)
+{
+ switch (len) {
+ case 1:
+ *(uint8_t *)addr = value;
+ return;
+ case 2:
+ *(uint16_t *)addr = value;
+ return;
+ case 4:
+ *(uint32_t *)addr = value;
+ return;
+ case 8:
+ *(uint64_t *)addr = value;
+ return;
+ }
+ ASSERT(!"bad int len");
+}
+
+static uint64_t
+ldv(int len, const void *addr)
+{
+ switch (len) {
+ case 1:
+ return (*(uint8_t *)addr);
+ case 2:
+ return (*(uint16_t *)addr);
+ case 4:
+ return (*(uint32_t *)addr);
+ case 8:
+ return (*(uint64_t *)addr);
+ }
+ ASSERT(!"bad int len");
+ return (0xFEEDFACEDEADBEEFULL);
+}
+
+void
+zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
+{
+ int i;
+ zap_leaf_t l;
+ l.l_bs = highbit(size)-1;
+ l.l_phys = buf;
+
+ buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
+ buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
+ buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
+ buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
+ buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
+ buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
+ buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
+
+ for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
+ buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
+
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
+ zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
+ struct zap_leaf_entry *le;
+
+ switch (lc->l_free.lf_type) {
+ case ZAP_CHUNK_ENTRY:
+ le = &lc->l_entry;
+
+ le->le_type = BSWAP_8(le->le_type);
+ le->le_int_size = BSWAP_8(le->le_int_size);
+ le->le_next = BSWAP_16(le->le_next);
+ le->le_name_chunk = BSWAP_16(le->le_name_chunk);
+ le->le_name_length = BSWAP_16(le->le_name_length);
+ le->le_value_chunk = BSWAP_16(le->le_value_chunk);
+ le->le_value_length = BSWAP_16(le->le_value_length);
+ le->le_cd = BSWAP_32(le->le_cd);
+ le->le_hash = BSWAP_64(le->le_hash);
+ break;
+ case ZAP_CHUNK_FREE:
+ lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
+ lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
+ break;
+ case ZAP_CHUNK_ARRAY:
+ lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
+ lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
+ /* la_array doesn't need swapping */
+ break;
+ default:
+ ASSERT(!"bad leaf type");
+ }
+ }
+}
+
+void
+zap_leaf_init(zap_leaf_t *l)
+{
+ int i;
+
+ l->l_bs = highbit(l->l_dbuf->db_size)-1;
+ zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
+ zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
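+	/* Chain every chunk onto the freelist: i -> i+1, last -> CHAIN_END. */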
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
+ ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
+ }
+ ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
+ l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
+ l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+ l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+}
+
+/*
+ * Routines which manipulate leaf chunks (l_chunk[]).
+ */
+
+static uint16_t
+zap_leaf_chunk_alloc(zap_leaf_t *l)
+{
+ int chunk;
+
+ ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
+
+ chunk = l->l_phys->l_hdr.lh_freelist;
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
+
+ l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
+
+ l->l_phys->l_hdr.lh_nfree--;
+
+ return (chunk);
+}
+
+static void
+zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
+{
+ struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
+ ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
+
+ zlf->lf_type = ZAP_CHUNK_FREE;
+ zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
+ bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
+ l->l_phys->l_hdr.lh_freelist = chunk;
+
+ l->l_phys->l_hdr.lh_nfree++;
+}
+
+/*
+ * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
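+ *
+ * Values are packed into la_array[] big-endian (most significant byte
+ * first), ZAP_LEAF_ARRAY_BYTES bytes per chunk, with chunks chained
+ * through la_next and terminated by CHAIN_END.  For example, storing
+ * the single 8-byte integer 0x0102030405060708ULL (a hypothetical
+ * value) places bytes 01 02 ... 08 at the start of the first chunk's
+ * la_array[].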
+ */
+
+static uint16_t
+zap_leaf_array_create(zap_leaf_t *l, const char *buf,
+ int integer_size, int num_integers)
+{
+ uint16_t chunk_head;
+ uint16_t *chunkp = &chunk_head;
+ int byten = 0;
+ uint64_t value;
+ int shift = (integer_size-1)*8;
+ int len = num_integers;
+
+ ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
+
+ while (len > 0) {
+ uint16_t chunk = zap_leaf_chunk_alloc(l);
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int i;
+
+ la->la_type = ZAP_CHUNK_ARRAY;
+ for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
+ if (byten == 0)
+ value = ldv(integer_size, buf);
+ la->la_array[i] = value >> shift;
+ value <<= 8;
+ if (++byten == integer_size) {
+ byten = 0;
+ buf += integer_size;
+ if (--len == 0)
+ break;
+ }
+ }
+
+ *chunkp = chunk;
+ chunkp = &la->la_next;
+ }
+ *chunkp = CHAIN_END;
+
+ return (chunk_head);
+}
+
+static void
+zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
+{
+ uint16_t chunk = *chunkp;
+
+ *chunkp = CHAIN_END;
+
+ while (chunk != CHAIN_END) {
+ int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
+ ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
+ ZAP_CHUNK_ARRAY);
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ }
+}
+
+/* array_len and buf_len are in integers, not bytes */
+static void
+zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
+ int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
+ char *buf)
+{
+ int len = MIN(array_len, buf_len);
+ int byten = 0;
+ uint64_t value = 0;
+
+ ASSERT3U(array_int_len, <=, buf_int_len);
+
+ /* Fast path for one 8-byte integer */
+ if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ uint8_t *ip = la->la_array;
+ uint64_t *buf64 = (uint64_t *)buf;
+
+ *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
+ (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
+ (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
+ (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
+ return;
+ }
+
+	/* Fast path for an array of 1-byte integers (e.g. the entry name) */
+ if (array_int_len == 1 && buf_int_len == 1 &&
+ buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_array *la =
+ &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
+ buf += ZAP_LEAF_ARRAY_BYTES;
+ chunk = la->la_next;
+ }
+ return;
+ }
+
+ while (len > 0) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int i;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
+ value = (value << 8) | la->la_array[i];
+ byten++;
+ if (byten == array_int_len) {
+ stv(buf_int_len, buf, value);
+ byten = 0;
+ len--;
+ if (len == 0)
+ return;
+ buf += buf_int_len;
+ }
+ }
+ chunk = la->la_next;
+ }
+}
+
+/*
+ * Only to be used on 8-bit arrays.
+ * array_len is the actual length in bytes (not the encoded le_value_length).
+ * buf is null-terminated.
+ */
+static int
+zap_leaf_array_equal(zap_leaf_t *l, int chunk,
+ int array_len, const char *buf)
+{
+ int bseen = 0;
+
+ while (bseen < array_len) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ if (bcmp(la->la_array, buf + bseen, toread))
+ break;
+ chunk = la->la_next;
+ bseen += toread;
+ }
+ return (bseen == array_len);
+}
+
+/*
+ * Routines which manipulate leaf entries.
+ */
+
+int
+zap_leaf_lookup(zap_leaf_t *l,
+ const char *name, uint64_t h, zap_entry_handle_t *zeh)
+{
+ uint16_t *chunkp;
+ struct zap_leaf_entry *le;
+
+ ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ for (chunkp = LEAF_HASH_ENTPTR(l, h);
+ *chunkp != CHAIN_END; chunkp = &le->le_next) {
+ uint16_t chunk = *chunkp;
+ le = ZAP_LEAF_ENTRY(l, chunk);
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (le->le_hash != h)
+ continue;
+
+ if (zap_leaf_array_equal(l, le->le_name_chunk,
+ le->le_name_length, name)) {
+ zeh->zeh_num_integers = le->le_value_length;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+ zeh->zeh_leaf = l;
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+/* Return (h1,cd1 >= h2,cd2) */
+#define HCD_GTEQ(h1, cd1, h2, cd2) \
+ ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
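+/* e.g. HCD_GTEQ(5, 2, 5, 1) is TRUE; HCD_GTEQ(4, 9, 5, 0) is FALSE. */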
+
+int
+zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ uint64_t besth = -1ULL;
+ uint32_t bestcd = ZAP_MAXCD;
+ uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
+ uint16_t lh;
+ struct zap_leaf_entry *le;
+
+ ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
+ for (chunk = l->l_phys->l_hash[lh];
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
+ HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
+ ASSERT3U(bestlh, >=, lh);
+ bestlh = lh;
+ besth = le->le_hash;
+ bestcd = le->le_cd;
+
+ zeh->zeh_num_integers = le->le_value_length;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_fakechunk = chunk;
+ zeh->zeh_chunkp = &zeh->zeh_fakechunk;
+ zeh->zeh_leaf = l;
+ }
+ }
+ }
+
+ return (bestcd == ZAP_MAXCD ? ENOENT : 0);
+}
+
+int
+zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf)
+{
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (le->le_int_size > integer_size)
+ return (EINVAL);
+
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size,
+ le->le_value_length, integer_size, num_integers, buf);
+
+ if (zeh->zeh_num_integers > num_integers)
+ return (EOVERFLOW);
+	return (0);
+}
+
+int
+zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
+{
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
+ le->le_name_length, 1, buflen, buf);
+ if (le->le_name_length > buflen)
+ return (EOVERFLOW);
+ return (0);
+}
+
+int
+zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf)
+{
+ int delta_chunks;
+ zap_leaf_t *l = zeh->zeh_leaf;
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
+
+ delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size);
+
+ if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
+ return (EAGAIN);
+
+ /*
+ * We should search other chained leaves (via
+	 * zap_entry_remove,create?); otherwise returning EAGAIN will
+ * just send us into an infinite loop if we have to chain
+ * another leaf block, rather than being able to split this
+ * block.
+ */
+
+ zap_leaf_array_free(l, &le->le_value_chunk);
+ le->le_value_chunk =
+ zap_leaf_array_create(l, buf, integer_size, num_integers);
+ le->le_value_length = num_integers;
+ le->le_int_size = integer_size;
+ return (0);
+}
+
+void
+zap_entry_remove(zap_entry_handle_t *zeh)
+{
+ uint16_t entry_chunk;
+ struct zap_leaf_entry *le;
+ zap_leaf_t *l = zeh->zeh_leaf;
+
+ ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
+
+ entry_chunk = *zeh->zeh_chunkp;
+ le = ZAP_LEAF_ENTRY(l, entry_chunk);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ zap_leaf_array_free(l, &le->le_name_chunk);
+ zap_leaf_array_free(l, &le->le_value_chunk);
+
+ *zeh->zeh_chunkp = le->le_next;
+ zap_leaf_chunk_free(l, entry_chunk);
+
+ l->l_phys->l_hdr.lh_nentries--;
+}
+
+int
+zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ uint16_t *chunkp;
+ struct zap_leaf_entry *le;
+ uint64_t namelen, valuelen;
+ int numchunks;
+
+ valuelen = integer_size * num_integers;
+ namelen = strlen(name) + 1;
+ ASSERT(namelen >= 2);
+
+ numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) +
+ ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
+ if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
+ return (E2BIG);
+
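+	/*
+	 * A cd of ZAP_MAXCD means "pick one for us": try each value in
+	 * turn until we find one not already used by an entry with the
+	 * same hash in this leaf.
+	 */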
+ if (cd == ZAP_MAXCD) {
+ for (cd = 0; cd < ZAP_MAXCD; cd++) {
+ for (chunk = *LEAF_HASH_ENTPTR(l, h);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ if (le->le_hash == h &&
+ le->le_cd == cd) {
+ break;
+ }
+ }
+ /* If this cd is not in use, we are good. */
+ if (chunk == CHAIN_END)
+ break;
+ }
+ /* If we tried all the cd's, we lose. */
+ if (cd == ZAP_MAXCD)
+ return (ENOSPC);
+ }
+
+ if (l->l_phys->l_hdr.lh_nfree < numchunks)
+ return (EAGAIN);
+
+ /* make the entry */
+ chunk = zap_leaf_chunk_alloc(l);
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ le->le_type = ZAP_CHUNK_ENTRY;
+ le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen);
+ le->le_name_length = namelen;
+ le->le_value_chunk =
+ zap_leaf_array_create(l, buf, integer_size, num_integers);
+ le->le_value_length = num_integers;
+ le->le_int_size = integer_size;
+ le->le_hash = h;
+ le->le_cd = cd;
+
+ /* link it into the hash chain */
+ chunkp = LEAF_HASH_ENTPTR(l, h);
+ le->le_next = *chunkp;
+ *chunkp = chunk;
+
+ l->l_phys->l_hdr.lh_nentries++;
+
+ zeh->zeh_leaf = l;
+ zeh->zeh_num_integers = num_integers;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+
+ return (0);
+}
+
+/*
+ * Routines for transferring entries between leafs.
+ */
+
+static void
+zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
+{
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
+ uint16_t *ptr = LEAF_HASH_ENTPTR(l, le->le_hash);
+ le->le_next = *ptr;
+ *ptr = entry;
+}
+
+static uint16_t
+zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
+{
+ uint16_t new_chunk;
+ uint16_t *nchunkp = &new_chunk;
+
+ while (chunk != CHAIN_END) {
+ uint16_t nchunk = zap_leaf_chunk_alloc(nl);
+ struct zap_leaf_array *nla =
+ &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
+ struct zap_leaf_array *la =
+ &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int nextchunk = la->la_next;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
+
+ *nla = *la; /* structure assignment */
+
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ *nchunkp = nchunk;
+ nchunkp = &nla->la_next;
+ }
+ *nchunkp = CHAIN_END;
+ return (new_chunk);
+}
+
+static void
+zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
+{
+ struct zap_leaf_entry *le, *nle;
+ uint16_t chunk;
+
+ le = ZAP_LEAF_ENTRY(l, entry);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ chunk = zap_leaf_chunk_alloc(nl);
+ nle = ZAP_LEAF_ENTRY(nl, chunk);
+ *nle = *le; /* structure assignment */
+
+ zap_leaf_rehash_entry(nl, chunk);
+
+ nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
+ nle->le_value_chunk =
+ zap_leaf_transfer_array(l, le->le_value_chunk, nl);
+
+ zap_leaf_chunk_free(l, entry);
+
+ l->l_phys->l_hdr.lh_nentries--;
+ nl->l_phys->l_hdr.lh_nentries++;
+}
+
+/*
+ * Transfer the entries whose hash prefix ends in 1 to the new leaf.
+ */
+void
+zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl)
+{
+ int i;
+ int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
+
+ /* set new prefix and prefix_len */
+ l->l_phys->l_hdr.lh_prefix <<= 1;
+ l->l_phys->l_hdr.lh_prefix_len++;
+ nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
+ nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
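+	/*
+	 * E.g. a leaf with prefix 101 (length 3) splits into prefix
+	 * 1010 (length 4; those entries stay in l) and prefix 1011
+	 * (those entries move to nl).
+	 */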
+
+ /* break existing hash chains */
+ zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+
+ /*
+ * Transfer entries whose hash bit 'bit' is set to nl; rehash
+ * the remaining entries
+ *
+ * NB: We could find entries via the hashtable instead. That
+ * would be O(hashents+numents) rather than O(numblks+numents),
+ * but this accesses memory more sequentially, and when we're
+ * called, the block is usually pretty full.
+ */
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
+ if (le->le_type != ZAP_CHUNK_ENTRY)
+ continue;
+
+ if (le->le_hash & (1ULL << bit))
+ zap_leaf_transfer_entry(l, i, nl);
+ else
+ zap_leaf_rehash_entry(l, i);
+ }
+}
+
+void
+zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
+{
+ int i, n;
+
+ n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
+ l->l_phys->l_hdr.lh_prefix_len;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_leafs_with_2n_pointers[n]++;
+
+
+ n = l->l_phys->l_hdr.lh_nentries/5;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_with_n5_entries[n]++;
+
+ n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
+ l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+ (1<<FZAP_BLOCK_SHIFT(zap));
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_n_tenths_full[n]++;
+
+ for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
+ int nentries = 0;
+ int chunk = l->l_phys->l_hash[i];
+
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(l, chunk);
+
+ n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) +
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length *
+ le->le_int_size);
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_entries_using_n_chunks[n]++;
+
+ chunk = le->le_next;
+ nentries++;
+ }
+
+ n = nentries;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_buckets_with_n_entries[n]++;
+ }
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
new file mode 100644
index 000000000000..9b7e23c8fb64
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
@@ -0,0 +1,855 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/refcount.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/avl.h>
+
+
+static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+
+
+static void
+mzap_byteswap(mzap_phys_t *buf, size_t size)
+{
+ int i, max;
+ buf->mz_block_type = BSWAP_64(buf->mz_block_type);
+ buf->mz_salt = BSWAP_64(buf->mz_salt);
+ max = (size / MZAP_ENT_LEN) - 1;
+ for (i = 0; i < max; i++) {
+ buf->mz_chunk[i].mze_value =
+ BSWAP_64(buf->mz_chunk[i].mze_value);
+ buf->mz_chunk[i].mze_cd =
+ BSWAP_32(buf->mz_chunk[i].mze_cd);
+ }
+}
+
+void
+zap_byteswap(void *buf, size_t size)
+{
+ uint64_t block_type;
+
+ block_type = *(uint64_t *)buf;
+
+ if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
+ /* ASSERT(magic == ZAP_LEAF_MAGIC); */
+ mzap_byteswap(buf, size);
+ } else {
+ fzap_byteswap(buf, size);
+ }
+}
+
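+/*
+ * Microzap entries are kept in an AVL tree ordered by (hash, cd), so
+ * entries that collide on the same hash sit next to each other in
+ * collision-differentiator order.
+ */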
+static int
+mze_compare(const void *arg1, const void *arg2)
+{
+ const mzap_ent_t *mze1 = arg1;
+ const mzap_ent_t *mze2 = arg2;
+
+ if (mze1->mze_hash > mze2->mze_hash)
+ return (+1);
+ if (mze1->mze_hash < mze2->mze_hash)
+ return (-1);
+ if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
+ return (+1);
+ if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
+ return (-1);
+ return (0);
+}
+
+static void
+mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
+{
+ mzap_ent_t *mze;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT(mzep->mze_cd < ZAP_MAXCD);
+ ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash);
+
+ mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
+ mze->mze_chunkid = chunkid;
+ mze->mze_hash = hash;
+ mze->mze_phys = *mzep;
+ avl_add(&zap->zap_m.zap_avl, mze);
+}
+
+static mzap_ent_t *
+mze_find(zap_t *zap, const char *name, uint64_t hash)
+{
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT3U(zap_hash(zap, name), ==, hash);
+
+ if (strlen(name) >= sizeof (mze_tofind.mze_phys.mze_name))
+ return (NULL);
+
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_phys.mze_cd = 0;
+
+ mze = avl_find(avl, &mze_tofind, &idx);
+ if (mze == NULL)
+ mze = avl_nearest(avl, idx, AVL_AFTER);
+ for (; mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ if (strcmp(name, mze->mze_phys.mze_name) == 0)
+ return (mze);
+ }
+ return (NULL);
+}
+
+static uint32_t
+mze_find_unused_cd(zap_t *zap, uint64_t hash)
+{
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+ uint32_t cd;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_phys.mze_cd = 0;
+
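+	/*
+	 * Walk the entries for this hash in cd order and return the
+	 * first gap; e.g. if cds 0, 1 and 3 are already in use, the
+	 * loop below stops at (and returns) cd 2.
+	 */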
+ cd = 0;
+ for (mze = avl_find(avl, &mze_tofind, &idx);
+ mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ if (mze->mze_phys.mze_cd != cd)
+ break;
+ cd++;
+ }
+
+ return (cd);
+}
+
+static void
+mze_remove(zap_t *zap, mzap_ent_t *mze)
+{
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ avl_remove(&zap->zap_m.zap_avl, mze);
+ kmem_free(mze, sizeof (mzap_ent_t));
+}
+
+static void
+mze_destroy(zap_t *zap)
+{
+ mzap_ent_t *mze;
+ void *avlcookie = NULL;
+
+ while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
+ kmem_free(mze, sizeof (mzap_ent_t));
+ avl_destroy(&zap->zap_m.zap_avl);
+}
+
+static zap_t *
+mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
+{
+ zap_t *winner;
+ zap_t *zap;
+ int i;
+
+ ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
+
+ zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
+ rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, 0);
+ rw_enter(&zap->zap_rwlock, RW_WRITER);
+ zap->zap_objset = os;
+ zap->zap_object = obj;
+ zap->zap_dbuf = db;
+
+ if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) {
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL,
+ MUTEX_DEFAULT, 0);
+ zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
+ } else {
+ zap->zap_ismicro = TRUE;
+ }
+
+ /*
+ * Make sure that zap_ismicro is set before we let others see
+ * it, because zap_lockdir() checks zap_ismicro without the lock
+ * held.
+ */
+ winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);
+
+ if (winner != NULL) {
+ if (!zap->zap_ismicro)
+ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+ kmem_free(zap, sizeof (zap_t));
+ return (winner);
+ }
+
+ if (zap->zap_ismicro) {
+ zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
+ zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
+ avl_create(&zap->zap_m.zap_avl, mze_compare,
+ sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
+
+ for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze =
+ &zap->zap_m.zap_phys->mz_chunk[i];
+ if (mze->mze_name[0]) {
+ zap->zap_m.zap_num_entries++;
+ mze_insert(zap, i,
+ zap_hash(zap, mze->mze_name), mze);
+ }
+ }
+ } else {
+ zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
+
+ ASSERT3U(sizeof (struct zap_leaf_header), ==,
+ 2*ZAP_LEAF_CHUNKSIZE);
+
+ /*
+ * The embedded pointer table should not overlap the
+ * other members.
+ */
+ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
+ &zap->zap_f.zap_phys->zap_salt);
+
+ /*
+ * The embedded pointer table should end at the end of
+ * the block
+ */
+ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
+ 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
+ (uintptr_t)zap->zap_f.zap_phys, ==,
+ zap->zap_dbuf->db_size);
+ }
+ rw_exit(&zap->zap_rwlock);
+ return (zap);
+}
+
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+ krw_t lti, int fatreader, zap_t **zapp)
+{
+ zap_t *zap;
+ dmu_buf_t *db;
+ krw_t lt;
+ int err;
+
+ *zapp = NULL;
+
+ err = dmu_buf_hold(os, obj, 0, NULL, &db);
+ if (err)
+ return (err);
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+ }
+#endif
+
+ zap = dmu_buf_get_user(db);
+ if (zap == NULL)
+ zap = mzap_open(os, obj, db);
+
+ /*
+ * We're checking zap_ismicro without the lock held, in order to
+ * tell what type of lock we want. Once we have some sort of
+ * lock, see if it really is the right type. In practice this
+ * can only be different if it was upgraded from micro to fat,
+ * and micro wanted WRITER but fat only needs READER.
+ */
+ lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
+ rw_enter(&zap->zap_rwlock, lt);
+ if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
+ /* it was upgraded, now we only need reader */
+ ASSERT(lt == RW_WRITER);
+ ASSERT(RW_READER ==
+ (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
+ rw_downgrade(&zap->zap_rwlock);
+ lt = RW_READER;
+ }
+
+ zap->zap_objset = os;
+
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3P(zap->zap_dbuf, ==, db);
+
+ ASSERT(!zap->zap_ismicro ||
+ zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
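+	/*
+	 * If the microzap is full, either grow its block by
+	 * SPA_MINBLOCKSIZE or, once that would exceed MZAP_MAX_BLKSZ,
+	 * upgrade it to a fat zap.
+	 */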
+ if (zap->zap_ismicro && tx &&
+ zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
+ uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
+ if (newsz > MZAP_MAX_BLKSZ) {
+ dprintf("upgrading obj %llu: num_entries=%u\n",
+ obj, zap->zap_m.zap_num_entries);
+ mzap_upgrade(zap, tx);
+ *zapp = zap;
+ return (0);
+ }
+ err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
+ ASSERT3U(err, ==, 0);
+ zap->zap_m.zap_num_chunks =
+ db->db_size / MZAP_ENT_LEN - 1;
+ }
+
+ *zapp = zap;
+ return (0);
+}
+
+void
+zap_unlockdir(zap_t *zap)
+{
+ rw_exit(&zap->zap_rwlock);
+ dmu_buf_rele(zap->zap_dbuf, NULL);
+}
+
+static void
+mzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+ mzap_phys_t *mzp;
+ int i, sz, nchunks, err;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ sz = zap->zap_dbuf->db_size;
+ mzp = kmem_alloc(sz, KM_SLEEP);
+ bcopy(zap->zap_dbuf->db_data, mzp, sz);
+ nchunks = zap->zap_m.zap_num_chunks;
+
+ err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+ 1ULL << fzap_default_block_shift, 0, tx);
+ ASSERT(err == 0);
+
+ dprintf("upgrading obj=%llu with %u chunks\n",
+ zap->zap_object, nchunks);
+ mze_destroy(zap);
+
+ fzap_upgrade(zap, tx);
+
+ for (i = 0; i < nchunks; i++) {
+ int err;
+ mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
+ if (mze->mze_name[0] == 0)
+ continue;
+ dprintf("adding %s=%llu\n",
+ mze->mze_name, mze->mze_value);
+ err = fzap_add_cd(zap,
+ mze->mze_name, 8, 1, &mze->mze_value,
+ mze->mze_cd, tx);
+ ASSERT3U(err, ==, 0);
+ }
+ kmem_free(mzp, sz);
+}
+
+uint64_t
+zap_hash(zap_t *zap, const char *name)
+{
+ const uint8_t *cp;
+ uint8_t c;
+ uint64_t crc = zap->zap_salt;
+
+ ASSERT(crc != 0);
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
+
+ /*
+ * Only use 28 bits, since we need 4 bits in the cookie for the
+	 * collision differentiator.  We MUST use the high bits, since
+	 * those are the ones that we first pay attention to when
+	 * choosing the bucket.
+ */
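+	/*
+	 * With ZAP_HASHBITS at 28, the mask below clears the low 36
+	 * bits, keeping only the top 28 bits of the CRC as the hash.
+	 */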
+ crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+
+ return (crc);
+}
+
+
+static void
+mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ mzap_phys_t *zp;
+
+ VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+ }
+#endif
+
+ dmu_buf_will_dirty(db, tx);
+ zp = db->db_data;
+ zp->mz_block_type = ZBT_MICRO;
+ zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
+ ASSERT(zp->mz_salt != 0);
+ dmu_buf_rele(db, FTAG);
+}
+
+int
+zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ int err;
+
+ err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
+ if (err != 0)
+ return (err);
+ mzap_create_impl(os, obj, tx);
+ return (0);
+}
+
+uint64_t
+zap_create(objset_t *os, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+
+ mzap_create_impl(os, obj, tx);
+ return (obj);
+}
+
+int
+zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
+{
+ /*
+ * dmu_object_free will free the object number and free the
+ * data. Freeing the data will cause our pageout function to be
+ * called, which will destroy our data (zap_leaf_t's and zap_t).
+ */
+
+ return (dmu_object_free(os, zapobj, tx));
+}
+
+_NOTE(ARGSUSED(0))
+void
+zap_evict(dmu_buf_t *db, void *vzap)
+{
+ zap_t *zap = vzap;
+
+ rw_destroy(&zap->zap_rwlock);
+
+ if (zap->zap_ismicro)
+ mze_destroy(zap);
+ else
+ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+
+ kmem_free(zap, sizeof (zap_t));
+}
+
+int
+zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_count(zap, count);
+ } else {
+ *count = zap->zap_m.zap_num_entries;
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+
+int
+zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_lookup(zap, name,
+ integer_size, num_integers, buf);
+ } else {
+ mze = mze_find(zap, name, zap_hash(zap, name));
+ if (mze == NULL) {
+ err = ENOENT;
+ } else {
+ if (num_integers < 1)
+ err = EOVERFLOW;
+ else if (integer_size != 8)
+ err = EINVAL;
+ else
+ *(uint64_t *)buf = mze->mze_phys.mze_value;
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
+zap_length(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_length(zap, name, integer_size, num_integers);
+ } else {
+ mze = mze_find(zap, name, zap_hash(zap, name));
+ if (mze == NULL) {
+ err = ENOENT;
+ } else {
+ if (integer_size)
+ *integer_size = 8;
+ if (num_integers)
+ *num_integers = 1;
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+static void
+mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value)
+{
+ int i;
+ int start = zap->zap_m.zap_alloc_next;
+ uint32_t cd;
+
+ dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+#ifdef ZFS_DEBUG
+ for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+ ASSERT(strcmp(name, mze->mze_name) != 0);
+ }
+#endif
+
+ cd = mze_find_unused_cd(zap, hash);
+ /* given the limited size of the microzap, this can't happen */
+ ASSERT(cd != ZAP_MAXCD);
+
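+	/*
+	 * Find a free chunk, scanning from zap_alloc_next and wrapping
+	 * around to chunk 0 if necessary.
+	 */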
+again:
+ for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+ if (mze->mze_name[0] == 0) {
+ mze->mze_value = value;
+ mze->mze_cd = cd;
+ (void) strcpy(mze->mze_name, name);
+ zap->zap_m.zap_num_entries++;
+ zap->zap_m.zap_alloc_next = i+1;
+ if (zap->zap_m.zap_alloc_next ==
+ zap->zap_m.zap_num_chunks)
+ zap->zap_m.zap_alloc_next = 0;
+ mze_insert(zap, i, hash, mze);
+ return;
+ }
+ }
+ if (start != 0) {
+ start = 0;
+ goto again;
+ }
+ ASSERT(!"out of entries!");
+}
+
+int
+zap_add(objset_t *os, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+ const uint64_t *intval = val;
+ uint64_t hash;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_add(zap, name, integer_size, num_integers, val, tx);
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(name) >= MZAP_NAME_LEN) {
+ dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+ zapobj, integer_size, num_integers, name);
+ mzap_upgrade(zap, tx);
+ err = fzap_add(zap, name, integer_size, num_integers, val, tx);
+ } else {
+ hash = zap_hash(zap, name);
+ mze = mze_find(zap, name, hash);
+ if (mze != NULL) {
+ err = EEXIST;
+ } else {
+ mzap_addent(zap, name, hash, *intval);
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
+zap_update(objset_t *os, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ mzap_ent_t *mze;
+ const uint64_t *intval = val;
+ uint64_t hash;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+ if (err)
+ return (err);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ if (!zap->zap_ismicro) {
+ err = fzap_update(zap, name,
+ integer_size, num_integers, val, tx);
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(name) >= MZAP_NAME_LEN) {
+ dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+ zapobj, integer_size, num_integers, name);
+ mzap_upgrade(zap, tx);
+ err = fzap_update(zap, name,
+ integer_size, num_integers, val, tx);
+ } else {
+ hash = zap_hash(zap, name);
+ mze = mze_find(zap, name, hash);
+ if (mze != NULL) {
+ mze->mze_phys.mze_value = *intval;
+ zap->zap_m.zap_phys->mz_chunk
+ [mze->mze_chunkid].mze_value = *intval;
+ } else {
+ mzap_addent(zap, name, hash, *intval);
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
+zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_remove(zap, name, tx);
+ } else {
+ mze = mze_find(zap, name, zap_hash(zap, name));
+ if (mze == NULL) {
+ dprintf("fail: %s\n", name);
+ err = ENOENT;
+ } else {
+ dprintf("success: %s\n", name);
+ zap->zap_m.zap_num_entries--;
+ bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+ sizeof (mzap_ent_phys_t));
+ mze_remove(zap, mze);
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+/*
+ * We want to keep the high 32 bits of the cursor zero if we can, so
+ * that 32-bit programs can access this. So use a small hash value so
+ * we can fit 4 bits of cd into the 32-bit cursor.
+ *
+ * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
+ */
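+/*
+ * For example (hypothetical values): a cursor at hash
+ * 0xABCDEF9000000000 with cd 3 serializes to
+ * (0xABCDEF9000000000 >> 36) | (3 << 28) == 0x3ABCDEF9, and
+ * zap_cursor_init_serialized() recovers the same hash and cd from it.
+ */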
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+ uint64_t serialized)
+{
+ zc->zc_objset = os;
+ zc->zc_zap = NULL;
+ zc->zc_leaf = NULL;
+ zc->zc_zapobj = zapobj;
+ if (serialized == -1ULL) {
+ zc->zc_hash = -1ULL;
+ zc->zc_cd = 0;
+ } else {
+ zc->zc_hash = serialized << (64-ZAP_HASHBITS);
+ zc->zc_cd = serialized >> ZAP_HASHBITS;
+ if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
+ zc->zc_cd = 0;
+ }
+}
+
+void
+zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+ zap_cursor_init_serialized(zc, os, zapobj, 0);
+}
+
+void
+zap_cursor_fini(zap_cursor_t *zc)
+{
+ if (zc->zc_zap) {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ zap_unlockdir(zc->zc_zap);
+ zc->zc_zap = NULL;
+ }
+ if (zc->zc_leaf) {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
+ zc->zc_objset = NULL;
+}
+
+uint64_t
+zap_cursor_serialize(zap_cursor_t *zc)
+{
+ if (zc->zc_hash == -1ULL)
+ return (-1ULL);
+ ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
+ ASSERT(zc->zc_cd < ZAP_MAXCD);
+ return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |
+ ((uint64_t)zc->zc_cd << ZAP_HASHBITS));
+}
+
+int
+zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err;
+ avl_index_t idx;
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+
+ if (zc->zc_hash == -1ULL)
+ return (ENOENT);
+
+ if (zc->zc_zap == NULL) {
+ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+ RW_READER, TRUE, &zc->zc_zap);
+ if (err)
+ return (err);
+ } else {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ }
+ if (!zc->zc_zap->zap_ismicro) {
+ err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
+ } else {
+ err = ENOENT;
+
+ mze_tofind.mze_hash = zc->zc_hash;
+ mze_tofind.mze_phys.mze_cd = zc->zc_cd;
+
+ mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
+ ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys,
+ &zc->zc_zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+ sizeof (mze->mze_phys)));
+ if (mze == NULL) {
+ mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
+ idx, AVL_AFTER);
+ }
+ if (mze) {
+ za->za_integer_length = 8;
+ za->za_num_integers = 1;
+ za->za_first_integer = mze->mze_phys.mze_value;
+ (void) strcpy(za->za_name, mze->mze_phys.mze_name);
+ zc->zc_hash = mze->mze_hash;
+ zc->zc_cd = mze->mze_phys.mze_cd;
+ err = 0;
+ } else {
+ zc->zc_hash = -1ULL;
+ }
+ }
+ rw_exit(&zc->zc_zap->zap_rwlock);
+ return (err);
+}
+
+void
+zap_cursor_advance(zap_cursor_t *zc)
+{
+ if (zc->zc_hash == -1ULL)
+ return;
+ zc->zc_cd++;
+ if (zc->zc_cd >= ZAP_MAXCD) {
+ zc->zc_cd = 0;
+ zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
+ if (zc->zc_hash == 0) /* EOF */
+ zc->zc_hash = -1ULL;
+ }
+}
+
+int
+zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
+{
+ int err;
+ zap_t *zap;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+
+ bzero(zs, sizeof (zap_stats_t));
+
+ if (zap->zap_ismicro) {
+ zs->zs_blocksize = zap->zap_dbuf->db_size;
+ zs->zs_num_entries = zap->zap_m.zap_num_entries;
+ zs->zs_num_blocks = 1;
+ } else {
+ fzap_get_stats(zap, zs);
+ }
+ zap_unlockdir(zap);
+ return (0);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
new file mode 100644
index 000000000000..09881909b804
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
@@ -0,0 +1,28 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+name="zfs" parent="pseudo";
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
new file mode 100644
index 000000000000..030424b84317
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
@@ -0,0 +1,1607 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <acl/acl_common.h>
+
+#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
+#define DENY ACE_ACCESS_DENIED_ACE_TYPE
+
+#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define WRITE_MASK (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS| \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
+
+#define SECURE_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define OGE_PAD 6 /* traditional owner/group/everyone ACES */
+
+static int zfs_ace_can_use(znode_t *zp, ace_t *);
+
+static zfs_acl_t *
+zfs_acl_alloc(int slots)
+{
+ zfs_acl_t *aclp;
+
+ aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+ if (slots != 0) {
+ aclp->z_acl = kmem_alloc(ZFS_ACL_SIZE(slots), KM_SLEEP);
+ aclp->z_acl_count = 0;
+ aclp->z_state = ACL_DATA_ALLOCED;
+ } else {
+ aclp->z_state = 0;
+ }
+ aclp->z_slots = slots;
+ return (aclp);
+}
+
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+ if (aclp->z_state == ACL_DATA_ALLOCED) {
+ kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots));
+ }
+ kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+static uint32_t
+zfs_v4_to_unix(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ /*
+ * This is used for mapping v4 permissions into permissions
+ * that can be passed to secpolicy_vnode_access()
+ */
+ if (access_mask & (ACE_READ_DATA | ACE_LIST_DIRECTORY |
+ ACE_READ_ATTRIBUTES | ACE_READ_ACL))
+ new_mask |= S_IROTH;
+ if (access_mask & (ACE_WRITE_DATA | ACE_APPEND_DATA |
+ ACE_WRITE_ATTRIBUTES | ACE_ADD_FILE | ACE_WRITE_NAMED_ATTRS))
+ new_mask |= S_IWOTH;
+ if (access_mask & (ACE_EXECUTE | ACE_READ_NAMED_ATTRS))
+ new_mask |= S_IXOTH;
+
+ return (new_mask);
+}
+
+/*
+ * Convert unix access mask to v4 access mask
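+ * (e.g. zfs_unix_to_v4(07) returns ACE_READ_DATA|ACE_WRITE_DATA|ACE_EXECUTE).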
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ if (access_mask & 01)
+ new_mask |= (ACE_EXECUTE);
+ if (access_mask & 02) {
+ new_mask |= (ACE_WRITE_DATA);
+	}
+	if (access_mask & 04) {
+ new_mask |= ACE_READ_DATA;
+ }
+ return (new_mask);
+}
+
+static void
+zfs_set_ace(ace_t *zacep, uint32_t access_mask, int access_type,
+ uid_t uid, int entry_type)
+{
+ zacep->a_access_mask = access_mask;
+ zacep->a_type = access_type;
+ zacep->a_who = uid;
+ zacep->a_flags = entry_type;
+}
+
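+/*
+ * Compute the POSIX mode bits implied by an ACL: walking the ACEs in
+ * order, the first ACE that mentions a given r/w/x bit for a class
+ * decides it (an ALLOW sets the bit, a DENY leaves it clear), and the
+ * 'seen' mask keeps later ACEs from overriding that decision.  owner@
+ * and group@ entries affect only their own class, while everyone@
+ * entries can decide any bit not already seen.
+ */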
+static uint64_t
+zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
+{
+ int i;
+ int entry_type;
+ mode_t mode = (zp->z_phys->zp_mode &
+ (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+ mode_t seen = 0;
+ ace_t *acep;
+
+ for (i = 0, acep = aclp->z_acl;
+ i != aclp->z_acl_count; i++, acep++) {
+ entry_type = (acep->a_flags & ACE_TYPE_FLAGS);
+ if (entry_type == ACE_OWNER) {
+ if ((acep->a_access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRUSR))) {
+ seen |= S_IRUSR;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWUSR))) {
+ seen |= S_IWUSR;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if ((acep->a_access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXUSR))) {
+ seen |= S_IXUSR;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ } else if (entry_type == OWNING_GROUP) {
+ if ((acep->a_access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRGRP))) {
+ seen |= S_IRGRP;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWGRP))) {
+ seen |= S_IWGRP;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if ((acep->a_access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXGRP))) {
+ seen |= S_IXGRP;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ } else if (entry_type == ACE_EVERYONE) {
+ if ((acep->a_access_mask & ACE_READ_DATA)) {
+ if (!(seen & S_IRUSR)) {
+ seen |= S_IRUSR;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if (!(seen & S_IRGRP)) {
+ seen |= S_IRGRP;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if (!(seen & S_IROTH)) {
+ seen |= S_IROTH;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IROTH;
+ }
+ }
+ }
+ if ((acep->a_access_mask & ACE_WRITE_DATA)) {
+ if (!(seen & S_IWUSR)) {
+ seen |= S_IWUSR;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if (!(seen & S_IWGRP)) {
+ seen |= S_IWGRP;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if (!(seen & S_IWOTH)) {
+ seen |= S_IWOTH;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IWOTH;
+ }
+ }
+ }
+ if ((acep->a_access_mask & ACE_EXECUTE)) {
+ if (!(seen & S_IXUSR)) {
+ seen |= S_IXUSR;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ if (!(seen & S_IXGRP)) {
+ seen |= S_IXGRP;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ if (!(seen & S_IXOTH)) {
+ seen |= S_IXOTH;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IXOTH;
+ }
+ }
+ }
+ }
+ }
+ return (mode);
+}
+
+static zfs_acl_t *
+zfs_acl_node_read_internal(znode_t *zp)
+{
+ zfs_acl_t *aclp;
+
+ aclp = zfs_acl_alloc(0);
+ aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
+ aclp->z_acl = &zp->z_phys->zp_acl.z_ace_data[0];
+
+ return (aclp);
+}
+
+/*
+ * Read an external acl object.
+ */
+static int
+zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp)
+{
+ uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
+ zfs_acl_t *aclp;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
+ *aclpp = zfs_acl_node_read_internal(zp);
+ return (0);
+ }
+
+ aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count);
+
+ error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
+ ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl);
+ if (error != 0) {
+ zfs_acl_free(aclp);
+ return (error);
+ }
+
+ aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
+
+ *aclpp = aclp;
+ return (0);
+}
+
+static boolean_t
+zfs_acl_valid(znode_t *zp, ace_t *uace, int aclcnt, int *inherit)
+{
+ ace_t *acep;
+ int i;
+
+ *inherit = 0;
+
+ if (aclcnt > MAX_ACL_ENTRIES || aclcnt <= 0) {
+ return (B_FALSE);
+ }
+
+ for (i = 0, acep = uace; i != aclcnt; i++, acep++) {
+
+ /*
+ * first check type of entry
+ */
+
+ switch (acep->a_flags & ACE_TYPE_FLAGS) {
+ case ACE_OWNER:
+ acep->a_who = -1;
+ break;
+ case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+ case ACE_IDENTIFIER_GROUP:
+ if (acep->a_flags & ACE_GROUP) {
+ acep->a_who = -1;
+ }
+ break;
+ case ACE_EVERYONE:
+ acep->a_who = -1;
+ break;
+ }
+
+ /*
+ * next check inheritance level flags
+ */
+
+ if (acep->a_type != ALLOW && acep->a_type != DENY)
+ return (B_FALSE);
+
+ /*
+ * Only directories should have inheritance flags.
+ */
+ if (ZTOV(zp)->v_type != VDIR && (acep->a_flags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE|
+ ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) {
+ return (B_FALSE);
+ }
+
+ if (acep->a_flags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))
+ *inherit = 1;
+
+ if (acep->a_flags &
+ (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ if ((acep->a_flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+ return (B_FALSE);
+ }
+ }
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Common code for setting ACLs.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
+ * already checked the acl and knows whether to inherit.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, dmu_tx_t *tx, int *ihp)
+{
+ int inherit = 0;
+ int error;
+ znode_phys_t *zphys = zp->z_phys;
+ zfs_znode_acl_t *zacl = &zphys->zp_acl;
+ uint32_t acl_phys_size = ZFS_ACL_SIZE(aclp->z_acl_count);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t aoid = zphys->zp_acl.z_acl_extern_obj;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if (ihp)
+ inherit = *ihp; /* already determined by caller */
+ else if (!zfs_acl_valid(zp, aclp->z_acl,
+ aclp->z_acl_count, &inherit)) {
+ return (EINVAL);
+ }
+
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ /*
+ * Will ACL fit internally?
+ */
+ if (aclp->z_acl_count > ACE_SLOT_CNT) {
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ DMU_OT_ACL, acl_phys_size, DMU_OT_NONE, 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
+ acl_phys_size, 0, tx);
+ }
+ zphys->zp_acl.z_acl_extern_obj = aoid;
+ zphys->zp_acl.z_acl_count = aclp->z_acl_count;
+ dmu_write(zfsvfs->z_os, aoid, 0,
+ acl_phys_size, aclp->z_acl, tx);
+ } else {
+ /*
+		 * Are we migrating back to an embedded ACL?
+ */
+ if (zphys->zp_acl.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ zp->z_phys->zp_acl.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ zphys->zp_acl.z_acl_extern_obj = 0;
+ }
+ bcopy(aclp->z_acl, zacl->z_ace_data,
+ aclp->z_acl_count * sizeof (ace_t));
+ zacl->z_acl_count = aclp->z_acl_count;
+ }
+
+ zp->z_phys->zp_flags &= ~(ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE);
+ if (inherit) {
+ zp->z_phys->zp_flags |= ZFS_INHERIT_ACE;
+ } else if (ace_trivial(zacl->z_ace_data, zacl->z_acl_count) == 0) {
+ zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
+ }
+
+ zphys->zp_mode = zfs_mode_compute(zp, aclp);
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+
+ return (0);
+}
+
+/*
+ * Create space for slots_needed ACEs to be appended
+ * to aclp.
+ */
+static void
+zfs_acl_append(zfs_acl_t *aclp, int slots_needed)
+{
+ ace_t *newacep;
+ ace_t *oldaclp;
+ int slot_cnt;
+ int slots_left = aclp->z_slots - aclp->z_acl_count;
+
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ ASSERT(aclp->z_slots >= aclp->z_acl_count);
+ if (slots_left < slots_needed || aclp->z_state != ACL_DATA_ALLOCED) {
+ slot_cnt = aclp->z_slots + 1 + (slots_needed - slots_left);
+ newacep = kmem_alloc(ZFS_ACL_SIZE(slot_cnt), KM_SLEEP);
+ bcopy(aclp->z_acl, newacep,
+ ZFS_ACL_SIZE(aclp->z_acl_count));
+ oldaclp = aclp->z_acl;
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ kmem_free(oldaclp, ZFS_ACL_SIZE(aclp->z_slots));
+ aclp->z_acl = newacep;
+ aclp->z_slots = slot_cnt;
+ aclp->z_state = ACL_DATA_ALLOCED;
+ }
+}
+
+/*
+ * Remove "slot" ACE from aclp
+ */
+static void
+zfs_ace_remove(zfs_acl_t *aclp, int slot)
+{
+ if (aclp->z_acl_count > 1) {
+ (void) memmove(&aclp->z_acl[slot],
+ &aclp->z_acl[slot +1], sizeof (ace_t) *
+ (--aclp->z_acl_count - slot));
+ } else
+ aclp->z_acl_count--;
+}
+
+/*
+ * Update access mask for prepended ACE
+ *
+ * This applies the "groupmask" setting of the aclmode property.
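+ *
+ * For example, if the original ALLOW ACE grants ACE_WRITE_DATA but the
+ * new mode withholds write for that class, the prepended DENY ACE picks
+ * up ACE_WRITE_DATA; if the mode grants write, the bit is cleared.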
+ */
+static void
+zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner)
+{
+
+ int rmask, wmask, xmask;
+ int user_ace;
+
+ user_ace = (!(acep->a_flags &
+ (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP)));
+
+ if (user_ace && (acep->a_who == owner)) {
+ rmask = S_IRUSR;
+ wmask = S_IWUSR;
+ xmask = S_IXUSR;
+ } else {
+ rmask = S_IRGRP;
+ wmask = S_IWGRP;
+ xmask = S_IXGRP;
+ }
+
+ if (origacep->a_access_mask & ACE_READ_DATA) {
+ if (mode & rmask)
+ acep->a_access_mask &= ~ACE_READ_DATA;
+ else
+ acep->a_access_mask |= ACE_READ_DATA;
+ }
+
+ if (origacep->a_access_mask & ACE_WRITE_DATA) {
+ if (mode & wmask)
+ acep->a_access_mask &= ~ACE_WRITE_DATA;
+ else
+ acep->a_access_mask |= ACE_WRITE_DATA;
+ }
+
+ if (origacep->a_access_mask & ACE_APPEND_DATA) {
+ if (mode & wmask)
+ acep->a_access_mask &= ~ACE_APPEND_DATA;
+ else
+ acep->a_access_mask |= ACE_APPEND_DATA;
+ }
+
+ if (origacep->a_access_mask & ACE_EXECUTE) {
+ if (mode & xmask)
+ acep->a_access_mask &= ~ACE_EXECUTE;
+ else
+ acep->a_access_mask |= ACE_EXECUTE;
+ }
+}
+
+/*
+ * Apply mode to canonical six ACEs.
+ */
+static void
+zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode)
+{
+ int cnt;
+ ace_t *acep;
+
+ cnt = aclp->z_acl_count -1;
+ acep = aclp->z_acl;
+
+ /*
+ * Fixup final ACEs to match the mode
+ */
+
+ ASSERT(cnt >= 5);
+ adjust_ace_pair(&acep[cnt - 1], mode); /* everyone@ */
+ adjust_ace_pair(&acep[cnt - 3], (mode & 0070) >> 3); /* group@ */
+ adjust_ace_pair(&acep[cnt - 5], (mode & 0700) >> 6); /* owner@ */
+}
+
+
+static int
+zfs_acl_ace_match(ace_t *acep, int allow_deny, int type, int mask)
+{
+ return (acep->a_access_mask == mask && acep->a_type == allow_deny &&
+ ((acep->a_flags & ACE_TYPE_FLAGS) == type));
+}
+
+/*
+ * Can prepended ACE be reused?
+ */
+static int
+zfs_reuse_deny(ace_t *acep, int i)
+{
+ int okay_masks;
+
+ if (i < 1)
+ return (B_FALSE);
+
+ if (acep[i-1].a_type != DENY)
+ return (B_FALSE);
+
+ if (acep[i-1].a_flags != (acep[i].a_flags & ACE_IDENTIFIER_GROUP))
+ return (B_FALSE);
+
+ okay_masks = (acep[i].a_access_mask & OKAY_MASK_BITS);
+
+ if (acep[i-1].a_access_mask & ~okay_masks)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Create space to prepend an ACE
+ */
+static void
+zfs_acl_prepend(zfs_acl_t *aclp, int i)
+{
+ ace_t *oldaclp = NULL;
+ ace_t *to, *from;
+ int slots_left = aclp->z_slots - aclp->z_acl_count;
+ int oldslots;
+ int need_free = 0;
+
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ ASSERT(aclp->z_slots >= aclp->z_acl_count);
+
+ if (slots_left == 0 || aclp->z_state != ACL_DATA_ALLOCED) {
+
+ to = kmem_alloc(ZFS_ACL_SIZE(aclp->z_acl_count +
+ OGE_PAD), KM_SLEEP);
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ need_free++;
+ from = aclp->z_acl;
+ oldaclp = aclp->z_acl;
+ (void) memmove(to, from,
+ sizeof (ace_t) * aclp->z_acl_count);
+ aclp->z_state = ACL_DATA_ALLOCED;
+ } else {
+ from = aclp->z_acl;
+ to = aclp->z_acl;
+ }
+
+
+ (void) memmove(&to[i + 1], &from[i],
+ sizeof (ace_t) * (aclp->z_acl_count - i));
+
+ if (oldaclp) {
+ aclp->z_acl = to;
+ oldslots = aclp->z_slots;
+ aclp->z_slots = aclp->z_acl_count + OGE_PAD;
+ if (need_free)
+ kmem_free(oldaclp, ZFS_ACL_SIZE(oldslots));
+ }
+
+}
+
+/*
+ * Prepend deny ACE
+ */
+static void
+zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i,
+ mode_t mode)
+{
+ ace_t *acep;
+
+ zfs_acl_prepend(aclp, i);
+
+ acep = aclp->z_acl;
+ zfs_set_ace(&acep[i], 0, DENY, acep[i + 1].a_who,
+ (acep[i + 1].a_flags & ACE_TYPE_FLAGS));
+ zfs_acl_prepend_fixup(&acep[i], &acep[i+1], mode, zp->z_phys->zp_uid);
+ aclp->z_acl_count++;
+}
+
+/*
+ * Split an inherited ACE into an inherit_only ACE
+ * and the original ACE with the inheritance flags stripped off.
+ */
+static void
+zfs_acl_split_ace(zfs_acl_t *aclp, int i)
+{
+ ace_t *acep = aclp->z_acl;
+
+ zfs_acl_prepend(aclp, i);
+ acep = aclp->z_acl;
+ acep[i] = acep[i + 1];
+ acep[i].a_flags |= ACE_INHERIT_ONLY_ACE;
+ acep[i + 1].a_flags &= ~ALL_INHERIT;
+ aclp->z_acl_count++;
+}
+
+/*
+ * Are the ACEs starting at index i the canonical six ACEs?
+ */
+static int
+zfs_have_canonical_six(zfs_acl_t *aclp, int i)
+{
+ ace_t *acep = aclp->z_acl;
+
+ if ((zfs_acl_ace_match(&acep[i],
+ DENY, ACE_OWNER, 0) &&
+ zfs_acl_ace_match(&acep[i + 1], ALLOW, ACE_OWNER,
+ OWNER_ALLOW_MASK) && zfs_acl_ace_match(&acep[i + 2],
+ DENY, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 3],
+ ALLOW, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 4],
+ DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) &&
+ zfs_acl_ace_match(&acep[i + 5], ALLOW, ACE_EVERYONE,
+ EVERYONE_ALLOW_MASK))) {
+ return (1);
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Apply step 1g to group entries.
+ *
+ * We need to deal with the corner case where the group may have
+ * greater permissions than the owner. If so, limit the group's
+ * permissions based on the extra permissions the group has.
+ */
+static void
+zfs_fixup_group_entries(ace_t *acep, mode_t mode)
+{
+ mode_t extramode = (mode >> 3) & 07;
+ mode_t ownermode = (mode >> 6);
+
+ if (acep[0].a_flags & ACE_IDENTIFIER_GROUP) {
+
+ extramode &= ~ownermode;
+
+ if (extramode) {
+ if (extramode & 04) {
+ acep[0].a_access_mask &= ~ACE_READ_DATA;
+ acep[1].a_access_mask &= ~ACE_READ_DATA;
+ }
+ if (extramode & 02) {
+ acep[0].a_access_mask &=
+ ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ acep[1].a_access_mask &=
+ ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ }
+ if (extramode & 01) {
+ acep[0].a_access_mask &= ~ACE_EXECUTE;
+ acep[1].a_access_mask &= ~ACE_EXECUTE;
+ }
+ }
+ }
+}
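+
+/*
+ * Illustrative example (assumed mode value): with mode = 0470 the group has
+ * more permissions than the owner. Then extramode = (0470 >> 3) & 07 = 07,
+ * ownermode = 0470 >> 6 = 04, and extramode &= ~ownermode leaves 03, so the
+ * write/append and execute bits are cleared from both entries of the group
+ * deny/allow pair while read is left untouched.
+ */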
+
+/*
+ * Apply the chmod algorithm as described
+ * in PSARC/2002/240
+ */
+static int
+zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp,
+ dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ ace_t *acep;
+ int i;
+ int error;
+ int entry_type;
+ int reuse_deny;
+ int need_canonical_six = 1;
+ int inherit = 0;
+ int iflags;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+
+ i = 0;
+ while (i < aclp->z_acl_count) {
+ acep = aclp->z_acl;
+ entry_type = (acep[i].a_flags & ACE_TYPE_FLAGS);
+ iflags = (acep[i].a_flags & ALL_INHERIT);
+
+ if ((acep[i].a_type != ALLOW && acep[i].a_type != DENY) ||
+ (iflags & ACE_INHERIT_ONLY_ACE)) {
+ i++;
+ if (iflags)
+ inherit = 1;
+ continue;
+ }
+
+
+ if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) {
+ zfs_ace_remove(aclp, i);
+ continue;
+ }
+
+ /*
+ * Need to split ace into two?
+ */
+ if ((iflags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) &&
+ (!(iflags & ACE_INHERIT_ONLY_ACE))) {
+ zfs_acl_split_ace(aclp, i);
+ i++;
+ inherit = 1;
+ continue;
+ }
+
+ if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
+ (entry_type == OWNING_GROUP)) {
+ acep[i].a_access_mask &= ~OGE_CLEAR;
+ i++;
+ continue;
+
+ } else {
+ if (acep[i].a_type == ALLOW) {
+
+ /*
+ * Check the preceding ACE, if any, to see
+ * if we need to prepend a DENY ACE.
+ * This is only applicable when the acl_mode
+ * property == groupmask.
+ */
+ if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) {
+
+ reuse_deny = zfs_reuse_deny(acep, i);
+
+ if (reuse_deny == B_FALSE) {
+ zfs_acl_prepend_deny(zp, aclp,
+ i, mode);
+ i++;
+ acep = aclp->z_acl;
+ } else {
+ zfs_acl_prepend_fixup(
+ &acep[i - 1],
+ &acep[i], mode,
+ zp->z_phys->zp_uid);
+ }
+ zfs_fixup_group_entries(&acep[i - 1],
+ mode);
+ }
+ }
+ i++;
+ }
+ }
+
+ /*
+ * Check the last six ACEs, if we have at least six.
+ */
+
+ if (aclp->z_acl_count >= 6) {
+ i = aclp->z_acl_count - 6;
+
+ if (zfs_have_canonical_six(aclp, i)) {
+ need_canonical_six = 0;
+ }
+ }
+
+ if (need_canonical_six) {
+
+ zfs_acl_append(aclp, 6);
+ i = aclp->z_acl_count;
+ acep = aclp->z_acl;
+ zfs_set_ace(&acep[i++], 0, DENY, -1, ACE_OWNER);
+ zfs_set_ace(&acep[i++], OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER);
+ zfs_set_ace(&acep[i++], 0, DENY, -1, OWNING_GROUP);
+ zfs_set_ace(&acep[i++], 0, ALLOW, -1, OWNING_GROUP);
+ zfs_set_ace(&acep[i++], EVERYONE_DENY_MASK,
+ DENY, -1, ACE_EVERYONE);
+ zfs_set_ace(&acep[i++], EVERYONE_ALLOW_MASK,
+ ALLOW, -1, ACE_EVERYONE);
+ aclp->z_acl_count += 6;
+ }
+
+ zfs_acl_fixup_canonical_six(aclp, mode);
+
+ zp->z_phys->zp_mode = mode;
+ error = zfs_aclset_common(zp, aclp, tx, &inherit);
+ return (error);
+}
+
+
+int
+zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx)
+{
+ zfs_acl_t *aclp = NULL;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ mutex_enter(&zp->z_acl_lock);
+ error = zfs_acl_node_read(zp, &aclp);
+ if (error == 0)
+ error = zfs_acl_chmod(zp, mode, aclp, tx);
+ mutex_exit(&zp->z_acl_lock);
+ if (aclp)
+ zfs_acl_free(aclp);
+ return (error);
+}
+
+/*
+ * strip off write_owner and write_acl
+ */
+static void
+zfs_securemode_update(zfsvfs_t *zfsvfs, ace_t *acep)
+{
+ if ((zfsvfs->z_acl_inherit == ZFS_ACL_SECURE) &&
+ (acep->a_type == ALLOW))
+ acep->a_access_mask &= ~SECURE_CLEAR;
+}
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ ace_t *pacep;
+ ace_t *acep;
+ int ace_cnt = 0;
+ int pace_cnt;
+ int i, j;
+ zfs_acl_t *aclp = NULL;
+
+ i = j = 0;
+ pace_cnt = paclp->z_acl_count;
+ pacep = paclp->z_acl;
+ if (zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) {
+ for (i = 0; i != pace_cnt; i++) {
+
+ if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW &&
+ pacep[i].a_type == ALLOW)
+ continue;
+
+ if (zfs_ace_can_use(zp, &pacep[i])) {
+ ace_cnt++;
+ if (!(pacep[i].a_flags &
+ ACE_NO_PROPAGATE_INHERIT_ACE))
+ ace_cnt++;
+ }
+ }
+ }
+
+ aclp = zfs_acl_alloc(ace_cnt + OGE_PAD);
+ if (ace_cnt && zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) {
+ acep = aclp->z_acl;
+ pacep = paclp->z_acl;
+ for (i = 0; i != pace_cnt; i++) {
+
+ if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW &&
+ pacep[i].a_type == ALLOW)
+ continue;
+
+ if (zfs_ace_can_use(zp, &pacep[i])) {
+
+ /*
+ * Now create entry for inherited ace
+ */
+
+ acep[j] = pacep[i];
+
+ /*
+ * When AUDIT/ALARM a_types are supported
+ * they should be inherited here.
+ */
+
+ if ((pacep[i].a_flags &
+ ACE_NO_PROPAGATE_INHERIT_ACE) ||
+ (ZTOV(zp)->v_type != VDIR)) {
+ acep[j].a_flags &= ~ALL_INHERIT;
+ zfs_securemode_update(zfsvfs, &acep[j]);
+ j++;
+ continue;
+ }
+
+ ASSERT(ZTOV(zp)->v_type == VDIR);
+
+ /*
+ * If we are inheriting an ACE targeted for
+ * only files, then make sure inherit_only
+ * is on for future propagation.
+ */
+ if ((pacep[i].a_flags & (ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) !=
+ ACE_FILE_INHERIT_ACE) {
+ j++;
+ acep[j] = acep[j-1];
+ acep[j-1].a_flags |=
+ ACE_INHERIT_ONLY_ACE;
+ acep[j].a_flags &= ~ALL_INHERIT;
+ } else {
+ acep[j].a_flags |= ACE_INHERIT_ONLY_ACE;
+ }
+ zfs_securemode_update(zfsvfs, &acep[j]);
+ j++;
+ }
+ }
+ }
+ aclp->z_acl_count = j;
+ ASSERT(aclp->z_slots >= aclp->z_acl_count);
+
+ return (aclp);
+}
+
+/*
+ * Create a file system object's initial permissions,
+ * including inheritable ACEs.
+ */
+void
+zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
+ vattr_t *vap, dmu_tx_t *tx, cred_t *cr)
+{
+ uint64_t mode;
+ uid_t uid;
+ gid_t gid;
+ int error;
+ int pull_down;
+ zfs_acl_t *aclp, *paclp;
+
+ mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+ /*
+ * Determine uid and gid.
+ */
+ if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
+ ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
+ uid = vap->va_uid;
+ gid = vap->va_gid;
+ } else {
+ uid = crgetuid(cr);
+ if ((vap->va_mask & AT_GID) &&
+ ((vap->va_gid == parent->z_phys->zp_gid) ||
+ groupmember(vap->va_gid, cr) ||
+ secpolicy_vnode_create_gid(cr) == 0))
+ gid = vap->va_gid;
+ else
+#ifdef __FreeBSD__
+ gid = parent->z_phys->zp_gid;
+#else
+ gid = (parent->z_phys->zp_mode & S_ISGID) ?
+ parent->z_phys->zp_gid : crgetgid(cr);
+#endif
+ }
+
+ /*
+ * If we're creating a directory, and the parent directory has the
+ * set-GID bit set, set it on the new directory.
+ * Otherwise, if the user is neither privileged nor a member of the
+ * file's new group, clear the file's set-GID bit.
+ */
+
+ if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR))
+ mode |= S_ISGID;
+ else {
+ if ((mode & S_ISGID) &&
+ secpolicy_vnode_setids_setgids(cr, gid) != 0)
+ mode &= ~S_ISGID;
+ }
+
+ zp->z_phys->zp_uid = uid;
+ zp->z_phys->zp_gid = gid;
+ zp->z_phys->zp_mode = mode;
+
+ mutex_enter(&parent->z_lock);
+ pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE);
+ if (pull_down) {
+ mutex_enter(&parent->z_acl_lock);
+ VERIFY(0 == zfs_acl_node_read(parent, &paclp));
+ mutex_exit(&parent->z_acl_lock);
+ aclp = zfs_acl_inherit(zp, paclp);
+ zfs_acl_free(paclp);
+ } else {
+ aclp = zfs_acl_alloc(6);
+ }
+ mutex_exit(&parent->z_lock);
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&zp->z_acl_lock);
+ error = zfs_acl_chmod(zp, mode, aclp, tx);
+ mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT3U(error, ==, 0);
+ zfs_acl_free(aclp);
+}
+
+/*
+ * Should ACE be inherited?
+ */
+static int
+zfs_ace_can_use(znode_t *zp, ace_t *acep)
+{
+ int vtype = ZTOV(zp)->v_type;
+
+ int iflags = (acep->a_flags & 0xf);
+
+ if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+ return (1);
+ else if (iflags & ACE_FILE_INHERIT_ACE)
+ return (!((vtype == VDIR) &&
+ (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
+ return (0);
+}
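+
+/*
+ * For illustration (assumed flag combinations): on a directory an ACE with
+ * ACE_DIRECTORY_INHERIT_ACE is usable; an ACE with only ACE_FILE_INHERIT_ACE
+ * is also usable unless ACE_NO_PROPAGATE_INHERIT_ACE is set as well. On a
+ * plain file only ACE_FILE_INHERIT_ACE matters.
+ */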
+
+#ifdef TODO
+/*
+ * Retrieve a file's ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ int error;
+
+ if (error = zfs_zaccess(zp, ACE_READ_ACL, cr)) {
+ /*
+ * If owner of file then allow reading of the
+ * ACL.
+ */
+ if (crgetuid(cr) != zp->z_phys->zp_uid)
+ return (error);
+ }
+
+ if (mask == 0)
+ return (ENOSYS);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ error = zfs_acl_node_read(zp, &aclp);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+
+ if (mask & VSA_ACECNT) {
+ vsecp->vsa_aclcnt = aclp->z_acl_count;
+ }
+
+ if (mask & VSA_ACE) {
+ vsecp->vsa_aclentp = kmem_alloc(aclp->z_acl_count *
+ sizeof (ace_t), KM_SLEEP);
+ bcopy(aclp->z_acl, vsecp->vsa_aclentp,
+ aclp->z_acl_count * sizeof (ace_t));
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ zfs_acl_free(aclp);
+
+ return (0);
+}
+#endif /* TODO */
+
+#ifdef TODO
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ ace_t *acep = vsecp->vsa_aclentp;
+ int aclcnt = vsecp->vsa_aclcnt;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ dmu_tx_t *tx;
+ int error;
+ int inherit;
+ zfs_acl_t *aclp;
+
+ if (mask == 0)
+ return (EINVAL);
+
+ if (!zfs_acl_valid(zp, acep, aclcnt, &inherit))
+ return (EINVAL);
+top:
+ error = zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr);
+ if (error == EACCES || error == ACCESS_UNDETERMINED) {
+ if ((error = secpolicy_vnode_setdac(cr,
+ zp->z_phys->zp_uid)) != 0) {
+ return (error);
+ }
+ } else if (error) {
+ return (error == EROFS ? error : EPERM);
+ }
+
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&zp->z_acl_lock);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+
+ if (zp->z_phys->zp_acl.z_acl_extern_obj) {
+ dmu_tx_hold_write(tx, zp->z_phys->zp_acl.z_acl_extern_obj,
+ 0, ZFS_ACL_SIZE(aclcnt));
+ } else if (aclcnt > ACE_SLOT_CNT) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(aclcnt));
+ }
+
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ mutex_exit(&zp->z_acl_lock);
+ mutex_exit(&zp->z_lock);
+
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ aclp = zfs_acl_alloc(aclcnt);
+ bcopy(acep, aclp->z_acl, sizeof (ace_t) * aclcnt);
+ aclp->z_acl_count = aclcnt;
+ error = zfs_aclset_common(zp, aclp, tx, &inherit);
+ ASSERT(error == 0);
+
+ zfs_acl_free(aclp);
+ zfs_log_acl(zilog, tx, TX_ACL, zp, aclcnt, acep);
+ dmu_tx_commit(tx);
+done:
+ mutex_exit(&zp->z_acl_lock);
+ mutex_exit(&zp->z_lock);
+
+ return (error);
+}
+#endif /* TODO */
+
+static int
+zfs_ace_access(ace_t *zacep, int *working_mode)
+{
+ if (*working_mode == 0) {
+ return (0);
+ }
+
+ if (zacep->a_access_mask & *working_mode) {
+ if (zacep->a_type == ALLOW) {
+ *working_mode &=
+ ~(*working_mode & zacep->a_access_mask);
+ if (*working_mode == 0)
+ return (0);
+ } else if (zacep->a_type == DENY) {
+ return (EACCES);
+ }
+ }
+
+ /*
+ * We haven't been specifically denied at this point,
+ * so return UNDETERMINED.
+ */
+
+ return (ACCESS_UNDETERMINED);
+}
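+
+/*
+ * A short walk-through (hypothetical masks): if *working_mode starts as
+ * ACE_READ_DATA|ACE_WRITE_DATA, an ALLOW ACE whose mask covers ACE_READ_DATA
+ * clears that bit and we return ACCESS_UNDETERMINED (write is still
+ * pending). A later DENY ACE covering ACE_WRITE_DATA returns EACCES, while
+ * a later ALLOW ACE covering it clears the last bit and returns 0.
+ */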
+
+
+static int
+zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ ace_t *zacep;
+ gid_t gid;
+ int cnt;
+ int i;
+ int error;
+ int access_deny = ACCESS_UNDETERMINED;
+ uint_t entry_type;
+ uid_t uid = crgetuid(cr);
+
+ if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
+ *working_mode = 0;
+ return (0);
+ }
+
+ *working_mode = v4_mode;
+
+ if ((v4_mode & WRITE_MASK) &&
+ (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+ (!IS_DEVVP(ZTOV(zp)))) {
+ return (EROFS);
+ }
+
+ mutex_enter(&zp->z_acl_lock);
+
+ error = zfs_acl_node_read(zp, &aclp);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+
+ zacep = aclp->z_acl;
+ cnt = aclp->z_acl_count;
+
+ for (i = 0; i != cnt; i++) {
+
+ DTRACE_PROBE2(zfs__access__common,
+ ace_t *, &zacep[i], int, *working_mode);
+
+ if (zacep[i].a_flags & ACE_INHERIT_ONLY_ACE)
+ continue;
+
+ entry_type = (zacep[i].a_flags & ACE_TYPE_FLAGS);
+ switch (entry_type) {
+ case ACE_OWNER:
+ if (uid == zp->z_phys->zp_uid) {
+ access_deny = zfs_ace_access(&zacep[i],
+ working_mode);
+ }
+ break;
+ case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+ case ACE_IDENTIFIER_GROUP:
+ /*
+ * The owning group's gid is in the znode, not the ACL.
+ */
+ if (entry_type == (ACE_IDENTIFIER_GROUP | ACE_GROUP))
+ gid = zp->z_phys->zp_gid;
+ else
+ gid = zacep[i].a_who;
+
+ if (groupmember(gid, cr)) {
+ access_deny = zfs_ace_access(&zacep[i],
+ working_mode);
+ }
+ break;
+ case ACE_EVERYONE:
+ access_deny = zfs_ace_access(&zacep[i], working_mode);
+ break;
+
+ /* USER Entry */
+ default:
+ if (entry_type == 0) {
+ if (uid == zacep[i].a_who) {
+ access_deny = zfs_ace_access(&zacep[i],
+ working_mode);
+ }
+ break;
+ }
+ zfs_acl_free(aclp);
+ mutex_exit(&zp->z_acl_lock);
+ return (EIO);
+ }
+
+ if (access_deny != ACCESS_UNDETERMINED)
+ break;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+ zfs_acl_free(aclp);
+
+ return (access_deny);
+}
+
+
+/*
+ * Determine whether access should be granted/denied, invoking the
+ * least priv subsystem when a deny is determined.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, cred_t *cr)
+{
+ int working_mode;
+ int error;
+ int is_attr;
+ znode_t *xzp;
+ znode_t *check_zp = zp;
+
+ is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
+ (ZTOV(zp)->v_type == VDIR));
+
+ /*
+ * If this is an attribute, then validate against the base file.
+ */
+ if (is_attr) {
+ if ((error = zfs_zget(zp->z_zfsvfs,
+ zp->z_phys->zp_parent, &xzp)) != 0) {
+ return (error);
+ }
+ check_zp = xzp;
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+
+ error = zfs_zaccess_common(check_zp, mode, &working_mode, cr);
+
+ if (error == EROFS) {
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+ return (error);
+ }
+
+ if (error || working_mode) {
+ working_mode = (zfs_v4_to_unix(working_mode) << 6);
+ error = secpolicy_vnode_access(cr, ZTOV(check_zp),
+ check_zp->z_phys->zp_uid, working_mode);
+ }
+
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+
+ return (error);
+}
+
+/*
+ * Special zaccess function to check for a special nfsv4 perm.
+ * It doesn't call secpolicy_vnode_access() on failure, since that
+ * would probably be the wrong policy function to call.
+ * Instead it's up to the caller to handle that situation.
+ */
+
+int
+zfs_zaccess_v4_perm(znode_t *zp, int mode, cred_t *cr)
+{
+ int working_mode = 0;
+ return (zfs_zaccess_common(zp, mode, &working_mode, cr));
+}
+
+/*
+ * Translate the traditional unix VREAD/VWRITE/VEXEC mode into the
+ * native ACL format and call zfs_zaccess().
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, cred_t *cr)
+{
+ int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+ return (zfs_zaccess(zp, v4_mode, cr));
+}
+
+static int
+zfs_delete_final_check(znode_t *zp, znode_t *dzp, cred_t *cr)
+{
+ int error;
+
+ error = secpolicy_vnode_access(cr, ZTOV(zp),
+ dzp->z_phys->zp_uid, S_IWRITE|S_IEXEC);
+
+ if (error == 0)
+ error = zfs_sticky_remove_access(dzp, zp, cr);
+
+ return (error);
+}
+
+/*
+ * Determine whether access should be granted/denied, without
+ * consulting the least priv subsystem.
+ *
+ *
+ * The following chart is the recommended NFSv4 enforcement for
+ * ability to delete an object.
+ *
+ * -------------------------------------------------------
+ * | Parent Dir | Target Object Permissions |
+ * | permissions | |
+ * -------------------------------------------------------
+ * | | ACL Allows | ACL Denies| Delete |
+ * | | Delete | Delete | unspecified|
+ * -------------------------------------------------------
+ * | ACL Allows | Permit | Permit | Permit |
+ * | DELETE_CHILD | |
+ * -------------------------------------------------------
+ * | ACL Denies | Permit | Deny | Deny |
+ * | DELETE_CHILD | | | |
+ * -------------------------------------------------------
+ * | ACL specifies | | | |
+ * | only allow | Permit | Permit | Permit |
+ * | write and | | | |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * | ACL denies | | | |
+ * | write and | Permit | Deny | Deny |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * ^
+ * |
+ * No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+ int dzp_working_mode = 0;
+ int zp_working_mode = 0;
+ int dzp_error, zp_error;
+
+ /*
+ * Arghh, this check is going to require a couple of questions
+ * to be asked. We want specific DELETE permissions to
+ * take precedence over WRITE/EXECUTE. We don't
+ * want an ACL such as this to mess us up.
+ * user:joe:write_data:deny,user:joe:delete:allow
+ *
+ * However, deny permissions may ultimately be overridden
+ * by secpolicy_vnode_access().
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
+ &dzp_working_mode, cr);
+ zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, cr);
+
+ if (dzp_error == EROFS || zp_error == EROFS)
+ return (dzp_error);
+
+ /*
+ * First check the first row.
+ * We only need to see if parent Allows delete_child
+ */
+ if ((dzp_working_mode & ACE_DELETE_CHILD) == 0)
+ return (0);
+
+ /*
+ * Second row
+ * we already have the necessary information in
+ * zp_working_mode, zp_error and dzp_error.
+ */
+
+ if ((zp_working_mode & ACE_DELETE) == 0)
+ return (0);
+
+ /*
+ * Now zp_error should either be EACCES, which indicates
+ * a "deny" delete entry, or ACCESS_UNDETERMINED if no applicable
+ * "delete" entry exists on the target.
+ *
+ * dzp_error should be either EACCES, which indicates a "deny"
+ * entry for delete_child, or ACCESS_UNDETERMINED if no delete_child
+ * entry exists. If the value is EACCES then we are done
+ * and zfs_delete_final_check() will make the final decision
+ * regarding whether to allow the delete.
+ */
+
+ ASSERT(zp_error != 0 && dzp_error != 0);
+ if (dzp_error == EACCES)
+ return (zfs_delete_final_check(zp, dzp, cr));
+
+ /*
+ * Third Row
+ * Only need to check for write/execute on parent
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_WRITE_DATA|ACE_EXECUTE,
+ &dzp_working_mode, cr);
+
+ if (dzp_error == EROFS)
+ return (dzp_error);
+
+ if ((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) == 0)
+ return (zfs_sticky_remove_access(dzp, zp, cr));
+
+ /*
+ * Fourth Row
+ */
+
+ if (((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) != 0) &&
+ ((zp_working_mode & ACE_DELETE) == 0))
+ return (zfs_sticky_remove_access(dzp, zp, cr));
+
+ return (zfs_delete_final_check(zp, dzp, cr));
+}
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+ znode_t *tzp, cred_t *cr)
+{
+ int add_perm;
+ int error;
+
+ add_perm = (ZTOV(szp)->v_type == VDIR) ?
+ ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+ /*
+ * Rename permissions are a combination of delete permission +
+ * add file/subdir permission.
+ */
+
+ /*
+ * First make sure we can do the delete portion.
+ *
+ * If that succeeds, then check for add_file/add_subdir permissions.
+ */
+
+ if (error = zfs_zaccess_delete(sdzp, szp, cr))
+ return (error);
+
+ /*
+ * If we have a tzp, see if we can delete it.
+ */
+ if (tzp) {
+ if (error = zfs_zaccess_delete(tdzp, tzp, cr))
+ return (error);
+ }
+
+ /*
+ * Now check for add permissions
+ */
+ error = zfs_zaccess(tdzp, add_perm, cr);
+
+ return (error);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
new file mode 100644
index 000000000000..c8450d488bdb
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_acl.h>
+
+void
+zfs_ace_byteswap(ace_t *ace, int ace_cnt)
+{
+ int i;
+
+ for (i = 0; i != ace_cnt; i++, ace++) {
+ ace->a_who = BSWAP_32(ace->a_who);
+ ace->a_access_mask = BSWAP_32(ace->a_access_mask);
+ ace->a_flags = BSWAP_16(ace->a_flags);
+ ace->a_type = BSWAP_16(ace->a_type);
+ }
+}
+
+/* ARGSUSED */
+void
+zfs_acl_byteswap(void *buf, size_t size)
+{
+ int cnt;
+
+ /*
+ * Arggh, since we don't know how many ACEs are in
+ * the array, we have to swap the entire block
+ */
+
+ cnt = size / sizeof (ace_t);
+
+ zfs_ace_byteswap((ace_t *)buf, cnt);
+}
+
+void
+zfs_znode_byteswap(void *buf, size_t size)
+{
+ znode_phys_t *zp = buf;
+
+ ASSERT(size >= sizeof (znode_phys_t));
+
+ zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]);
+ zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]);
+ zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]);
+ zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]);
+ zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]);
+ zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]);
+ zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]);
+ zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]);
+ zp->zp_gen = BSWAP_64(zp->zp_gen);
+ zp->zp_mode = BSWAP_64(zp->zp_mode);
+ zp->zp_size = BSWAP_64(zp->zp_size);
+ zp->zp_parent = BSWAP_64(zp->zp_parent);
+ zp->zp_links = BSWAP_64(zp->zp_links);
+ zp->zp_xattr = BSWAP_64(zp->zp_xattr);
+ zp->zp_rdev = BSWAP_64(zp->zp_rdev);
+ zp->zp_flags = BSWAP_64(zp->zp_flags);
+ zp->zp_uid = BSWAP_64(zp->zp_uid);
+ zp->zp_gid = BSWAP_64(zp->zp_gid);
+ zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]);
+ zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]);
+ zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]);
+ zp->zp_pad[3] = BSWAP_64(zp->zp_pad[3]);
+
+ zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj);
+ zp->zp_acl.z_acl_count = BSWAP_32(zp->zp_acl.z_acl_count);
+ zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version);
+ zp->zp_acl.z_acl_pad = BSWAP_16(zp->zp_acl.z_acl_pad);
+ zfs_ace_byteswap(&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
new file mode 100644
index 000000000000..c759962a6d11
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
@@ -0,0 +1,1120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future. The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ * ctldir ------> snapshotdir -------> snapshot
+ * |
+ * |
+ * V
+ * mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by the kernel, but unmounts are
+ * (currently) handled from user land. The main reason is that there is no
+ * reliable way to auto-unmount the filesystem when it's "no longer in use".
+ * When the user unmounts a filesystem, we call zfsctl_unmount(), which
+ * unmounts any snapshots within the snapshot directory.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/namei.h>
+#include <sys/gfs.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/mount.h>
+
+typedef struct {
+ char *se_name;
+ vnode_t *se_root;
+ avl_node_t se_node;
+} zfs_snapentry_t;
+
+static int
+snapentry_compare(const void *a, const void *b)
+{
+ const zfs_snapentry_t *sa = a;
+ const zfs_snapentry_t *sb = b;
+ int ret = strcmp(sa->se_name, sb->se_name);
+
+ if (ret < 0)
+ return (-1);
+ else if (ret > 0)
+ return (1);
+ else
+ return (0);
+}
+
+static struct vop_vector zfsctl_ops_root;
+static struct vop_vector zfsctl_ops_snapdir;
+static struct vop_vector zfsctl_ops_snapshot;
+
+static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
+static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
+
+typedef struct zfsctl_node {
+ gfs_dir_t zc_gfs_private;
+ uint64_t zc_id;
+ timestruc_t zc_cmtime; /* ctime and mtime, always the same */
+} zfsctl_node_t;
+
+typedef struct zfsctl_snapdir {
+ zfsctl_node_t sd_node;
+ kmutex_t sd_lock;
+ avl_tree_t sd_snaps;
+} zfsctl_snapdir_t;
+
+/*
+ * Root directory elements. We have only a single static entry, 'snapshot'.
+ */
+static gfs_dirent_t zfsctl_root_entries[] = {
+ { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
+ { NULL }
+};
+
+/* include . and .. in the calculation */
+#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \
+ sizeof (gfs_dirent_t)) + 1)
+
+
+/*
+ * Initialize the various GFS pieces we'll need to create and manipulate .zfs
+ * directories. This is called from the ZFS init routine, and initializes the
+ * vnode ops vectors that we'll be using.
+ */
+void
+zfsctl_init(void)
+{
+}
+
+void
+zfsctl_fini(void)
+{
+}
+
+/*
+ * Return the inode number associated with the 'snapshot' directory.
+ */
+/* ARGSUSED */
+static ino64_t
+zfsctl_root_inode_cb(vnode_t *vp, int index)
+{
+ ASSERT(index == 0);
+ return (ZFSCTL_INO_SNAPDIR);
+}
+
+/*
+ * Create the '.zfs' directory. This directory is cached as part of the VFS
+ * structure. This results in a hold on the vfs_t. The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1. This reference
+ * is removed when the ctldir is destroyed in the unmount.
+ */
+void
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+ vnode_t *vp, *rvp;
+ zfsctl_node_t *zcp;
+
+ ASSERT(zfsvfs->z_ctldir == NULL);
+
+ vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
+ &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
+ zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
+ zcp = vp->v_data;
+ zcp->zc_id = ZFSCTL_INO_ROOT;
+
+ VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp, curthread) == 0);
+ ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
+ VN_URELE(rvp);
+
+ /*
+ * We're only faking the fact that we have a root of a filesystem for
+ * the sake of the GFS interfaces. Undo the flag manipulation it did
+ * for us.
+ */
+ vp->v_vflag &= ~VV_ROOT;
+
+ zfsvfs->z_ctldir = vp;
+}
+
+/*
+ * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
+ * There might still be more references if we were force unmounted, but only
+ * new zfs_inactive() calls can occur and they don't reference .zfs
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+ VN_RELE(zfsvfs->z_ctldir);
+ zfsvfs->z_ctldir = NULL;
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it.
+ */
+vnode_t *
+zfsctl_root(znode_t *zp)
+{
+ ASSERT(zfs_has_ctldir(zp));
+ VN_HOLD(zp->z_zfsvfs->z_ctldir);
+ return (zp->z_zfsvfs->z_ctldir);
+}
+
+/*
+ * Common open routine. Disallow any write access.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_open(struct vop_open_args *ap)
+{
+ int flags = ap->a_mode;
+
+ if (flags & FWRITE)
+ return (EACCES);
+
+ return (0);
+}
+
+/*
+ * Common close routine. Nothing to do here.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_close(struct vop_close_args *ap)
+{
+ return (0);
+}
+
+/*
+ * Common access routine. Disallow writes.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_access(ap)
+ struct vop_access_args /* {
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ int mode = ap->a_mode;
+
+ if (mode & VWRITE)
+ return (EACCES);
+
+ return (0);
+}
+
+/*
+ * Common getattr function. Fill in basic information.
+ */
+static void
+zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+{
+ zfsctl_node_t *zcp = vp->v_data;
+ timestruc_t now;
+
+ vap->va_uid = 0;
+ vap->va_gid = 0;
+ vap->va_rdev = 0;
+ /*
+ * We are a purely virtual object, so we have no
+ * blocksize or allocated blocks.
+ */
+ vap->va_blksize = 0;
+ vap->va_nblocks = 0;
+ vap->va_seq = 0;
+ vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+ vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+ S_IROTH | S_IXOTH;
+ vap->va_type = VDIR;
+ /*
+ * We live in the now (for atime).
+ */
+ gethrestime(&now);
+ vap->va_atime = now;
+ vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime;
+ /* FreeBSD: Reset chflags(2) flags. */
+ vap->va_flags = 0;
+}
+
+static int
+zfsctl_common_fid(ap)
+ struct vop_fid_args /* {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ fid_t *fidp = (void *)ap->a_fid;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfsctl_node_t *zcp = vp->v_data;
+ uint64_t object = zcp->zc_id;
+ zfid_short_t *zfid;
+ int i;
+
+ ZFS_ENTER(zfsvfs);
+
+ fidp->fid_len = SHORT_FID_LEN;
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = SHORT_FID_LEN;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* .zfs znodes always have a generation number of 0 */
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = 0;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static int
+zfsctl_common_reclaim(ap)
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+
+ /*
+ * Destroy the vm object and flush associated pages.
+ */
+ vnode_destroy_vobject(vp);
+ VI_LOCK(vp);
+ vp->v_data = NULL;
+ VI_UNLOCK(vp);
+ return (0);
+}
+
+/*
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem. We use the following scheme:
+ *
+ * ENTRY ZFSCTL_INODE
+ * .zfs 1
+ * .zfs/snapshot 2
+ * .zfs/snapshot/<snap> objectid(snap)
+ */
+
+#define ZFSCTL_INO_SNAP(id) (id)
+
+/*
+ * Get root directory attributes.
+ */
+/* ARGSUSED */
+static int
+zfsctl_root_getattr(ap)
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ struct vattr *vap = ap->a_vap;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+
+ ZFS_ENTER(zfsvfs);
+ vap->va_nodeid = ZFSCTL_INO_ROOT;
+ vap->va_nlink = vap->va_size = NROOT_ENTRIES;
+
+ zfsctl_common_getattr(vp, vap);
+ ZFS_EXIT(zfsvfs);
+
+ return (0);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+/* ARGSUSED */
+int
+zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ int err;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (strcmp(nm, "..") == 0) {
+ err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp, curthread);
+ if (err == 0)
+ VOP_UNLOCK(*vpp, 0, curthread);
+ } else {
+ err = gfs_dir_lookup(dvp, nm, vpp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+
+ return (err);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+/* ARGSUSED */
+int
+zfsctl_root_lookup_vop(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ int flags = ap->a_cnp->cn_flags;
+ int nameiop = ap->a_cnp->cn_nameiop;
+ char nm[NAME_MAX + 1];
+ int err;
+
+ if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE))
+ return (EOPNOTSUPP);
+
+ ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
+ strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+
+ err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr);
+ if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
+
+ return (err);
+}
+
+static struct vop_vector zfsctl_ops_root = {
+ .vop_default = &default_vnodeops,
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_ioctl = VOP_EINVAL,
+ .vop_getattr = zfsctl_root_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = gfs_vop_readdir,
+ .vop_lookup = zfsctl_root_lookup_vop,
+ .vop_inactive = gfs_vop_inactive,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+};
+
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+ objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+
+ dmu_objset_name(os, zname);
+ if (strlen(zname) + 1 + strlen(name) >= len)
+ return (ENAMETOOLONG);
+ (void) strcat(zname, "@");
+ (void) strcat(zname, name);
+ return (0);
+}
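+
+/*
+ * For example (hypothetical names): for a filesystem whose objset name is
+ * "tank/home" and a snapshot entry named "monday", the resulting zname is
+ * "tank/home@monday", provided it fits in the supplied buffer.
+ */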
+
+static int
+zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
+{
+ zfsctl_snapdir_t *sdp = dvp->v_data;
+ zfs_snapentry_t search, *sep;
+ struct vop_inactive_args ap;
+ avl_index_t where;
+ int err;
+
+ ASSERT(MUTEX_HELD(&sdp->sd_lock));
+
+ search.se_name = (char *)name;
+ if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
+ return (ENOENT);
+
+ ASSERT(vn_ismntpt(sep->se_root));
+
+ /* this will be dropped by dounmount() */
+ if ((err = vn_vfswlock(sep->se_root)) != 0)
+ return (err);
+
+ err = dounmount(vn_mountedvfs(sep->se_root), force, curthread);
+ if (err)
+ return (err);
+ ASSERT(sep->se_root->v_count == 1);
+ ap.a_vp = sep->se_root;
+ gfs_vop_inactive(&ap);
+
+ avl_remove(&sdp->sd_snaps, sep);
+ kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+ kmem_free(sep, sizeof (zfs_snapentry_t));
+
+ return (0);
+}
+
+#if 0
+static void
+zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
+{
+ avl_index_t where;
+ vfs_t *vfsp;
+ refstr_t *pathref;
+ char newpath[MAXNAMELEN];
+ char *tail;
+
+ ASSERT(MUTEX_HELD(&sdp->sd_lock));
+ ASSERT(sep != NULL);
+
+ vfsp = vn_mountedvfs(sep->se_root);
+ ASSERT(vfsp != NULL);
+
+ vfs_lock_wait(vfsp);
+
+ /*
+ * Change the name in the AVL tree.
+ */
+ avl_remove(&sdp->sd_snaps, sep);
+ kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+ sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
+ (void) strcpy(sep->se_name, nm);
+ VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
+ avl_insert(&sdp->sd_snaps, sep, where);
+
+ /*
+ * Change the current mountpoint info:
+ * - update the tail of the mntpoint path
+ * - update the tail of the resource path
+ */
+ pathref = vfs_getmntpoint(vfsp);
+ (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
+ VERIFY((tail = strrchr(newpath, '/')) != NULL);
+ *(tail+1) = '\0';
+ ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
+ (void) strcat(newpath, nm);
+ refstr_rele(pathref);
+ vfs_setmntpoint(vfsp, newpath);
+
+ pathref = vfs_getresource(vfsp);
+ (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
+ VERIFY((tail = strrchr(newpath, '@')) != NULL);
+ *(tail+1) = '\0';
+ ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
+ (void) strcat(newpath, nm);
+ refstr_rele(pathref);
+ vfs_setresource(vfsp, newpath);
+
+ vfs_unlock(vfsp);
+}
+#endif
+
+#if 0
+static int
+zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
+ cred_t *cr)
+{
+ zfsctl_snapdir_t *sdp = sdvp->v_data;
+ zfs_snapentry_t search, *sep;
+ avl_index_t where;
+ char from[MAXNAMELEN], to[MAXNAMELEN];
+ int err;
+
+ err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
+ if (err)
+ return (err);
+ err = zfs_secpolicy_write(from, cr);
+ if (err)
+ return (err);
+
+ /*
+ * Cannot move snapshots out of the snapdir.
+ */
+ if (sdvp != tdvp)
+ return (EINVAL);
+
+ if (strcmp(snm, tnm) == 0)
+ return (0);
+
+ err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
+ if (err)
+ return (err);
+
+ mutex_enter(&sdp->sd_lock);
+
+ search.se_name = (char *)snm;
+ if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
+ mutex_exit(&sdp->sd_lock);
+ return (ENOENT);
+ }
+
+ err = dmu_objset_rename(from, to);
+ if (err == 0)
+ zfsctl_rename_snap(sdp, sep, tnm);
+
+ mutex_exit(&sdp->sd_lock);
+
+ return (err);
+}
+#endif
+
+#if 0
+/* ARGSUSED */
+static int
+zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
+{
+ zfsctl_snapdir_t *sdp = dvp->v_data;
+ char snapname[MAXNAMELEN];
+ int err;
+
+ err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
+ if (err)
+ return (err);
+ err = zfs_secpolicy_write(snapname, cr);
+ if (err)
+ return (err);
+
+ mutex_enter(&sdp->sd_lock);
+
+ err = zfsctl_unmount_snap(dvp, name, 0, cr);
+ if (err) {
+ mutex_exit(&sdp->sd_lock);
+ return (err);
+ }
+
+ err = dmu_objset_destroy(snapname);
+
+ mutex_exit(&sdp->sd_lock);
+
+ return (err);
+}
+#endif
+
+/*
+ * Lookup entry point for the 'snapshot' directory. Try to open the
+ * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
+ * Perform a mount of the associated dataset on top of the vnode.
+ */
+/* ARGSUSED */
+int
+zfsctl_snapdir_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ char nm[NAME_MAX + 1];
+ zfsctl_snapdir_t *sdp = dvp->v_data;
+ objset_t *snap;
+ char snapname[MAXNAMELEN];
+ char *mountpoint;
+ zfs_snapentry_t *sep, search;
+ size_t mountpoint_len;
+ avl_index_t where;
+ zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ int err;
+
+ ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
+ strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+
+ ASSERT(dvp->v_type == VDIR);
+
+ if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
+ return (0);
+
+ *vpp = NULL;
+
+ /*
+ * If we get a recursive call, that means we got called
+ * from the domount() code while it was trying to look up the
+ * spec (which looks like a local path for zfs). We need to
+ * add some flag to domount() to tell it not to do this lookup.
+ */
+ if (MUTEX_HELD(&sdp->sd_lock))
+ return (ENOENT);
+
+ ZFS_ENTER(zfsvfs);
+
+ mutex_enter(&sdp->sd_lock);
+ search.se_name = (char *)nm;
+ if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
+ *vpp = sep->se_root;
+ VN_HOLD(*vpp);
+ if ((*vpp)->v_mountedhere == NULL) {
+ /*
+ * The snapshot was unmounted behind our backs,
+ * try to remount it.
+ */
+ goto domount;
+ }
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
+ mutex_exit(&sdp->sd_lock);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /*
+ * The requested snapshot is not currently mounted, look it up.
+ */
+ err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
+ if (err) {
+ mutex_exit(&sdp->sd_lock);
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ if (dmu_objset_open(snapname, DMU_OST_ZFS,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
+ mutex_exit(&sdp->sd_lock);
+ ZFS_EXIT(zfsvfs);
+ return (ENOENT);
+ }
+
+ sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+ sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
+ (void) strcpy(sep->se_name, nm);
+ *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
+ VN_HOLD(*vpp);
+ avl_insert(&sdp->sd_snaps, sep, where);
+
+ dmu_objset_close(snap);
+domount:
+ mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
+ strlen("/.zfs/snapshot/") + strlen(nm) + 1;
+ mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
+ (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
+ dvp->v_vfsp->mnt_stat.f_mntonname, nm);
+ err = domount(curthread, *vpp, "zfs", mountpoint, snapname, 0);
+ kmem_free(mountpoint, mountpoint_len);
+ /* FreeBSD: This line was moved from below to avoid a lock recursion. */
+ if (err == 0)
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ mutex_exit(&sdp->sd_lock);
+
+ /*
+ * If we had an error, drop our hold on the vnode and
+ * zfsctl_snapshot_inactive() will clean up.
+ */
+ if (err) {
+ VN_RELE(*vpp);
+ *vpp = NULL;
+ }
+ return (err);
+}
+
+/* ARGSUSED */
+static int
+zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
+ offset_t *offp, offset_t *nextp, void *data)
+{
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ char snapname[MAXNAMELEN];
+ uint64_t id, cookie;
+
+ ZFS_ENTER(zfsvfs);
+
+ cookie = *offp;
+ if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
+ &cookie) == ENOENT) {
+ *eofp = 1;
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ (void) strcpy(dp->d_name, snapname);
+ dp->d_ino = ZFSCTL_INO_SNAP(id);
+ *nextp = cookie;
+
+ ZFS_EXIT(zfsvfs);
+
+ return (0);
+}
+
+vnode_t *
+zfsctl_mknode_snapdir(vnode_t *pvp)
+{
+ vnode_t *vp;
+ zfsctl_snapdir_t *sdp;
+
+ vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp,
+ &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
+ zfsctl_snapdir_readdir_cb, NULL);
+ sdp = vp->v_data;
+ sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
+ sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
+ mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&sdp->sd_snaps, snapentry_compare,
+ sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
+ return (vp);
+}
+
+/* ARGSUSED */
+static int
+zfsctl_snapdir_getattr(ap)
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ struct vattr *vap = ap->a_vap;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfsctl_snapdir_t *sdp = vp->v_data;
+
+ ZFS_ENTER(zfsvfs);
+ zfsctl_common_getattr(vp, vap);
+ vap->va_nodeid = gfs_file_inode(vp);
+ vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
+ ZFS_EXIT(zfsvfs);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfsctl_snapdir_inactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ zfsctl_snapdir_t *sdp = vp->v_data;
+ void *private;
+
+ private = gfs_dir_inactive(vp);
+ if (private != NULL) {
+ ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
+ mutex_destroy(&sdp->sd_lock);
+ avl_destroy(&sdp->sd_snaps);
+ kmem_free(private, sizeof (zfsctl_snapdir_t));
+ }
+ return (0);
+}
+
+static struct vop_vector zfsctl_ops_snapdir = {
+ .vop_default = &default_vnodeops,
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_ioctl = VOP_EINVAL,
+ .vop_getattr = zfsctl_snapdir_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = gfs_vop_readdir,
+ .vop_lookup = zfsctl_snapdir_lookup,
+ .vop_inactive = zfsctl_snapdir_inactive,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+};
+
+static vnode_t *
+zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
+{
+ vnode_t *vp;
+ zfsctl_node_t *zcp;
+
+ vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
+ &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
+ zcp = vp->v_data;
+ zcp->zc_id = objset;
+
+ return (vp);
+}
+
+static int
+zfsctl_snapshot_inactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ struct vop_inactive_args iap;
+ zfsctl_snapdir_t *sdp;
+ zfs_snapentry_t *sep, *next;
+ int locked;
+ vnode_t *dvp;
+
+ VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
+ sdp = dvp->v_data;
+ VOP_UNLOCK(dvp, 0, ap->a_td);
+
+ if (!(locked = MUTEX_HELD(&sdp->sd_lock)))
+ mutex_enter(&sdp->sd_lock);
+
+ if (vp->v_count > 1) {
+ if (!locked)
+ mutex_exit(&sdp->sd_lock);
+ return (0);
+ }
+ ASSERT(!vn_ismntpt(vp));
+
+ sep = avl_first(&sdp->sd_snaps);
+ while (sep != NULL) {
+ next = AVL_NEXT(&sdp->sd_snaps, sep);
+
+ if (sep->se_root == vp) {
+ avl_remove(&sdp->sd_snaps, sep);
+ kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+ kmem_free(sep, sizeof (zfs_snapentry_t));
+ break;
+ }
+ sep = next;
+ }
+ ASSERT(sep != NULL);
+
+ if (!locked)
+ mutex_exit(&sdp->sd_lock);
+ VN_RELE(dvp);
+
+ /*
+ * Dispose of the vnode for the snapshot mount point.
+ * This is safe to do because once this entry has been removed
+ * from the AVL tree, it can't be found again, so cannot become
+ * "active". If we lookup the same name again we will end up
+ * creating a new vnode.
+ */
+ iap.a_vp = vp;
+ return (gfs_vop_inactive(&iap));
+}
+
+static int
+zfsctl_traverse_begin(vnode_t **vpp, kthread_t *td)
+{
+ int err;
+
+ VN_HOLD(*vpp);
+ /* Snapshot should be already mounted, but just in case. */
+ if (vn_mountedvfs(*vpp) == NULL)
+ return (ENOENT);
+ err = traverse(vpp);
+ if (err == 0)
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+ return (err);
+}
+
+static void
+zfsctl_traverse_end(vnode_t *vp, int err)
+{
+
+ if (err == 0)
+ vput(vp);
+ else
+ VN_RELE(vp);
+}
+
+static int
+zfsctl_snapshot_getattr(ap)
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ int err;
+
+ err = zfsctl_traverse_begin(&vp, ap->a_td);
+ if (err == 0)
+ err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td);
+ zfsctl_traverse_end(vp, err);
+ return (err);
+}
+
+static int
+zfsctl_snapshot_fid(ap)
+ struct vop_fid_args /* {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ int err;
+
+ err = zfsctl_traverse_begin(&vp, curthread);
+ if (err == 0)
+ err = VOP_VPTOFH(vp, (void *)ap->a_fid);
+ zfsctl_traverse_end(vp, err);
+ return (err);
+}
+
+/*
+ * These VP's should never see the light of day. They should always
+ * be covered.
+ */
+static struct vop_vector zfsctl_ops_snapshot = {
+ .vop_default = &default_vnodeops,
+ .vop_inactive = zfsctl_snapshot_inactive,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_getattr = zfsctl_snapshot_getattr,
+ .vop_fid = zfsctl_snapshot_fid,
+};
+
+int
+zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ vnode_t *dvp, *vp;
+ zfsctl_snapdir_t *sdp;
+ zfsctl_node_t *zcp;
+ zfs_snapentry_t *sep;
+ int error;
+
+ ASSERT(zfsvfs->z_ctldir != NULL);
+ error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
+ NULL, 0, NULL, kcred);
+ if (error != 0)
+ return (error);
+ sdp = dvp->v_data;
+
+ mutex_enter(&sdp->sd_lock);
+ sep = avl_first(&sdp->sd_snaps);
+ while (sep != NULL) {
+ vp = sep->se_root;
+ zcp = vp->v_data;
+ if (zcp->zc_id == objsetid)
+ break;
+
+ sep = AVL_NEXT(&sdp->sd_snaps, sep);
+ }
+
+ if (sep != NULL) {
+ VN_HOLD(vp);
+ error = traverse(&vp);
+ if (error == 0) {
+ if (vp == sep->se_root)
+ error = EINVAL;
+ else
+ *zfsvfsp = VTOZ(vp)->z_zfsvfs;
+ }
+ mutex_exit(&sdp->sd_lock);
+ VN_RELE(vp);
+ } else {
+ error = EINVAL;
+ mutex_exit(&sdp->sd_lock);
+ }
+
+ VN_RELE(dvp);
+
+ return (error);
+}
+
+/*
+ * Unmount any snapshots for the given filesystem. This is called from
+ * zfs_umount() - if we have a ctldir, then go through and unmount all the
+ * snapshots.
+ */
+int
+zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+{
+ struct vop_inactive_args ap;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ vnode_t *dvp, *svp;
+ zfsctl_snapdir_t *sdp;
+ zfs_snapentry_t *sep, *next;
+ int error;
+
+ ASSERT(zfsvfs->z_ctldir != NULL);
+ error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
+ NULL, 0, NULL, cr);
+ if (error != 0)
+ return (error);
+ sdp = dvp->v_data;
+
+ mutex_enter(&sdp->sd_lock);
+
+ sep = avl_first(&sdp->sd_snaps);
+ while (sep != NULL) {
+ svp = sep->se_root;
+ next = AVL_NEXT(&sdp->sd_snaps, sep);
+
+ /*
+ * If this snapshot is not mounted, then it must
+ * have just been unmounted by somebody else, and
+ * will be cleaned up by zfsctl_snapdir_inactive().
+ */
+ if (vn_ismntpt(svp)) {
+ if ((error = vn_vfswlock(svp)) != 0)
+ goto out;
+
+ /*
+ * Increase usecount, so dounmount() won't vrele() it
+ * to 0 and call zfsctl_snapdir_inactive().
+ */
+ VN_HOLD(svp);
+ vfsp = vn_mountedvfs(svp);
+ mtx_lock(&Giant);
+ error = dounmount(vfsp, fflags, curthread);
+ mtx_unlock(&Giant);
+ if (error != 0) {
+ VN_RELE(svp);
+ goto out;
+ }
+
+ avl_remove(&sdp->sd_snaps, sep);
+ kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+ kmem_free(sep, sizeof (zfs_snapentry_t));
+
+ /*
+ * We can't use VN_RELE(), as that will try to
+ * invoke zfsctl_snapdir_inactive(), and that
+ * would lead to an attempt to re-grab the sd_lock.
+ */
+ ASSERT3U(svp->v_count, ==, 1);
+ ap.a_vp = svp;
+ gfs_vop_inactive(&ap);
+ }
+ sep = next;
+ }
+out:
+ mutex_exit(&sdp->sd_lock);
+ VN_RELE(dvp);
+
+ return (error);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
new file mode 100644
index 000000000000..486aa74bbc44
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
@@ -0,0 +1,796 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/random.h>
+#include <sys/kcondvar.h>
+#include <sys/callb.h>
+#include <sys/smp.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/dnlc.h>
+
+/*
+ * Lock a directory entry. A dirlock on <dzp, name> protects that name
+ * in dzp's directory zap object. As long as you hold a dirlock, you can
+ * assume two things: (1) dzp cannot be reaped, and (2) no other thread
+ * can change the zap entry for (i.e. link or unlink) this name.
+ *
+ * Input arguments:
+ * dzp - znode for directory
+ * name - name of entry to lock
+ * flag - ZNEW: if the entry already exists, fail with EEXIST.
+ * ZEXISTS: if the entry does not exist, fail with ENOENT.
+ * ZSHARED: allow concurrent access with other ZSHARED callers.
+ * ZXATTR: we want dzp's xattr directory
+ *
+ * Output arguments:
+ * zpp - pointer to the znode for the entry (NULL if there isn't one)
+ * dlpp - pointer to the dirlock for this entry (NULL on error)
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ */
+int
+zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
+ int flag)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zfs_dirlock_t *dl;
+ uint64_t zoid;
+ int error;
+ vnode_t *vp;
+
+ *zpp = NULL;
+ *dlpp = NULL;
+
+ /*
+ * Verify that we are not trying to lock '.', '..', or '.zfs'
+ */
+ if (name[0] == '.' &&
+ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
+ zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
+ return (EEXIST);
+
+ /*
+ * Wait until there are no locks on this name.
+ */
+ rw_enter(&dzp->z_name_lock, RW_READER);
+ mutex_enter(&dzp->z_lock);
+ for (;;) {
+ if (dzp->z_unlinked) {
+ mutex_exit(&dzp->z_lock);
+ rw_exit(&dzp->z_name_lock);
+ return (ENOENT);
+ }
+ for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next)
+ if (strcmp(name, dl->dl_name) == 0)
+ break;
+ if (dl == NULL) {
+ /*
+ * Allocate a new dirlock and add it to the list.
+ */
+ dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
+ cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
+ dl->dl_name = name;
+ dl->dl_sharecnt = 0;
+ dl->dl_namesize = 0;
+ dl->dl_dzp = dzp;
+ dl->dl_next = dzp->z_dirlocks;
+ dzp->z_dirlocks = dl;
+ break;
+ }
+ if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
+ break;
+ cv_wait(&dl->dl_cv, &dzp->z_lock);
+ }
+
+ if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
+ /*
+ * We're the second shared reference to dl. Make a copy of
+ * dl_name in case the first thread goes away before we do.
+ * Note that we initialize the new name before storing its
+ * pointer into dl_name, because the first thread may load
+ * dl->dl_name at any time. He'll either see the old value,
+ * which is his, or the new shared copy; either is OK.
+ */
+ dl->dl_namesize = strlen(dl->dl_name) + 1;
+ name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
+ bcopy(dl->dl_name, name, dl->dl_namesize);
+ dl->dl_name = name;
+ }
+
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * We have a dirlock on the name. (Note that it is the dirlock,
+ * not the dzp's z_lock, that protects the name in the zap object.)
+ * See if there's an object by this name; if so, put a hold on it.
+ */
+ if (flag & ZXATTR) {
+ zoid = dzp->z_phys->zp_xattr;
+ error = (zoid == 0 ? ENOENT : 0);
+ } else {
+ vp = dnlc_lookup(ZTOV(dzp), name);
+ if (vp == DNLC_NO_VNODE) {
+ VN_RELE(vp);
+ error = ENOENT;
+ } else if (vp) {
+ if (flag & ZNEW) {
+ zfs_dirent_unlock(dl);
+ VN_RELE(vp);
+ return (EEXIST);
+ }
+ *dlpp = dl;
+ *zpp = VTOZ(vp);
+ return (0);
+ } else {
+ error = zap_lookup(zfsvfs->z_os, dzp->z_id, name,
+ 8, 1, &zoid);
+ zoid = ZFS_DIRENT_OBJ(zoid);
+ if (error == ENOENT)
+ dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
+ }
+ }
+ if (error) {
+ if (error != ENOENT || (flag & ZEXISTS)) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ } else {
+ if (flag & ZNEW) {
+ zfs_dirent_unlock(dl);
+ return (EEXIST);
+ }
+ error = zfs_zget(zfsvfs, zoid, zpp);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ if (!(flag & ZXATTR))
+ dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
+ }
+
+ *dlpp = dl;
+
+ return (0);
+}
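+
+/*
+ * Typical usage (an illustrative sketch; see zfs_dirlook() below):
+ *
+ * error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED);
+ * if (error == 0) {
+ * ... use zp ...
+ * zfs_dirent_unlock(dl);
+ * }
+ */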
+
+/*
+ * Unlock this directory entry and wake anyone who was waiting for it.
+ */
+void
+zfs_dirent_unlock(zfs_dirlock_t *dl)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfs_dirlock_t **prev_dl, *cur_dl;
+
+ mutex_enter(&dzp->z_lock);
+ rw_exit(&dzp->z_name_lock);
+ if (dl->dl_sharecnt > 1) {
+ dl->dl_sharecnt--;
+ mutex_exit(&dzp->z_lock);
+ return;
+ }
+ prev_dl = &dzp->z_dirlocks;
+ while ((cur_dl = *prev_dl) != dl)
+ prev_dl = &cur_dl->dl_next;
+ *prev_dl = dl->dl_next;
+ cv_broadcast(&dl->dl_cv);
+ mutex_exit(&dzp->z_lock);
+
+ if (dl->dl_namesize != 0)
+ kmem_free(dl->dl_name, dl->dl_namesize);
+ cv_destroy(&dl->dl_cv);
+ kmem_free(dl, sizeof (*dl));
+}
+
+/*
+ * Look up an entry in a directory.
+ *
+ * NOTE: '.' and '..' are handled as special cases because
+ * no directory entries are actually stored for them. If this is
+ * the root of a filesystem, then '.zfs' is also treated as a
+ * special pseudo-directory.
+ */
+int
+zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp)
+{
+ zfs_dirlock_t *dl;
+ znode_t *zp;
+ int error = 0;
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *vpp = ZTOV(dzp);
+ VN_HOLD(*vpp);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the vp for the snapshot directory.
+ */
+ if (dzp->z_phys->zp_parent == dzp->z_id &&
+ zfsvfs->z_parent != zfsvfs) {
+ error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+ "snapshot", vpp, NULL, 0, NULL, kcred);
+ return (error);
+ }
+ rw_enter(&dzp->z_parent_lock, RW_READER);
+ error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
+ if (error == 0)
+ *vpp = ZTOV(zp);
+ rw_exit(&dzp->z_parent_lock);
+ } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
+ *vpp = zfsctl_root(dzp);
+ } else {
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED);
+ if (error == 0) {
+ *vpp = ZTOV(zp);
+ zfs_dirent_unlock(dl);
+ dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ }
+ }
+
+ return (error);
+}
+
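+/*
+ * Convert an object number into a hexadecimal name used as the key in
+ * the unlinked set.  The string is built backwards into the caller's
+ * buffer; a pointer to the first digit is returned.
+ */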
+static char *
+zfs_unlinked_hexname(char namebuf[17], uint64_t x)
+{
+ char *name = &namebuf[16];
+ const char digits[16] = "0123456789abcdef";
+
+ *name = '\0';
+ do {
+ *--name = digits[x & 0xf];
+ x >>= 4;
+ } while (x != 0);
+
+ return (name);
+}
+
+/*
+ * unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the nlink list is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ char obj_name[17];
+ int error;
+
+ ASSERT(zp->z_unlinked);
+ ASSERT3U(zp->z_phys->zp_links, ==, 0);
+
+ error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+ zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
+ ASSERT3U(error, ==, 0);
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ dmu_object_info_t doi;
+ znode_t *zp;
+ int error;
+
+ /*
+ * Iterate over the contents of the unlinked set.
+ */
+ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+ zap_cursor_retrieve(&zc, &zap) == 0;
+ zap_cursor_advance(&zc)) {
+
+ /*
+ * See what kind of object we have in the list
+ */
+
+ error = dmu_object_info(zfsvfs->z_os,
+ zap.za_first_integer, &doi);
+ if (error != 0)
+ continue;
+
+ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+ /*
+ * We need to re-mark these list entries for deletion,
+ * so we pull them back into core and set zp->z_unlinked.
+ */
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+ /*
+ * We may pick up znodes that are already marked for deletion.
+ * This could happen during the purge of an extended attribute
+ * directory. All we need to do is skip over them, since they
+ * are already in the system marked z_unlinked.
+ */
+ if (error != 0)
+ continue;
+
+ zp->z_unlinked = B_TRUE;
+ VN_RELE(ZTOV(zp));
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ * so there is no need to lock its entries before deletion.
+ * Also, it assumes the directory contents are *only* regular
+ * files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zfs_dirlock_t dl;
+ int skipped = 0;
+ int error;
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ error = zfs_zget(zfsvfs,
+ ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+ ASSERT3U(error, ==, 0);
+
+ ASSERT((ZTOV(xzp)->v_type == VREG) ||
+ (ZTOV(xzp)->v_type == VLNK));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+ dmu_tx_hold_bonus(tx, xzp->z_id);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ VN_RELE(ZTOV(xzp));
+ skipped += 1;
+ continue;
+ }
+ bzero(&dl, sizeof (dl));
+ dl.dl_dzp = dzp;
+ dl.dl_name = zap.za_name;
+
+ error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+ ASSERT3U(error, ==, 0);
+ dmu_tx_commit(tx);
+
+ VN_RELE(ZTOV(xzp));
+ }
+ zap_cursor_fini(&zc);
+ ASSERT(error == ENOENT);
+ return (skipped);
+}
+
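+/*
+ * Free an unlinked znode: purge the contents of an attribute directory,
+ * unlink any xattr directory attached to the file, free the object (and
+ * any external ACL object) in a single transaction, and remove the entry
+ * from the unlinked set.  If there is not enough space, the znode is left
+ * on the unlinked set to be processed on the next mount.
+ */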
+void
+zfs_rmnode(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *xzp = NULL;
+ char obj_name[17];
+ dmu_tx_t *tx;
+ uint64_t acl_obj;
+ int error;
+ int vfslocked;
+
+ vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs);
+
+ ASSERT(zp->z_phys->zp_links == 0);
+
+ /*
+ * If this is an attribute directory, purge its contents.
+ */
+ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
+ (zp->z_phys->zp_flags & ZFS_XATTR)) {
+ if (zfs_purgedir(zp) != 0) {
+ /*
+ * Not enough space to delete some xattrs.
+ * Leave it on the unlinked set.
+ */
+ VFS_UNLOCK_GIANT(vfslocked);
+ return;
+ }
+ }
+
+ /*
+ * If the file has extended attributes, we're going to unlink
+ * the xattr dir.
+ */
+ if (zp->z_phys->zp_xattr) {
+ error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+ ASSERT(error == 0);
+ }
+
+ acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+
+ /*
+ * Set up the transaction.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ if (xzp) {
+ dmu_tx_hold_bonus(tx, xzp->z_id);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+ }
+ if (acl_obj)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ /*
+ * Not enough space to delete the file. Leave it in the
+ * unlinked set, leaking it until the fs is remounted (at
+ * which point we'll call zfs_unlinked_drain() to process it).
+ */
+ dmu_tx_abort(tx);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return;
+ }
+
+ if (xzp) {
+ dmu_buf_will_dirty(xzp->z_dbuf, tx);
+ mutex_enter(&xzp->z_lock);
+ xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
+ xzp->z_phys->zp_links = 0; /* no more links to it */
+ mutex_exit(&xzp->z_lock);
+ zfs_unlinked_add(xzp, tx);
+ }
+
+ /* Remove this znode from the unlinked set */
+ error = zap_remove(os, zfsvfs->z_unlinkedobj,
+ zfs_unlinked_hexname(obj_name, zp->z_id), tx);
+ ASSERT3U(error, ==, 0);
+
+ zfs_znode_delete(zp, tx);
+
+ dmu_tx_commit(tx);
+
+ if (xzp)
+ VN_RELE(ZTOV(xzp));
+ VFS_UNLOCK_GIANT(vfslocked);
+}
+
+/*
+ * Link zp into dl. Can only fail if zp has been unlinked.
+ */
+int
+zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+{
+ znode_t *dzp = dl->dl_dzp;
+ vnode_t *vp = ZTOV(zp);
+ uint64_t value;
+ int zp_is_dir = (vp->v_type == VDIR);
+ int error;
+
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ mutex_enter(&zp->z_lock);
+
+ if (!(flag & ZRENAMING)) {
+ if (zp->z_unlinked) { /* no new links to unlinked zp */
+ ASSERT(!(flag & (ZNEW | ZEXISTS)));
+ mutex_exit(&zp->z_lock);
+ return (ENOENT);
+ }
+ zp->z_phys->zp_links++;
+ }
+ zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */
+
+ if (!(flag & ZNEW))
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ mutex_exit(&zp->z_lock);
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+ mutex_enter(&dzp->z_lock);
+ dzp->z_phys->zp_size++; /* one dirent added */
+ dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */
+ zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * MacOS X will fill in the 4-bit object type here.
+ */
+ value = ZFS_DIRENT_MAKE(IFTODT(zp->z_phys->zp_mode), zp->z_id);
+ error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
+ 8, 1, &value, tx);
+ ASSERT(error == 0);
+
+ dnlc_update(ZTOV(dzp), dl->dl_name, vp);
+
+ return (0);
+}
+
+/*
+ * Unlink zp from dl, and mark zp for deletion if this was the last link.
+ * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
+ boolean_t *unlinkedp)
+{
+ znode_t *dzp = dl->dl_dzp;
+ vnode_t *vp = ZTOV(zp);
+ int zp_is_dir = (vp->v_type == VDIR);
+ boolean_t unlinked = B_FALSE;
+ int error;
+
+ dnlc_remove(ZTOV(dzp), dl->dl_name);
+
+ if (!(flag & ZRENAMING)) {
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ if (vn_vfswlock(vp)) /* prevent new mounts on zp */
+ return (EBUSY);
+
+ if (vn_ismntpt(vp)) { /* don't remove mount point */
+ vn_vfsunlock(vp);
+ return (EBUSY);
+ }
+
+ mutex_enter(&zp->z_lock);
+ if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */
+ mutex_exit(&zp->z_lock);
+ vn_vfsunlock(vp);
+ return (ENOTEMPTY);
+ }
+ if (zp->z_phys->zp_links <= zp_is_dir) {
+ zfs_panic_recover("zfs: link count on vnode %p is %u, "
+ "should be at least %u", zp->z_vnode,
+ (int)zp->z_phys->zp_links,
+ zp_is_dir + 1);
+ zp->z_phys->zp_links = zp_is_dir + 1;
+ }
+ if (--zp->z_phys->zp_links == zp_is_dir) {
+ zp->z_unlinked = B_TRUE;
+ zp->z_phys->zp_links = 0;
+ unlinked = B_TRUE;
+ } else {
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ }
+ mutex_exit(&zp->z_lock);
+ vn_vfsunlock(vp);
+ }
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+ mutex_enter(&dzp->z_lock);
+ dzp->z_phys->zp_size--; /* one dirent removed */
+ dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */
+ zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+ mutex_exit(&dzp->z_lock);
+
+ error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx);
+ ASSERT(error == 0);
+
+ if (unlinkedp != NULL)
+ *unlinkedp = unlinked;
+ else if (unlinked)
+ zfs_unlinked_add(zp, tx);
+
+ return (0);
+}
+
+/*
+ * Indicate whether the directory is empty. Works with or without z_lock
+ * held, but can only be considered a hint in the latter case. Returns true
+ * if only "." and ".." remain and there's no work in progress.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+ return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
+}
+
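+/*
+ * Create the extended attribute directory for zp and link it to zp via
+ * zp_xattr.  The creation is logged as a TX_MKXATTR record.
+ */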
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ uint64_t xoid;
+ int error;
+
+ *xvpp = NULL;
+
+ if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr))
+ return (error);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0);
+ ASSERT(xzp->z_id == xoid);
+ ASSERT(xzp->z_phys->zp_parent == zp->z_id);
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ zp->z_phys->zp_xattr = xoid;
+
+ (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "");
+ dmu_tx_commit(tx);
+
+ *xvpp = ZTOV(xzp);
+
+ return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ * IN: zp - znode to obtain attribute directory from
+ * cr - credentials of caller
+ * flags - flags from the VOP_LOOKUP call
+ *
+ * OUT: xzpp - pointer to extended attribute znode
+ *
+ * RETURN: 0 on success
+ * error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ zfs_dirlock_t *dl;
+ vattr_t va;
+ int error;
+top:
+ error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR);
+ if (error)
+ return (error);
+
+ if (xzp != NULL) {
+ *xvpp = ZTOV(xzp);
+ zfs_dirent_unlock(dl);
+ return (0);
+ }
+
+ ASSERT(zp->z_phys->zp_xattr == 0);
+
+#ifdef TODO
+ if (!(flags & CREATE_XATTR_DIR)) {
+ zfs_dirent_unlock(dl);
+ return (ENOENT);
+ }
+#endif
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ zfs_dirent_unlock(dl);
+ return (EROFS);
+ }
+
+ /*
+ * The ability to 'create' files in an attribute
+ * directory comes from the write_xattr permission on the base file.
+ *
+ * The ability to 'search' an attribute directory requires
+ * read_xattr permission on the base file.
+ *
+ * Once in a directory the ability to read/write attributes
+ * is controlled by the permissions on the attribute file.
+ */
+ va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
+ va.va_type = VDIR;
+ va.va_mode = S_IFDIR | S_ISVTX | 0777;
+ va.va_uid = (uid_t)zp->z_phys->zp_uid;
+ va.va_gid = (gid_t)zp->z_phys->zp_gid;
+
+ error = zfs_make_xattrdir(zp, &va, xvpp, cr);
+ zfs_dirent_unlock(dl);
+
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ /* NB: we already did dmu_tx_wait() if necessary */
+ goto top;
+ }
+
+ return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ * you own the directory,
+ * you own the entry,
+ * the entry is a plain file and you have write access,
+ * or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+ uid_t uid;
+
+ if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */
+ return (0);
+
+ if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 ||
+ (uid = crgetuid(cr)) == zdp->z_phys->zp_uid ||
+ uid == zp->z_phys->zp_uid ||
+ (ZTOV(zp)->v_type == VREG &&
+ zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0))
+ return (0);
+ else
+ return (secpolicy_vnode_remove(cr));
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
new file mode 100644
index 000000000000..af765ba3f9a4
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
@@ -0,0 +1,336 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+#include <sys/fm/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+
+#ifdef _KERNEL
+/* Including sys/bus.h is just too hard, so I declare what I need here. */
+extern void devctl_notify(const char *__system, const char *__subsystem,
+ const char *__type, const char *__data);
+#endif
+
+/*
+ * This general routine is responsible for generating all the different ZFS
+ * ereports. The payload is dependent on the class, and which arguments are
+ * supplied to the function:
+ *
+ * EREPORT POOL VDEV IO
+ * block X X X
+ * data X X
+ * device X X
+ * pool X
+ *
+ * If we are in a loading state, all errors are chained together by the same
+ * SPA-wide ENA.
+ *
+ * For isolated I/O requests, we get the ENA from the zio_t. The propagation
+ * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
+ * to chain together all ereports associated with a logical piece of data. For
+ * read I/Os, there are basically three 'types' of I/O, which form a roughly
+ * layered diagram:
+ *
+ * +---------------+
+ * | Aggregate I/O | No associated logical data or device
+ * +---------------+
+ * |
+ * V
+ * +---------------+ Reads associated with a piece of logical data.
+ * | Read I/O | This includes reads on behalf of RAID-Z,
+ * +---------------+ mirrors, gang blocks, retries, etc.
+ * |
+ * V
+ * +---------------+ Reads associated with a particular device, but
+ * | Physical I/O | no logical data. Issued as part of vdev caching
+ * +---------------+ and I/O aggregation.
+ *
+ * Note that 'physical I/O' here is not the same terminology as used in the rest
+ * of ZIO. Typically, 'physical I/O' simply means that there is no attached
+ * blockpointer. But I/O with no associated block pointer can still be related
+ * to a logical piece of data (i.e. RAID-Z requests).
+ *
+ * Purely physical I/Os always have unique ENAs. They are not related to a
+ * particular piece of logical data, and therefore cannot be chained together.
+ * We still generate an ereport, but the DE doesn't correlate it with any
+ * logical piece of data. When such an I/O fails, the delegated I/O requests
+ * will issue a retry, which will trigger the 'real' ereport with the correct
+ * ENA.
+ *
+ * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
+ * When a new logical I/O is issued, we set this to point to itself. Child I/Os
+ * then inherit this pointer, so that when it is first set subsequent failures
+ * will use the same ENA. If a physical I/O is issued (by passing the
+ * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a
+ * unique ENA will be generated. For an aggregate I/O, this pointer is set to
+ * NULL, and no ereport will be generated (since it doesn't actually correspond
+ * to any particular device or piece of data).
+ */
+void
+zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+ uint64_t stateoroffset, uint64_t size)
+{
+#ifdef _KERNEL
+ char buf[1024];
+ char class[64];
+ struct sbuf sb;
+ struct timespec ts;
+
+ /*
+ * If we are doing a spa_tryimport(), ignore errors.
+ */
+ if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ return;
+
+ /*
+ * If we are in the middle of opening a pool, and the previous attempt
+ * failed, don't bother logging any new ereports - we're just going to
+ * get the same diagnosis anyway.
+ */
+ if (spa->spa_load_state != SPA_LOAD_NONE &&
+ spa->spa_last_open_failed)
+ return;
+
+ /*
+ * Ignore any errors from I/Os that we are going to retry anyway - we
+ * only generate errors from the final failure.
+ */
+ if (zio && zio_should_retry(zio))
+ return;
+
+ /*
+ * If this is not a read or write zio, ignore the error. This can occur
+ * if the DKIOCFLUSHWRITECACHE ioctl fails.
+ */
+ if (zio && zio->io_type != ZIO_TYPE_READ &&
+ zio->io_type != ZIO_TYPE_WRITE)
+ return;
+
+ nanotime(&ts);
+
+ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+ sbuf_printf(&sb, "time %ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
+
+ /*
+ * Serialize ereport generation
+ */
+ mutex_enter(&spa->spa_errlist_lock);
+
+#if 0
+ /*
+ * Determine the ENA to use for this event. If we are in a loading
+ * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
+ * a root zio-wide ENA. Otherwise, simply use a unique ENA.
+ */
+ if (spa->spa_load_state != SPA_LOAD_NONE) {
+#if 0
+ if (spa->spa_ena == 0)
+ spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
+#endif
+ ena = spa->spa_ena;
+ } else if (zio != NULL && zio->io_logical != NULL) {
+#if 0
+ if (zio->io_logical->io_ena == 0)
+ zio->io_logical->io_ena =
+ fm_ena_generate(0, FM_ENA_FMT1);
+#endif
+ ena = zio->io_logical->io_ena;
+ } else {
+#if 0
+ ena = fm_ena_generate(0, FM_ENA_FMT1);
+#else
+ ena = 0;
+#endif
+ }
+#endif
+
+ /*
+ * Construct the full class, detector, and other standard FMA fields.
+ */
+ sbuf_printf(&sb, " ereport_version %u", FM_EREPORT_VERSION);
+ snprintf(class, sizeof(class), "%s.%s", ZFS_ERROR_CLASS, subclass);
+ sbuf_printf(&sb, " class %s", class);
+
+ sbuf_printf(&sb, " zfs_scheme_version %u", FM_ZFS_SCHEME_VERSION);
+
+ /*
+ * Construct the per-ereport payload, depending on which parameters are
+ * passed in.
+ */
+
+ /*
+ * Generic payload members common to all ereports.
+ *
+ * The direct reference to spa_name is used rather than spa_name()
+ * because of the asynchronous nature of the zio pipeline. spa_name()
+ * asserts that the config lock is held in some form. This is always
+ * the case in I/O context, but because the check for RW_WRITER compares
+ * against 'curthread', we may be in an asynchronous context and blow
+ * this assert. Rather than loosen this assert, we acknowledge that all
+ * contexts in which this function is called (pool open, I/O) are safe,
+ * and dereference the name directly.
+ */
+ sbuf_printf(&sb, " %s %s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa->spa_name);
+ sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+ spa_guid(spa));
+ sbuf_printf(&sb, " %s %u", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT,
+ spa->spa_load_state);
+
+ if (vd != NULL) {
+ vdev_t *pvd = vd->vdev_parent;
+
+ sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ vd->vdev_guid);
+ sbuf_printf(&sb, " %s %s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ vd->vdev_ops->vdev_op_type);
+ if (vd->vdev_path)
+ sbuf_printf(&sb, " %s %s",
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path);
+ if (vd->vdev_devid)
+ sbuf_printf(&sb, " %s %s",
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid);
+
+ if (pvd != NULL) {
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, pvd->vdev_guid);
+ sbuf_printf(&sb, " %s %s",
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
+ pvd->vdev_ops->vdev_op_type);
+ if (pvd->vdev_path)
+ sbuf_printf(&sb, " %s %s",
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
+ pvd->vdev_path);
+ if (pvd->vdev_devid)
+ sbuf_printf(&sb, " %s %s",
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
+ pvd->vdev_devid);
+ }
+ }
+
+ if (zio != NULL) {
+ /*
+ * Payload common to all I/Os.
+ */
+ sbuf_printf(&sb, " %s %u", FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
+ zio->io_error);
+
+ /*
+ * If the 'size' parameter is non-zero, it indicates this is a
+ * RAID-Z or other I/O where the physical offset and length are
+ * provided for us, instead of within the zio_t.
+ */
+ if (vd != NULL) {
+ if (size) {
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ stateoroffset);
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, size);
+ } else {
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ zio->io_offset);
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ zio->io_size);
+ }
+ }
+
+ /*
+ * Payload for I/Os with corresponding logical information.
+ */
+ if (zio->io_logical != NULL) {
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
+ zio->io_logical->io_bookmark.zb_object);
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
+ zio->io_logical->io_bookmark.zb_level);
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
+ zio->io_logical->io_bookmark.zb_blkid);
+ }
+ } else if (vd != NULL) {
+ /*
+ * If we have a vdev but no zio, this is a device fault, and the
+ * 'stateoroffset' parameter indicates the previous state of the
+ * vdev.
+ */
+ sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
+ stateoroffset);
+ }
+ mutex_exit(&spa->spa_errlist_lock);
+
+ sbuf_finish(&sb);
+ devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb));
+ if (sbuf_overflowed(&sb))
+ printf("ZFS WARNING: sbuf overflowed\n");
+ sbuf_delete(&sb);
+#endif
+}
+
+/*
+ * The 'resource.fs.zfs.ok' event is an internal signal that the associated
+ * resource (pool or disk) has been identified by ZFS as healthy. This will
+ * then trigger the DE to close the associated case, if any.
+ */
+void
+zfs_post_ok(spa_t *spa, vdev_t *vd)
+{
+#ifdef _KERNEL
+ char buf[1024];
+ char class[64];
+ struct sbuf sb;
+ struct timespec ts;
+
+ nanotime(&ts);
+
+ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+ sbuf_printf(&sb, "time %ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
+
+ snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE,
+ ZFS_ERROR_CLASS, FM_RESOURCE_OK);
+ sbuf_printf(&sb, " %s %hhu", FM_VERSION, FM_RSRC_VERSION);
+ sbuf_printf(&sb, " %s %s", FM_CLASS, class);
+ sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+ spa_guid(spa));
+ if (vd)
+ sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ vd->vdev_guid);
+ sbuf_finish(&sb);
+ devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb));
+ if (sbuf_overflowed(&sb))
+ printf("ZFS WARNING: sbuf overflowed\n");
+ sbuf_delete(&sb);
+#endif
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
new file mode 100644
index 000000000000..aac1bb1b7810
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -0,0 +1,1811 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/nvpair.h>
+#include <sys/mount.h>
+#include <sys/taskqueue.h>
+#include <sys/sdt.h>
+#include <sys/varargs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zvol.h>
+
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+
+CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE);
+
+static struct cdev *zfsdev;
+
+extern void zfs_init(void);
+extern void zfs_fini(void);
+
+typedef int zfs_ioc_func_t(zfs_cmd_t *);
+typedef int zfs_secpolicy_func_t(const char *, cred_t *);
+
+typedef struct zfs_ioc_vec {
+ zfs_ioc_func_t *zvec_func;
+ zfs_secpolicy_func_t *zvec_secpolicy;
+ enum {
+ no_name,
+ pool_name,
+ dataset_name
+ } zvec_namecheck;
+} zfs_ioc_vec_t;
+
+/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiny */
+void
+__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ const char *newfile;
+ char buf[256];
+ va_list adx;
+
+ /*
+ * Get rid of annoying "../common/" prefix to filename.
+ */
+ newfile = strrchr(file, '/');
+ if (newfile != NULL) {
+ newfile = newfile + 1; /* Get rid of leading / */
+ } else {
+ newfile = file;
+ }
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ /*
+ * To get this data, use the zfs-dprintf probe as so:
+ * dtrace -q -n 'zfs-dprintf \
+ * /stringof(arg0) == "dbuf.c"/ \
+ * {printf("%s: %s", stringof(arg1), stringof(arg3))}'
+ * arg0 = file name
+ * arg1 = function name
+ * arg2 = line number
+ * arg3 = message
+ */
+ DTRACE_PROBE4(zfs__dprintf,
+ char *, newfile, char *, func, int, line, char *, buf);
+}
+
+/*
+ * Policy for top-level read operations (list pools). Requires no privileges,
+ * and can be used in the local zone, as there is no associated dataset.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_none(const char *unused1, cred_t *cr)
+{
+ return (0);
+}
+
+/*
+ * Policy for dataset read operations (list children, get statistics). Requires
+ * no privileges, but must be visible in the local zone.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_read(const char *dataset, cred_t *cr)
+{
+ if (INGLOBALZONE(curproc) ||
+ zone_dataset_visible(dataset, NULL))
+ return (0);
+
+ return (ENOENT);
+}
+
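+/*
+ * Common zone/jail check for dataset access: the dataset must be visible
+ * from the caller's zone, and the 'jailed' property must be consistent
+ * with where the caller is running.
+ */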
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+ uint64_t zoned;
+ int writable = 1;
+
+ /*
+ * The dataset must be visible by this zone -- check this first
+ * so they don't see EPERM on something they shouldn't know about.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ !zone_dataset_visible(dataset, &writable))
+ return (ENOENT);
+
+ if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
+ return (ENOENT);
+
+ if (INGLOBALZONE(curproc)) {
+ /*
+ * If the fs is zoned, only root can access it from the
+ * global zone.
+ */
+ if (secpolicy_zfs(cr) && zoned)
+ return (EPERM);
+ } else {
+ /*
+ * If we are in a local zone, the 'zoned' property must be set.
+ */
+ if (!zoned)
+ return (EPERM);
+
+ /* must be writable by this zone */
+ if (!writable)
+ return (EPERM);
+ }
+ return (0);
+}
+
+/*
+ * Policy for dataset write operations (create children, set properties, etc).
+ * Requires SYS_MOUNT privilege, and must be writable in the local zone.
+ */
+int
+zfs_secpolicy_write(const char *dataset, cred_t *cr)
+{
+ int error;
+
+ if (error = zfs_dozonecheck(dataset, cr))
+ return (error);
+
+ return (secpolicy_zfs(cr));
+}
+
+/*
+ * Policy for operations that want to write a dataset's parent:
+ * create, destroy, snapshot, clone, restore.
+ */
+static int
+zfs_secpolicy_parent(const char *dataset, cred_t *cr)
+{
+ char parentname[MAXNAMELEN];
+ char *cp;
+
+ /*
+ * Remove the @bla or /bla from the end of the name to get the parent.
+ */
+ (void) strncpy(parentname, dataset, sizeof (parentname));
+ cp = strrchr(parentname, '@');
+ if (cp != NULL) {
+ cp[0] = '\0';
+ } else {
+ cp = strrchr(parentname, '/');
+ if (cp == NULL)
+ return (ENOENT);
+ cp[0] = '\0';
+
+ }
+
+ return (zfs_secpolicy_write(parentname, cr));
+}
+
+/*
+ * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
+ * SYS_CONFIG privilege, which is not available in a local zone.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_config(const char *unused, cred_t *cr)
+{
+ if (secpolicy_sys_config(cr, B_FALSE) != 0)
+ return (EPERM);
+
+ return (0);
+}
+
+/*
+ * Policy for fault injection. Requires all privileges.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_inject(const char *unused, cred_t *cr)
+{
+ return (secpolicy_zinject(cr));
+}
+
+/*
+ * Policy for dataset backup operations (sendbackup).
+ * Requires SYS_MOUNT privilege, and must be writable in the local zone.
+ */
+static int
+zfs_secpolicy_operator(const char *dataset, cred_t *cr)
+{
+ int writable = 1;
+
+ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable))
+ return (ENOENT);
+ if (secpolicy_zfs(cr) != 0 && !groupmember(GID_OPERATOR, cr))
+ return (EPERM);
+ return (0);
+}
+
+/*
+ * Returns the nvlist as specified by the user in the zfs_cmd_t.
+ */
+static int
+get_nvlist(zfs_cmd_t *zc, nvlist_t **nvp)
+{
+ char *packed;
+ size_t size;
+ int error;
+ nvlist_t *config = NULL;
+
+ /*
+ * Read in and unpack the user-supplied nvlist.
+ */
+ if ((size = zc->zc_nvlist_src_size) == 0)
+ return (EINVAL);
+
+ packed = kmem_alloc(size, KM_SLEEP);
+
+ if ((error = xcopyin((void *)(uintptr_t)zc->zc_nvlist_src, packed,
+ size)) != 0) {
+ kmem_free(packed, size);
+ return (error);
+ }
+
+ if ((error = nvlist_unpack(packed, size, &config, 0)) != 0) {
+ kmem_free(packed, size);
+ return (error);
+ }
+
+ kmem_free(packed, size);
+
+ *nvp = config;
+ return (0);
+}
+
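+/*
+ * Pack the given nvlist and copy it out to the user-supplied buffer in
+ * the zfs_cmd_t.  If the buffer is too small, only the required size is
+ * returned (in zc_nvlist_dst_size) so userland can retry with a bigger
+ * buffer.
+ */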
+static int
+put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
+{
+ char *packed = NULL;
+ size_t size;
+ int error;
+
+ VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
+
+ if (size > zc->zc_nvlist_dst_size) {
+ /*
+ * Solaris returns ENOMEM here, because even if an error is
+ * returned from an ioctl(2), new zc_nvlist_dst_size will be
+ * passed to the userland. This is not the case for FreeBSD.
+ * We need to return 0, so the kernel will copy the
+ * zc_nvlist_dst_size back and the userland can discover that a
+ * bigger buffer is needed.
+ */
+ error = 0;
+ } else {
+ VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
+ KM_SLEEP) == 0);
+ error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
+ size);
+ kmem_free(packed, size);
+ }
+
+ zc->zc_nvlist_dst_size = size;
+ return (error);
+}
+
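+/*
+ * Create a new pool from the user-supplied vdev configuration; zc_value,
+ * if non-empty, is passed to spa_create() (the pool's alternate root).
+ */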
+static int
+zfs_ioc_pool_create(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *config;
+
+ if ((error = get_nvlist(zc, &config)) != 0)
+ return (error);
+
+ error = spa_create(zc->zc_name, config, zc->zc_value[0] == '\0' ?
+ NULL : zc->zc_value);
+
+ nvlist_free(config);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_destroy(zfs_cmd_t *zc)
+{
+ return (spa_destroy(zc->zc_name));
+}
+
+static int
+zfs_ioc_pool_import(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *config;
+ uint64_t guid;
+
+ if ((error = get_nvlist(zc, &config)) != 0)
+ return (error);
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
+ guid != zc->zc_guid)
+ error = EINVAL;
+ else
+ error = spa_import(zc->zc_name, config,
+ zc->zc_value[0] == '\0' ? NULL : zc->zc_value);
+
+ nvlist_free(config);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_export(zfs_cmd_t *zc)
+{
+ return (spa_export(zc->zc_name, NULL));
+}
+
+static int
+zfs_ioc_pool_configs(zfs_cmd_t *zc)
+{
+ nvlist_t *configs;
+ int error;
+
+ if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
+ return (EEXIST);
+
+ error = put_nvlist(zc, configs);
+
+ nvlist_free(configs);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_stats(zfs_cmd_t *zc)
+{
+ nvlist_t *config;
+ int error;
+ int ret = 0;
+
+ error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
+ sizeof (zc->zc_value));
+
+ if (config != NULL) {
+ ret = put_nvlist(zc, config);
+ nvlist_free(config);
+
+ /*
+ * The config may be present even if 'error' is non-zero.
+ * In this case we return success, and preserve the real errno
+ * in 'zc_cookie'.
+ */
+ zc->zc_cookie = error;
+ } else {
+ ret = error;
+ }
+
+ return (ret);
+}
+
+/*
+ * Try to import the given pool, returning pool stats as appropriate so that
+ * user land knows which devices are available and overall pool health.
+ */
+static int
+zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
+{
+ nvlist_t *tryconfig, *config;
+ int error;
+
+ if ((error = get_nvlist(zc, &tryconfig)) != 0)
+ return (error);
+
+ config = spa_tryimport(tryconfig);
+
+ nvlist_free(tryconfig);
+
+ if (config == NULL)
+ return (EINVAL);
+
+ error = put_nvlist(zc, config);
+ nvlist_free(config);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_scrub(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_scrub(spa, zc->zc_cookie, B_FALSE);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_freeze(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ spa_freeze(spa);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ spa_upgrade(spa);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_get_history(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *hist_buf;
+ uint64_t size;
+ int error;
+
+ if ((size = zc->zc_history_len) == 0)
+ return (EINVAL);
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) {
+ spa_close(spa, FTAG);
+ return (ENOTSUP);
+ }
+
+ hist_buf = kmem_alloc(size, KM_SLEEP);
+ if ((error = spa_history_get(spa, &zc->zc_history_offset,
+ &zc->zc_history_len, hist_buf)) == 0) {
+ error = xcopyout(hist_buf, (char *)(uintptr_t)zc->zc_history,
+ zc->zc_history_len);
+ }
+
+ spa_close(spa, FTAG);
+ kmem_free(hist_buf, size);
+ return (error);
+}
+
+static int
+zfs_ioc_pool_log_history(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *history_str = NULL;
+ size_t size;
+ int error;
+
+ size = zc->zc_history_len;
+ if (size == 0 || size > HIS_MAX_RECORD_LEN)
+ return (EINVAL);
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) {
+ spa_close(spa, FTAG);
+ return (ENOTSUP);
+ }
+
+ /* add one for the NULL delimiter */
+ size++;
+ history_str = kmem_alloc(size, KM_SLEEP);
+ if ((error = xcopyin((void *)(uintptr_t)zc->zc_history, history_str,
+ size)) != 0) {
+ spa_close(spa, FTAG);
+ kmem_free(history_str, size);
+ return (error);
+ }
+ history_str[size - 1] = '\0';
+
+ error = spa_history_log(spa, history_str, zc->zc_history_offset);
+
+ spa_close(spa, FTAG);
+ kmem_free(history_str, size);
+
+ return (error);
+}
+
+static int
+zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
+{
+ int error;
+
+ if (error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value))
+ return (error);
+
+ return (0);
+}
+
+static int
+zfs_ioc_obj_to_path(zfs_cmd_t *zc)
+{
+ objset_t *osp;
+ int error;
+
+ if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS,
+ DS_MODE_NONE | DS_MODE_READONLY, &osp)) != 0)
+ return (error);
+
+ error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value,
+ sizeof (zc->zc_value));
+ dmu_objset_close(osp);
+
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_add(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *config;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ /*
+ * A root pool with concatenated devices is not supported.
+ * Thus, we cannot add a device to a root pool with one device.
+ */
+ if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0) {
+ spa_close(spa, FTAG);
+ return (EDOM);
+ }
+
+ if ((error = get_nvlist(zc, &config)) == 0) {
+ error = spa_vdev_add(spa, config);
+ nvlist_free(config);
+ }
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_remove(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_online(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+ error = vdev_online(spa, zc->zc_guid);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_offline(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int istmp = zc->zc_cookie;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+ error = vdev_offline(spa, zc->zc_guid, istmp);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_attach(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int replacing = zc->zc_cookie;
+ nvlist_t *config;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if ((error = get_nvlist(zc, &config)) == 0) {
+ error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
+ nvlist_free(config);
+ }
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_detach(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *path = zc->zc_value;
+ uint64_t guid = zc->zc_guid;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = spa_vdev_setpath(spa, guid, path);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
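+/*
+ * Return fast statistics (and, if a destination buffer was supplied, the
+ * full property nvlist) for the named objset, along with the pool's
+ * alternate root in zc_value.
+ */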
+static int
+zfs_ioc_objset_stats(zfs_cmd_t *zc)
+{
+ objset_t *os = NULL;
+ int error;
+ nvlist_t *nv;
+
+retry:
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+ if (error != 0) {
+ /*
+ * This is ugly: dmu_objset_open() can return EBUSY if
+ * the objset is held exclusively. Fortunately this hold is
+ * only for a short while, so we retry here.
+ * This avoids user code having to handle EBUSY,
+ * for example for a "zfs list".
+ */
+ if (error == EBUSY) {
+ delay(1);
+ goto retry;
+ }
+ return (error);
+ }
+
+ dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+
+ if (zc->zc_nvlist_dst != 0 &&
+ (error = dsl_prop_get_all(os, &nv)) == 0) {
+ dmu_objset_stats(os, nv);
+ /*
+ * NB: zvol_get_stats() will read the objset contents,
+ * which we aren't supposed to do with a
+ * DS_MODE_STANDARD open, because it could be
+ * inconsistent. So this is a bit of a workaround...
+ */
+ if (!zc->zc_objset_stats.dds_inconsistent &&
+ dmu_objset_type(os) == DMU_OST_ZVOL)
+ VERIFY(zvol_get_stats(os, nv) == 0);
+ error = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ }
+
+ spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value));
+
+ dmu_objset_close(os);
+ if (error == ENOMEM)
+ error = 0;
+ return (error);
+}
+
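+/*
+ * Iterate to the next child dataset of zc_name, skipping datasets that
+ * are not visible from the caller's zone, and fill in its statistics.
+ */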
+static int
+zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+ char *p;
+
+retry:
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+ if (error != 0) {
+ /*
+ * This is ugly: dmu_objset_open() can return EBUSY if
+ * the objset is held exclusively. Fortunately this hold is
+ * only for a short while, so we retry here.
+ * This avoids user code having to handle EBUSY,
+ * for example for a "zfs list".
+ */
+ if (error == EBUSY) {
+ delay(1);
+ goto retry;
+ }
+ if (error == ENOENT)
+ error = ESRCH;
+ return (error);
+ }
+
+ p = strrchr(zc->zc_name, '/');
+ if (p == NULL || p[1] != '\0')
+ (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
+ p = zc->zc_name + strlen(zc->zc_name);
+
+ do {
+ error = dmu_dir_list_next(os,
+ sizeof (zc->zc_name) - (p - zc->zc_name), p,
+ NULL, &zc->zc_cookie);
+ if (error == ENOENT)
+ error = ESRCH;
+ } while (error == 0 && !INGLOBALZONE(curproc) &&
+ !zone_dataset_visible(zc->zc_name, NULL));
+
+ /*
+ * If it's a hidden dataset (ie. with a '$' in its name), don't
+ * try to get stats for it. Userland will skip over it.
+ */
+ if (error == 0 && strchr(zc->zc_name, '$') == NULL)
+ error = zfs_ioc_objset_stats(zc); /* fill in the stats */
+
+ dmu_objset_close(os);
+ return (error);
+}
+
+static int
+zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+retry:
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+ if (error != 0) {
+ /*
+ * This is ugly: dmu_objset_open() can return EBUSY if
+ * the objset is held exclusively. Fortunately this hold is
+ * only for a short while, so we retry here.
+ * This avoids user code having to handle EBUSY,
+ * for example for a "zfs list".
+ */
+ if (error == EBUSY) {
+ delay(1);
+ goto retry;
+ }
+ if (error == ENOENT)
+ error = ESRCH;
+ return (error);
+ }
+
+ /*
+ * A dataset name of maximum length cannot have any snapshots,
+ * so exit immediately.
+ */
+ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
+ dmu_objset_close(os);
+ return (ESRCH);
+ }
+
+ error = dmu_snapshot_list_next(os,
+ sizeof (zc->zc_name) - strlen(zc->zc_name),
+ zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie);
+ if (error == ENOENT)
+ error = ESRCH;
+
+ if (error == 0)
+ error = zfs_ioc_objset_stats(zc); /* fill in the stats */
+
+ dmu_objset_close(os);
+ return (error);
+}
+
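+/*
+ * Validate and apply each property in the nvlist to the named dataset.
+ * User-defined properties are set directly; native properties are checked
+ * for permissions and type before being dispatched to the appropriate
+ * setter (quota, reservation, volsize, volblocksize, or dsl_prop_set()).
+ */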
+static int
+zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl)
+{
+ nvpair_t *elem;
+ int error;
+ const char *propname;
+ zfs_prop_t prop;
+ uint64_t intval;
+ char *strval;
+ char buf[MAXNAMELEN];
+ const char *p;
+ spa_t *spa;
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+ propname = nvpair_name(elem);
+
+ if ((prop = zfs_name_to_prop(propname)) ==
+ ZFS_PROP_INVAL) {
+ /*
+ * If this is a user-defined property, it must be a
+ * string, and there is no further validation to do.
+ */
+ if (!zfs_prop_user(propname) ||
+ nvpair_type(elem) != DATA_TYPE_STRING)
+ return (EINVAL);
+
+ VERIFY(nvpair_value_string(elem, &strval) == 0);
+ error = dsl_prop_set(name, propname, 1,
+ strlen(strval) + 1, strval);
+ if (error == 0)
+ continue;
+ else
+ return (error);
+ }
+
+ /*
+ * Check permissions for special properties.
+ */
+ switch (prop) {
+ case ZFS_PROP_ZONED:
+ /*
+ * Disallow setting of 'zoned' from within a local zone.
+ */
+ if (!INGLOBALZONE(curproc))
+ return (EPERM);
+ break;
+
+ case ZFS_PROP_QUOTA:
+ if (error = zfs_dozonecheck(name, cr))
+ return (error);
+
+ if (!INGLOBALZONE(curproc)) {
+ uint64_t zoned;
+ char setpoint[MAXNAMELEN];
+ int dslen;
+ /*
+ * Unprivileged users are allowed to modify the
+ * quota on things *under* (ie. contained by)
+ * the thing they own.
+ */
+ if (dsl_prop_get_integer(name, "jailed", &zoned,
+ setpoint))
+ return (EPERM);
+ if (!zoned) /* this shouldn't happen */
+ return (EPERM);
+ dslen = strlen(name);
+ if (dslen <= strlen(setpoint))
+ return (EPERM);
+ }
+ break;
+
+ case ZFS_PROP_COMPRESSION:
+ /*
+ * If the user specified gzip compression, make sure
+ * the SPA supports it. We ignore any errors here since
+ * we'll catch them later.
+ */
+ if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
+ nvpair_value_uint64(elem, &intval) == 0 &&
+ intval >= ZIO_COMPRESS_GZIP_1 &&
+ intval <= ZIO_COMPRESS_GZIP_9) {
+ if ((p = strchr(name, '/')) == NULL) {
+ p = name;
+ } else {
+ bcopy(name, buf, p - name);
+ buf[p - name] = '\0';
+ p = buf;
+ }
+
+ if (spa_open(p, &spa, FTAG) == 0) {
+ if (spa_version(spa) <
+ ZFS_VERSION_GZIP_COMPRESSION) {
+ spa_close(spa, FTAG);
+ return (ENOTSUP);
+ }
+
+ spa_close(spa, FTAG);
+ }
+ }
+ break;
+ }
+
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
+ (error = dsl_dir_set_quota(name,
+ intval)) != 0)
+ return (error);
+ break;
+
+ case ZFS_PROP_RESERVATION:
+ if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
+ (error = dsl_dir_set_reservation(name,
+ intval)) != 0)
+ return (error);
+ break;
+
+ case ZFS_PROP_VOLSIZE:
+ if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
+ (error = zvol_set_volsize(name, dev,
+ intval)) != 0)
+ return (error);
+ break;
+
+ case ZFS_PROP_VOLBLOCKSIZE:
+ if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
+ (error = zvol_set_volblocksize(name,
+ intval)) != 0)
+ return (error);
+ break;
+
+ default:
+ if (nvpair_type(elem) == DATA_TYPE_STRING) {
+ if (zfs_prop_get_type(prop) !=
+ prop_type_string)
+ return (EINVAL);
+ VERIFY(nvpair_value_string(elem, &strval) == 0);
+ if ((error = dsl_prop_set(name,
+ nvpair_name(elem), 1, strlen(strval) + 1,
+ strval)) != 0)
+ return (error);
+ } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
+ const char *unused;
+
+ VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+
+ switch (zfs_prop_get_type(prop)) {
+ case prop_type_number:
+ break;
+ case prop_type_boolean:
+ if (intval > 1)
+ return (EINVAL);
+ break;
+ case prop_type_string:
+ return (EINVAL);
+ case prop_type_index:
+ if (zfs_prop_index_to_string(prop,
+ intval, &unused) != 0)
+ return (EINVAL);
+ break;
+ default:
+ cmn_err(CE_PANIC, "unknown property "
+ "type");
+ break;
+ }
+
+ if ((error = dsl_prop_set(name, propname,
+ 8, 1, &intval)) != 0)
+ return (error);
+ } else {
+ return (EINVAL);
+ }
+ break;
+ }
+ }
+
+ return (0);
+}
+
+static int
+zfs_ioc_set_prop(zfs_cmd_t *zc)
+{
+ nvlist_t *nvl;
+ int error;
+ zfs_prop_t prop;
+
+ /*
+ * If zc_value is set, then this is an attempt to inherit a value.
+ * Otherwise, zc_nvlist refers to a list of properties to set.
+ */
+ if (zc->zc_value[0] != '\0') {
+ if (!zfs_prop_user(zc->zc_value) &&
+ ((prop = zfs_name_to_prop(zc->zc_value)) ==
+ ZFS_PROP_INVAL ||
+ !zfs_prop_inheritable(prop)))
+ return (EINVAL);
+
+ return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL));
+ }
+
+ if ((error = get_nvlist(zc, &nvl)) != 0)
+ return (error);
+
+ error = zfs_set_prop_nvlist(zc->zc_name, zc->zc_dev,
+ (cred_t *)(uintptr_t)zc->zc_cred, nvl);
+ nvlist_free(nvl);
+ return (error);
+}
+
+static int
+zfs_ioc_pool_props_set(zfs_cmd_t *zc)
+{
+ nvlist_t *nvl;
+ int error, reset_bootfs = 0;
+ uint64_t objnum;
+ zpool_prop_t prop;
+ nvpair_t *elem;
+ char *propname, *strval;
+ spa_t *spa;
+ vdev_t *rvdev;
+ char *vdev_type;
+ objset_t *os;
+
+ if ((error = get_nvlist(zc, &nvl)) != 0)
+ return (error);
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
+ nvlist_free(nvl);
+ return (error);
+ }
+
+ if (spa_version(spa) < ZFS_VERSION_BOOTFS) {
+ nvlist_free(nvl);
+ spa_close(spa, FTAG);
+ return (ENOTSUP);
+ }
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+
+ propname = nvpair_name(elem);
+
+ if ((prop = zpool_name_to_prop(propname)) ==
+ ZFS_PROP_INVAL) {
+ nvlist_free(nvl);
+ spa_close(spa, FTAG);
+ return (EINVAL);
+ }
+
+ switch (prop) {
+ case ZFS_PROP_BOOTFS:
+ /*
+ * A bootable filesystem cannot be on a RAIDZ pool
+ * or on a striped pool with more than one device.
+ */
+ rvdev = spa->spa_root_vdev;
+ vdev_type =
+ rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
+ if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
+ (strcmp(vdev_type, VDEV_TYPE_MIRROR) != 0 &&
+ rvdev->vdev_children > 1)) {
+ error = ENOTSUP;
+ break;
+ }
+
+ reset_bootfs = 1;
+
+ VERIFY(nvpair_value_string(elem, &strval) == 0);
+ if (strval == NULL || strval[0] == '\0') {
+ objnum =
+ zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
+ break;
+ }
+
+ if (error = dmu_objset_open(strval, DMU_OST_ZFS,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os))
+ break;
+ objnum = dmu_objset_id(os);
+ dmu_objset_close(os);
+ break;
+
+ default:
+ error = EINVAL;
+ }
+
+ if (error)
+ break;
+ }
+ if (error == 0) {
+ if (reset_bootfs) {
+ VERIFY(nvlist_remove(nvl,
+ zpool_prop_to_name(ZFS_PROP_BOOTFS),
+ DATA_TYPE_STRING) == 0);
+ VERIFY(nvlist_add_uint64(nvl,
+ zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0);
+ }
+ error = spa_set_props(spa, nvl);
+ }
+
+ nvlist_free(nvl);
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_props_get(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *nvp = NULL;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_get_props(spa, &nvp);
+
+ if (error == 0 && zc->zc_nvlist_dst != 0)
+ error = put_nvlist(zc, nvp);
+ else
+ error = EFAULT;
+
+ spa_close(spa, FTAG);
+
+ if (nvp)
+ nvlist_free(nvp);
+ return (error);
+}
+
+static int
+zfs_ioc_create_minor(zfs_cmd_t *zc)
+{
+ return (zvol_create_minor(zc->zc_name, zc->zc_dev));
+}
+
+static int
+zfs_ioc_remove_minor(zfs_cmd_t *zc)
+{
+ return (zvol_remove_minor(zc->zc_name));
+}
+
+/*
+ * Search the vfs list for a specified resource. Returns a pointer to it
+ * or NULL if no suitable entry is found. The caller of this routine
+ * is responsible for releasing the returned vfs pointer.
+ */
+static vfs_t *
+zfs_get_vfs(const char *resource)
+{
+ vfs_t *vfsp;
+
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(vfsp, &mountlist, mnt_list) {
+ if (strcmp(vfsp->mnt_stat.f_mntfromname, resource) == 0) {
+ VFS_HOLD(vfsp);
+ break;
+ }
+ }
+ mtx_unlock(&mountlist_mtx);
+ return (vfsp);
+}
+
+static void
+zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
+{
+ zfs_create_data_t *zc = arg;
+
+ zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx);
+}
+
+static int
+zfs_ioc_create(zfs_cmd_t *zc)
+{
+ objset_t *clone;
+ int error = 0;
+ zfs_create_data_t cbdata = { 0 };
+ void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
+ dmu_objset_type_t type = zc->zc_objset_type;
+
+ switch (type) {
+
+ case DMU_OST_ZFS:
+ cbfunc = zfs_create_cb;
+ break;
+
+ case DMU_OST_ZVOL:
+ cbfunc = zvol_create_cb;
+ break;
+
+ default:
+ cbfunc = NULL;
+ }
+ if (strchr(zc->zc_name, '@'))
+ return (EINVAL);
+
+ if (zc->zc_nvlist_src != 0 &&
+ (error = get_nvlist(zc, &cbdata.zc_props)) != 0)
+ return (error);
+
+ cbdata.zc_cred = (cred_t *)(uintptr_t)zc->zc_cred;
+ cbdata.zc_dev = (dev_t)zc->zc_dev;
+
+ if (zc->zc_value[0] != '\0') {
+ /*
+ * We're creating a clone of an existing snapshot.
+ */
+ zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
+ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) {
+ nvlist_free(cbdata.zc_props);
+ return (EINVAL);
+ }
+
+ error = dmu_objset_open(zc->zc_value, type,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &clone);
+ if (error) {
+ nvlist_free(cbdata.zc_props);
+ return (error);
+ }
+ error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL);
+ dmu_objset_close(clone);
+ } else {
+ if (cbfunc == NULL) {
+ nvlist_free(cbdata.zc_props);
+ return (EINVAL);
+ }
+
+ if (type == DMU_OST_ZVOL) {
+ uint64_t volsize, volblocksize;
+
+ if (cbdata.zc_props == NULL ||
+ nvlist_lookup_uint64(cbdata.zc_props,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE),
+ &volsize) != 0) {
+ nvlist_free(cbdata.zc_props);
+ return (EINVAL);
+ }
+
+ if ((error = nvlist_lookup_uint64(cbdata.zc_props,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+ &volblocksize)) != 0 && error != ENOENT) {
+ nvlist_free(cbdata.zc_props);
+ return (EINVAL);
+ }
+
+ if (error != 0)
+ volblocksize = zfs_prop_default_numeric(
+ ZFS_PROP_VOLBLOCKSIZE);
+
+ if ((error = zvol_check_volblocksize(
+ volblocksize)) != 0 ||
+ (error = zvol_check_volsize(volsize,
+ volblocksize)) != 0) {
+ nvlist_free(cbdata.zc_props);
+ return (error);
+ }
+ }
+
+ error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc,
+ &cbdata);
+ }
+
+ /*
+ * It would be nice to do this atomically.
+ */
+ if (error == 0) {
+ if ((error = zfs_set_prop_nvlist(zc->zc_name,
+ zc->zc_dev, (cred_t *)(uintptr_t)zc->zc_cred,
+ cbdata.zc_props)) != 0)
+ (void) dmu_objset_destroy(zc->zc_name);
+ }
+
+ nvlist_free(cbdata.zc_props);
+ return (error);
+}
+
+static int
+zfs_ioc_snapshot(zfs_cmd_t *zc)
+{
+ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
+ return (EINVAL);
+ return (dmu_objset_snapshot(zc->zc_name,
+ zc->zc_value, zc->zc_cookie));
+}
+
+static int
+zfs_unmount_snap(char *name, void *arg)
+{
+ char *snapname = arg;
+ char *cp;
+ vfs_t *vfsp = NULL;
+
+ /*
+ * Snapshots (which are under .zfs control) must be unmounted
+ * before they can be destroyed.
+ */
+
+ if (snapname) {
+ (void) strcat(name, "@");
+ (void) strcat(name, snapname);
+ vfsp = zfs_get_vfs(name);
+ cp = strchr(name, '@');
+ *cp = '\0';
+ } else if (strchr(name, '@')) {
+ vfsp = zfs_get_vfs(name);
+ }
+
+ if (vfsp) {
+ /*
+ * Always force the unmount for snapshots.
+ */
+ int flag = MS_FORCE;
+ int err;
+
+ if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
+ VFS_RELE(vfsp);
+ return (err);
+ }
+ VFS_RELE(vfsp);
+ mtx_lock(&Giant); /* dounmount() */
+ dounmount(vfsp, flag, curthread);
+ mtx_unlock(&Giant); /* dounmount() */
+ }
+ return (0);
+}
+
+static int
+zfs_ioc_destroy_snaps(zfs_cmd_t *zc)
+{
+ int err;
+
+ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
+ return (EINVAL);
+ err = dmu_objset_find(zc->zc_name,
+ zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN);
+ if (err)
+ return (err);
+ return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value));
+}
+
+static int
+zfs_ioc_destroy(zfs_cmd_t *zc)
+{
+ if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) {
+ int err = zfs_unmount_snap(zc->zc_name, NULL);
+ if (err)
+ return (err);
+ }
+
+ return (dmu_objset_destroy(zc->zc_name));
+}
+
+static int
+zfs_ioc_rollback(zfs_cmd_t *zc)
+{
+ return (dmu_objset_rollback(zc->zc_name));
+}
+
+static int
+zfs_ioc_rename(zfs_cmd_t *zc)
+{
+ zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
+ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0)
+ return (EINVAL);
+
+ if (strchr(zc->zc_name, '@') != NULL &&
+ zc->zc_objset_type == DMU_OST_ZFS) {
+ int err = zfs_unmount_snap(zc->zc_name, NULL);
+ if (err)
+ return (err);
+ }
+
+ return (dmu_objset_rename(zc->zc_name, zc->zc_value));
+}
+
+static int
+zfs_ioc_recvbackup(zfs_cmd_t *zc)
+{
+ kthread_t *td = curthread;
+ struct file *fp;
+ int error;
+ offset_t new_off;
+
+ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
+ strchr(zc->zc_value, '@') == NULL)
+ return (EINVAL);
+
+ error = fget_read(td, zc->zc_cookie, &fp);
+ if (error)
+ return (error);
+
+ error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record,
+ &zc->zc_cookie, (boolean_t)zc->zc_guid, fp,
+ fp->f_offset);
+
+ new_off = fp->f_offset + zc->zc_cookie;
+ fp->f_offset = new_off;
+
+ fdrop(fp, td);
+ return (error);
+}
+
+static int
+zfs_ioc_sendbackup(zfs_cmd_t *zc)
+{
+ kthread_t *td = curthread;
+ struct file *fp;
+ objset_t *fromsnap = NULL;
+ objset_t *tosnap;
+ int error, fd;
+
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap);
+ if (error)
+ return (error);
+
+ if (zc->zc_value[0] != '\0') {
+ char buf[MAXPATHLEN];
+ char *cp;
+
+ (void) strncpy(buf, zc->zc_name, sizeof (buf));
+ cp = strchr(buf, '@');
+ if (cp)
+ *(cp+1) = 0;
+ (void) strlcat(buf, zc->zc_value, sizeof (buf));
+ error = dmu_objset_open(buf, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap);
+ if (error) {
+ dmu_objset_close(tosnap);
+ return (error);
+ }
+ }
+
+ fd = zc->zc_cookie;
+ error = fget_write(td, fd, &fp);
+ if (error) {
+ dmu_objset_close(tosnap);
+ if (fromsnap)
+ dmu_objset_close(fromsnap);
+ return (error);
+ }
+
+ error = dmu_sendbackup(tosnap, fromsnap, fp);
+
+ fdrop(fp, td);
+ if (fromsnap)
+ dmu_objset_close(fromsnap);
+ dmu_objset_close(tosnap);
+ return (error);
+}
+
+static int
+zfs_ioc_inject_fault(zfs_cmd_t *zc)
+{
+ int id, error;
+
+ error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
+ &zc->zc_inject_record);
+
+ if (error == 0)
+ zc->zc_guid = (uint64_t)id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear_fault(zfs_cmd_t *zc)
+{
+ return (zio_clear_fault((int)zc->zc_guid));
+}
+
+static int
+zfs_ioc_inject_list_next(zfs_cmd_t *zc)
+{
+ int id = (int)zc->zc_guid;
+ int error;
+
+ error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
+ &zc->zc_inject_record);
+
+ zc->zc_guid = id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_error_log(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ size_t count = (size_t)zc->zc_nvlist_dst_size;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
+ &count);
+ if (error == 0)
+ zc->zc_nvlist_dst_size = count;
+ else
+ zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ vdev_t *vd;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ if (zc->zc_guid == 0) {
+ vd = NULL;
+ } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) {
+ spa_config_exit(spa, FTAG);
+ spa_close(spa, FTAG);
+ return (ENODEV);
+ }
+
+ vdev_clear(spa, vd);
+
+ spa_config_exit(spa, FTAG);
+
+ spa_close(spa, FTAG);
+
+ return (0);
+}
+
+static int
+zfs_ioc_promote(zfs_cmd_t *zc)
+{
+ char *cp;
+
+ /*
+ * We don't need to unmount *all* the origin fs's snapshots, but
+ * it's easier.
+ */
+ cp = strchr(zc->zc_value, '@');
+ if (cp)
+ *cp = '\0';
+ (void) dmu_objset_find(zc->zc_value,
+ zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
+ return (dsl_dataset_promote(zc->zc_name));
+}
+
+static int
+zfs_ioc_jail(zfs_cmd_t *zc)
+{
+
+ return (zone_dataset_attach((cred_t *)(uintptr_t)zc->zc_cred,
+ zc->zc_name, (int)zc->zc_jailid));
+}
+
+static int
+zfs_ioc_unjail(zfs_cmd_t *zc)
+{
+
+ return (zone_dataset_detach((cred_t *)(uintptr_t)zc->zc_cred,
+ zc->zc_name, (int)zc->zc_jailid));
+}
+
+static zfs_ioc_vec_t zfs_ioc_vec[] = {
+ { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_import, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_export, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_configs, zfs_secpolicy_none, no_name },
+ { zfs_ioc_pool_stats, zfs_secpolicy_read, pool_name },
+ { zfs_ioc_pool_tryimport, zfs_secpolicy_config, no_name },
+ { zfs_ioc_pool_scrub, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_freeze, zfs_secpolicy_config, no_name },
+ { zfs_ioc_pool_upgrade, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_get_history, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_log_history, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_setpath, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_objset_stats, zfs_secpolicy_read, dataset_name },
+ { zfs_ioc_dataset_list_next, zfs_secpolicy_read, dataset_name },
+ { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, dataset_name },
+ { zfs_ioc_set_prop, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_create_minor, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_remove_minor, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_create, zfs_secpolicy_parent, dataset_name },
+ { zfs_ioc_destroy, zfs_secpolicy_parent, dataset_name },
+ { zfs_ioc_rollback, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_rename, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_sendbackup, zfs_secpolicy_operator, dataset_name },
+ { zfs_ioc_inject_fault, zfs_secpolicy_inject, no_name },
+ { zfs_ioc_clear_fault, zfs_secpolicy_inject, no_name },
+ { zfs_ioc_inject_list_next, zfs_secpolicy_inject, no_name },
+ { zfs_ioc_error_log, zfs_secpolicy_inject, pool_name },
+ { zfs_ioc_clear, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_promote, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_destroy_snaps, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_snapshot, zfs_secpolicy_operator, dataset_name },
+ { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_obj_to_path, zfs_secpolicy_config, no_name },
+ { zfs_ioc_pool_props_set, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_props_get, zfs_secpolicy_read, pool_name },
+ { zfs_ioc_jail, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_unjail, zfs_secpolicy_config, dataset_name }
+};
+
+static int
+zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
+ struct thread *td)
+{
+ zfs_cmd_t *zc = (void *)addr;
+ uint_t vec;
+ int error;
+
+ vec = ZFS_IOC(cmd);
+
+ if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
+ return (EINVAL);
+
+ zc->zc_cred = (uintptr_t)td->td_ucred;
+ zc->zc_dev = (uintptr_t)dev;
+ error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name, td->td_ucred);
+
+ /*
+ * Ensure that all pool/dataset names are valid before we pass down to
+ * the lower layers.
+ */
+ if (error == 0) {
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ switch (zfs_ioc_vec[vec].zvec_namecheck) {
+ case pool_name:
+ if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = EINVAL;
+ break;
+
+ case dataset_name:
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = EINVAL;
+ break;
+
+ case no_name:
+ break;
+ }
+ }
+
+ if (error == 0)
+ error = zfs_ioc_vec[vec].zvec_func(zc);
+
+ return (error);
+}
+
+/*
+ * OK, so this is a little weird.
+ *
+ * /dev/zfs is the control node, i.e. minor 0.
+ * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
+ *
+ * /dev/zfs has basically nothing to do except serve up ioctls,
+ * so most of the standard driver entry points are in zvol.c.
+ */
+static struct cdevsw zfs_cdevsw = {
+ .d_version = D_VERSION,
+ .d_ioctl = zfsdev_ioctl,
+ .d_name = ZFS_DEV_NAME
+};
+
+static void
+zfsdev_init(void)
+{
+ zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0660,
+ ZFS_DEV_NAME);
+}
+
+static void
+zfsdev_fini(void)
+{
+ if (zfsdev != NULL)
+ destroy_dev(zfsdev);
+}
+
+static struct task zfs_start_task;
+
+static void
+zfs_start(void *context __unused, int pending __unused)
+{
+
+ zfsdev_init();
+ spa_init(FREAD | FWRITE);
+ zfs_init();
+ zvol_init();
+ printf("ZFS storage pool version " ZFS_VERSION_STRING "\n");
+}
+
+static int
+zfs_modevent(module_t mod, int type, void *unused __unused)
+{
+ int error;
+
+ error = EOPNOTSUPP;
+ switch (type) {
+ case MOD_LOAD:
+ printf("WARNING: ZFS is considered to be an experimental "
+ "feature in FreeBSD.\n");
+ TASK_INIT(&zfs_start_task, 0, zfs_start, NULL);
+ taskqueue_enqueue(taskqueue_thread, &zfs_start_task);
+ error = 0;
+ break;
+ case MOD_UNLOAD:
+ if (spa_busy() || /* zfs_busy() || */ zvol_busy() ||
+ zio_injection_enabled) {
+ error = EBUSY;
+ break;
+ }
+ zvol_fini();
+ zfs_fini();
+ spa_fini();
+ zfsdev_fini();
+ error = 0;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t zfs_mod = {
+ "zfsctrl",
+ zfs_modevent,
+ 0
+};
+DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_MOUNT_ROOT, SI_ORDER_ANY);
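The ioctl path above is fully table-driven: zfsdev_ioctl() indexes zfs_ioc_vec[] by ZFS_IOC(cmd), runs that entry's security-policy callback, validates the pool or dataset name according to zvec_namecheck, and only then calls the handler. A minimal user-space sketch of the same pattern follows; the demo_* names are hypothetical and not part of the ZFS sources.

#include <errno.h>

typedef int (*demo_func_t)(const char *name);
typedef int (*demo_policy_t)(const char *name);
typedef enum { DEMO_NO_NAME, DEMO_POOL_NAME } demo_namecheck_t;

typedef struct demo_vec_entry {
	demo_func_t	zvec_func;	/* the handler itself */
	demo_policy_t	zvec_secpolicy;	/* permission check */
	demo_namecheck_t zvec_namecheck; /* how to validate the name */
} demo_vec_entry_t;

static int demo_allow(const char *name) { (void)name; return (0); }
static int demo_noop(const char *name) { (void)name; return (0); }

static const demo_vec_entry_t demo_vec[] = {
	{ demo_noop, demo_allow, DEMO_POOL_NAME },
};

static int
demo_dispatch(unsigned vec, const char *name)
{
	int error;

	/* Bounds-check the vector index, as zfsdev_ioctl() does. */
	if (vec >= sizeof (demo_vec) / sizeof (demo_vec[0]))
		return (EINVAL);
	/* Security policy first, then name validation, then the handler. */
	if ((error = demo_vec[vec].zvec_secpolicy(name)) != 0)
		return (error);
	if (demo_vec[vec].zvec_namecheck == DEMO_POOL_NAME && name[0] == '\0')
		return (EINVAL);
	return (demo_vec[vec].zvec_func(name));
}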
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
new file mode 100644
index 000000000000..06cb95a2d7ee
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
@@ -0,0 +1,348 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+
+/*
+ * All the functions in this file are used to construct the log entries
+ * to record transactions. They allocate an intent log transaction
+ * structure (itx_t) and save within it all the information necessary to
+ * possibly replay the transaction. The itx is then assigned a sequence
+ * number and inserted in the in-memory list anchored in the zilog.
+ */
+
+/*
+ * zfs_log_create() is used to handle TX_CREATE, TX_MKDIR and TX_MKXATTR
+ * transactions.
+ */
+void
+zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_create_t *lr;
+ size_t namesize = strlen(name) + 1;
+
+ if (zilog == NULL)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_foid = zp->z_id;
+ lr->lr_mode = zp->z_phys->zp_mode;
+ lr->lr_uid = zp->z_phys->zp_uid;
+ lr->lr_gid = zp->z_phys->zp_gid;
+ lr->lr_gen = zp->z_phys->zp_gen;
+ lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
+ lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+ lr->lr_rdev = zp->z_phys->zp_rdev;
+ bcopy(name, (char *)(lr + 1), namesize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+ zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
+ */
+void
+zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, char *name)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_remove_t *lr;
+ size_t namesize = strlen(name) + 1;
+
+ if (zilog == NULL)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_remove_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ bcopy(name, (char *)(lr + 1), namesize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_link() handles TX_LINK transactions.
+ */
+void
+zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_link_t *lr;
+ size_t namesize = strlen(name) + 1;
+
+ if (zilog == NULL)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_link_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_link_obj = zp->z_id;
+ bcopy(name, (char *)(lr + 1), namesize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+ zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_symlink() handles TX_SYMLINK transactions.
+ */
+void
+zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name, char *link)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_create_t *lr;
+ size_t namesize = strlen(name) + 1;
+ size_t linksize = strlen(link) + 1;
+
+ if (zilog == NULL)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_foid = zp->z_id;
+ lr->lr_mode = zp->z_phys->zp_mode;
+ lr->lr_uid = zp->z_phys->zp_uid;
+ lr->lr_gid = zp->z_phys->zp_gid;
+ lr->lr_gen = zp->z_phys->zp_gen;
+ lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
+ lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+ bcopy(name, (char *)(lr + 1), namesize);
+ bcopy(link, (char *)(lr + 1) + namesize, linksize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+ zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_rename() handles TX_RENAME transactions.
+ */
+void
+zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_rename_t *lr;
+ size_t snamesize = strlen(sname) + 1;
+ size_t dnamesize = strlen(dname) + 1;
+
+ if (zilog == NULL)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
+ lr = (lr_rename_t *)&itx->itx_lr;
+ lr->lr_sdoid = sdzp->z_id;
+ lr->lr_tdoid = tdzp->z_id;
+ bcopy(sname, (char *)(lr + 1), snamesize);
+ bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ sdzp->z_last_itx = seq;
+ tdzp->z_last_itx = seq;
+ szp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_write() handles TX_WRITE transactions.
+ */
+ssize_t zfs_immediate_write_sz = 32768;
+
+void
+zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t len, int ioflag)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_write_t *lr;
+ itx_wr_state_t write_state;
+ int err;
+
+ if (zilog == NULL || zp->z_unlinked)
+ return;
+
+ /*
+ * Writes are handled in three different ways:
+ *
+ * WR_INDIRECT:
+ * If the write is greater than zfs_immediate_write_sz then
+ * later *if* we need to log the write then dmu_sync() is used
+ * to immediately write the block and its block pointer is put
+ * in the log record.
+ * WR_COPIED:
+ * If we know we'll immediately be committing the
+ * transaction (FDSYNC (O_DSYNC)), then we allocate a larger
+ * log record here for the data and copy the data in.
+ * WR_NEED_COPY:
+ * Otherwise we don't allocate a buffer, and *if* we need to
+ * flush the write later then a buffer is allocated and
+ * we retrieve the data using the dmu.
+ */
+ if (len > zfs_immediate_write_sz)
+ write_state = WR_INDIRECT;
+ else if (ioflag & IO_SYNC)
+ write_state = WR_COPIED;
+ else
+ write_state = WR_NEED_COPY;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) +
+ (write_state == WR_COPIED ? len : 0));
+ lr = (lr_write_t *)&itx->itx_lr;
+ if (write_state == WR_COPIED) {
+ err = dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1);
+ if (err) {
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_write_t *)&itx->itx_lr;
+ write_state = WR_NEED_COPY;
+ }
+ }
+
+ itx->itx_wr_state = write_state;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ itx->itx_private = zp->z_zfsvfs;
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_truncate() handles TX_TRUNCATE transactions.
+ */
+void
+zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, uint64_t off, uint64_t len)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_truncate_t *lr;
+
+ if (zilog == NULL || zp->z_unlinked)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_truncate_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_setattr() handles TX_SETATTR transactions.
+ */
+void
+zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, vattr_t *vap, uint_t mask_applied)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_setattr_t *lr;
+
+ if (zilog == NULL || zp->z_unlinked)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_setattr_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_mask = (uint64_t)mask_applied;
+ lr->lr_mode = (uint64_t)vap->va_mode;
+ lr->lr_uid = (uint64_t)vap->va_uid;
+ lr->lr_gid = (uint64_t)vap->va_gid;
+ lr->lr_size = (uint64_t)vap->va_size;
+ ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
+ ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_acl() handles TX_ACL transactions.
+ */
+void
+zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, int aclcnt, ace_t *z_ace)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_acl_t *lr;
+
+ if (zilog == NULL || zp->z_unlinked)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + aclcnt * sizeof (ace_t));
+ lr = (lr_acl_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_aclcnt = (uint64_t)aclcnt;
+ bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t));
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+}
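Each zfs_log_*() function above sizes its itx as sizeof (*lr) plus the length of the variable-size name(s), then copies the strings immediately after the fixed header via (lr + 1). A stand-alone sketch of that trailing-payload idiom, using hypothetical demo_* names rather than the real itx/lr types:

#include <stdlib.h>
#include <string.h>

typedef struct demo_lr {
	unsigned long	lr_doid;	/* fixed-size header fields */
	unsigned long	lr_foid;
} demo_lr_t;

static demo_lr_t *
demo_make_record(unsigned long doid, unsigned long foid, const char *name)
{
	size_t namesize = strlen(name) + 1;
	demo_lr_t *lr;

	/* One allocation: header followed by the NUL-terminated name. */
	lr = malloc(sizeof (*lr) + namesize);
	if (lr == NULL)
		return (NULL);
	lr->lr_doid = doid;
	lr->lr_foid = foid;
	memcpy((char *)(lr + 1), name, namesize); /* name follows header */
	return (lr);
}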
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
new file mode 100644
index 000000000000..ad3ad918615d
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
@@ -0,0 +1,424 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/spa.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+#include <sys/atomic.h>
+#include <sys/cred.h>
+#include <sys/namei.h>
+
+/*
+ * Functions to replay ZFS intent log (ZIL) records.
+ * The functions are called through a function vector (zfs_replay_vector)
+ * which is indexed by the transaction type.
+ */
+
+static void
+zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
+ uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+{
+ VATTR_NULL(vap);
+ vap->va_mask = (uint_t)mask;
+ vap->va_type = IFTOVT(mode);
+ vap->va_mode = mode & MODEMASK;
+ vap->va_uid = (uid_t)uid;
+ vap->va_gid = (gid_t)gid;
+ vap->va_rdev = zfs_cmpldev(rdev);
+ vap->va_nodeid = nodeid;
+}
+
+/* ARGSUSED */
+static int
+zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
+{
+ return (ENOTSUP);
+}
+
+static int
+zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
+{
+ char *name = (char *)(lr + 1); /* name follows lr_create_t */
+ char *link; /* symlink content follows name */
+ znode_t *dzp;
+ vnode_t *vp = NULL;
+ vattr_t va;
+ struct componentname cn;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID,
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+
+ /*
+ * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+ * eventually end up in zfs_mknode(), which assigns the object's
+ * creation time and generation number. The generic VOP_CREATE()
+ * doesn't have either concept, so we smuggle the values inside
+ * the vattr's otherwise unused va_ctime and va_nblocks fields.
+ */
+ ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime);
+ va.va_nblocks = lr->lr_gen;
+
+ cn.cn_nameptr = name;
+ cn.cn_cred = kcred;
+ cn.cn_thread = curthread;
+ cn.cn_flags = SAVENAME;
+
+ vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
+ switch ((int)lr->lr_common.lrc_txtype) {
+ case TX_CREATE:
+ error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va);
+ break;
+ case TX_MKDIR:
+ error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va);
+ break;
+ case TX_MKXATTR:
+ error = zfs_make_xattrdir(dzp, &va, &vp, kcred);
+ break;
+ case TX_SYMLINK:
+ link = name + strlen(name) + 1;
+ error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link);
+ break;
+ default:
+ error = ENOTSUP;
+ }
+ VOP_UNLOCK(ZTOV(dzp), 0, curthread);
+
+ if (error == 0 && vp != NULL) {
+ VOP_UNLOCK(vp, 0, curthread);
+ VN_RELE(vp);
+ }
+
+ VN_RELE(ZTOV(dzp));
+
+ return (error);
+}
+
+static int
+zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
+{
+ char *name = (char *)(lr + 1); /* name follows lr_remove_t */
+ znode_t *dzp;
+ struct componentname cn;
+ vnode_t *vp;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ cn.cn_nameptr = name;
+ cn.cn_namelen = strlen(name);
+ cn.cn_nameiop = DELETE;
+ cn.cn_flags = ISLASTCN | SAVENAME;
+ cn.cn_cred = kcred;
+ cn.cn_thread = curthread;
+ vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
+ error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn);
+ if (error != 0) {
+ VOP_UNLOCK(ZTOV(dzp), 0, curthread);
+ goto fail;
+ }
+
+ switch ((int)lr->lr_common.lrc_txtype) {
+ case TX_REMOVE:
+ error = VOP_REMOVE(ZTOV(dzp), vp, &cn);
+ break;
+ case TX_RMDIR:
+ error = VOP_RMDIR(ZTOV(dzp), vp, &cn);
+ break;
+ default:
+ error = ENOTSUP;
+ }
+ vput(vp);
+ VOP_UNLOCK(ZTOV(dzp), 0, curthread);
+fail:
+ VN_RELE(ZTOV(dzp));
+
+ return (error);
+}
+
+static int
+zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
+{
+ char *name = (char *)(lr + 1); /* name follows lr_link_t */
+ znode_t *dzp, *zp;
+ struct componentname cn;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
+ VN_RELE(ZTOV(dzp));
+ return (error);
+ }
+
+ cn.cn_nameptr = name;
+ cn.cn_cred = kcred;
+ cn.cn_thread = curthread;
+ cn.cn_flags = SAVENAME;
+
+ vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
+ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY, curthread);
+ error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn);
+ VOP_UNLOCK(ZTOV(zp), 0, curthread);
+ VOP_UNLOCK(ZTOV(dzp), 0, curthread);
+
+ VN_RELE(ZTOV(zp));
+ VN_RELE(ZTOV(dzp));
+
+ return (error);
+}
+
+static int
+zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
+{
+ char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
+ char *tname = sname + strlen(sname) + 1;
+ znode_t *sdzp, *tdzp;
+ struct componentname scn, tcn;
+ vnode_t *svp, *tvp;
+ kthread_t *td = curthread;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
+ VN_RELE(ZTOV(sdzp));
+ return (error);
+ }
+
+ svp = tvp = NULL;
+
+ scn.cn_nameptr = sname;
+ scn.cn_namelen = strlen(sname);
+ scn.cn_nameiop = DELETE;
+ scn.cn_flags = ISLASTCN | SAVENAME;
+ scn.cn_cred = kcred;
+ scn.cn_thread = td;
+ vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn);
+ VOP_UNLOCK(ZTOV(sdzp), 0, td);
+ if (error != 0)
+ goto fail;
+ VOP_UNLOCK(svp, 0, td);
+
+ tcn.cn_nameptr = tname;
+ tcn.cn_namelen = strlen(tname);
+ tcn.cn_nameiop = RENAME;
+ tcn.cn_flags = ISLASTCN | SAVENAME;
+ tcn.cn_cred = kcred;
+ tcn.cn_thread = td;
+ vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn);
+ if (error == EJUSTRETURN)
+ tvp = NULL;
+ else if (error != 0) {
+ VOP_UNLOCK(ZTOV(tdzp), 0, td);
+ goto fail;
+ }
+
+ error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn);
+ return (error);
+fail:
+ if (svp != NULL)
+ vrele(svp);
+ if (tvp != NULL)
+ vrele(tvp);
+ VN_RELE(ZTOV(tdzp));
+ VN_RELE(ZTOV(sdzp));
+
+ return (error);
+}
+
+static int
+zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
+{
+ char *data = (char *)(lr + 1); /* data follows lr_write_t */
+ znode_t *zp;
+ int error;
+ ssize_t resid;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * As we can log writes out of order, it's possible the
+ * file has been removed. In this case just drop the write
+ * and return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+ error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
+ lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+
+ VN_RELE(ZTOV(zp));
+
+ return (error);
+}
+
+static int
+zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
+{
+
+ ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
+ return (EOPNOTSUPP);
+}
+
+static int
+zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
+{
+ znode_t *zp;
+ vattr_t va;
+ vnode_t *vp;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * As we can log setattrs out of order, it's possible the
+ * file has been removed. In this case just drop the setattr
+ * and return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+ zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode,
+ lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
+
+ va.va_size = lr->lr_size;
+ ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime);
+ ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime);
+
+ vp = ZTOV(zp);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ error = VOP_SETATTR(vp, &va, kcred, curthread);
+ VOP_UNLOCK(vp, 0, curthread);
+ VN_RELE(vp);
+
+ return (error);
+}
+
+static int
+zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
+{
+ ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */
+#ifdef TODO
+ vsecattr_t vsa;
+#endif
+ znode_t *zp;
+ int error;
+
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+ zfs_ace_byteswap(ace, lr->lr_aclcnt);
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * As we can log acls out of order, it's possible the
+ * file has been removed. In this case just drop the acl
+ * and return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+#ifdef TODO
+ bzero(&vsa, sizeof (vsa));
+ vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
+ vsa.vsa_aclcnt = lr->lr_aclcnt;
+ vsa.vsa_aclentp = ace;
+
+ error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred);
+#else
+ error = EOPNOTSUPP;
+#endif
+
+ VN_RELE(ZTOV(zp));
+
+ return (error);
+}
+
+/*
+ * Callback vectors for replaying records
+ */
+zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
+ zfs_replay_error, /* 0 no such transaction type */
+ zfs_replay_create, /* TX_CREATE */
+ zfs_replay_create, /* TX_MKDIR */
+ zfs_replay_create, /* TX_MKXATTR */
+ zfs_replay_create, /* TX_SYMLINK */
+ zfs_replay_remove, /* TX_REMOVE */
+ zfs_replay_remove, /* TX_RMDIR */
+ zfs_replay_link, /* TX_LINK */
+ zfs_replay_rename, /* TX_RENAME */
+ zfs_replay_write, /* TX_WRITE */
+ zfs_replay_truncate, /* TX_TRUNCATE */
+ zfs_replay_setattr, /* TX_SETATTR */
+ zfs_replay_acl, /* TX_ACL */
+};
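On replay, the ZIL walks the log records and dispatches each one through zfs_replay_vector[] on its transaction type, byteswapping first when the log was written with the other endianness. A hedged sketch of that dispatch shape, with a hypothetical demo record header; the real driver operates on lr_t records in the ZIL code, not on these demo_* types.

#include <stdint.h>
#include <errno.h>

#define	DEMO_TX_MAX	13	/* mirrors the 13 slots above */

typedef struct demo_lr {
	uint64_t	lrc_txtype;	/* indexes the replay vector */
	uint64_t	lrc_reclen;
} demo_lr_t;

typedef int (*demo_replay_func_t)(void *arg, demo_lr_t *lr, int byteswap);

static int
demo_replay_error(void *arg, demo_lr_t *lr, int byteswap)
{
	(void)arg; (void)lr; (void)byteswap;
	return (ENOTSUP);
}

static demo_replay_func_t demo_replay_vector[DEMO_TX_MAX] = {
	demo_replay_error,	/* slot 0: no such transaction type */
	/* remaining slots left NULL in this sketch */
};

static int
demo_replay_one(void *arg, demo_lr_t *lr, int byteswap)
{
	uint64_t txtype = lr->lrc_txtype;

	/* Unknown or unhandled record types are rejected, as above. */
	if (txtype >= DEMO_TX_MAX || demo_replay_vector[txtype] == NULL)
		return (ENOTSUP);
	return (demo_replay_vector[txtype](arg, lr, byteswap));
}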
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
new file mode 100644
index 000000000000..07ec0f6b6e90
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
@@ -0,0 +1,594 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file contains the code to implement file range locking in
+ * ZFS, although there isn't much specific to ZFS (all that comes to mind
+ * is support for growing the blocksize).
+ *
+ * Interface
+ * ---------
+ * Defined in zfs_rlock.h but essentially:
+ * rl = zfs_range_lock(zp, off, len, lock_type);
+ * zfs_range_unlock(rl);
+ * zfs_range_reduce(rl, off, len);
+ *
+ * AVL tree
+ * --------
+ * An AVL tree is used to maintain the state of the existing ranges
+ * that are locked for exclusive (writer) or shared (reader) use.
+ * The starting range offset is used for searching and sorting the tree.
+ *
+ * Common case
+ * -----------
+ * The (hopefully) usual case is of no overlaps or contention for
+ * locks. On entry to zfs_range_lock() a rl_t is allocated; the tree is
+ * searched and no overlap is found, and *this* rl_t is placed in the tree.
+ *
+ * Overlaps/Reference counting/Proxy locks
+ * ---------------------------------------
+ * The avl code only allows one node at a particular offset. Also it's very
+ * inefficient to search through all previous entries looking for overlaps
+ * (because the very 1st in the ordered list might be at offset 0 but
+ * cover the whole file).
+ * So this implementation uses reference counts and proxy range locks.
+ * Firstly, only reader locks use reference counts and proxy locks,
+ * because writer locks are exclusive.
+ * When a reader lock overlaps with another then a proxy lock is created
+ * for that range and replaces the original lock. If the overlap
+ * is exact then the reference count of the proxy is simply incremented.
+ * Otherwise, the proxy lock is split into smaller lock ranges and
+ * new proxy locks created for non overlapping ranges.
+ * The reference counts are adjusted accordingly.
+ * Meanwhile, the original lock is kept around (this is the caller's handle)
+ * and its offset and length are used when releasing the lock.
+ *
+ * Thread coordination
+ * -------------------
+ * In order to make wakeups efficient and to ensure multiple continuous
+ * readers on a range don't starve a writer for the same range lock,
+ * two condition variables are allocated in each rl_t.
+ * If a writer (or reader) can't get a range it initialises the writer
+ * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
+ * and waits on that cv. When a thread unlocks that range it wakes up all
+ * writers then all readers before destroying the lock.
+ *
+ * Append mode writes
+ * ------------------
+ * Append mode writes need to lock a range at the end of a file.
+ * The offset of the end of the file is determined under the
+ * range locking mutex, and the lock type converted from RL_APPEND to
+ * RL_WRITER and the range locked.
+ *
+ * Grow block handling
+ * -------------------
+ * ZFS supports multiple block sizes, currently up to 128K. The smallest
+ * block size is used for the file which is grown as needed. During this
+ * growth all other writers and readers must be excluded.
+ * So if the block size needs to be grown then the whole file is
+ * exclusively locked, then later the caller will reduce the lock
+ * range to just the range to be written using zfs_range_reduce().
+ */
+
+#include <sys/zfs_rlock.h>
+
+/*
+ * Check if a write lock can be grabbed, or wait and recheck until available.
+ */
+static void
+zfs_range_lock_writer(znode_t *zp, rl_t *new)
+{
+ avl_tree_t *tree = &zp->z_range_avl;
+ rl_t *rl;
+ avl_index_t where;
+ uint64_t end_size;
+ uint64_t off = new->r_off;
+ uint64_t len = new->r_len;
+
+ for (;;) {
+ /*
+ * Range locking is also used by zvol and uses a
+ * dummied up znode. However, for zvol, we don't need to
+ * append or grow blocksize, and besides we don't have
+ * a z_phys or z_zfsvfs - so skip that processing.
+ *
+ * Yes, this is ugly, and would be solved by not handling
+ * grow or append in range lock code. If that was done then
+ * we could make the range locking code generically available
+ * to other non-zfs consumers.
+ */
+ if (zp->z_vnode) { /* caller is ZPL */
+ /*
+ * If in append mode pick up the current end of file.
+ * This is done under z_range_lock to avoid races.
+ */
+ if (new->r_type == RL_APPEND)
+ new->r_off = zp->z_phys->zp_size;
+
+ /*
+ * If we need to grow the block size then grab the whole
+ * file range. This is also done under z_range_lock to
+ * avoid races.
+ */
+ end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
+ new->r_off = 0;
+ new->r_len = UINT64_MAX;
+ }
+ }
+
+ /*
+ * First check for the usual case of no locks
+ */
+ if (avl_numnodes(tree) == 0) {
+ new->r_type = RL_WRITER; /* convert to writer */
+ avl_add(tree, new);
+ return;
+ }
+
+ /*
+ * Look for any locks in the range.
+ */
+ rl = avl_find(tree, new, &where);
+ if (rl)
+ goto wait; /* already locked at same offset */
+
+ rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+ if (rl && (rl->r_off < new->r_off + new->r_len))
+ goto wait;
+
+ rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
+ if (rl && rl->r_off + rl->r_len > new->r_off)
+ goto wait;
+
+ new->r_type = RL_WRITER; /* convert possible RL_APPEND */
+ avl_insert(tree, new, where);
+ return;
+wait:
+ if (!rl->r_write_wanted) {
+ cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
+ rl->r_write_wanted = B_TRUE;
+ }
+ cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
+
+ /* reset to original */
+ new->r_off = off;
+ new->r_len = len;
+ }
+}
+
+/*
+ * If this is an original (non-proxy) lock then replace it by
+ * a proxy and return the proxy.
+ */
+static rl_t *
+zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
+{
+ rl_t *proxy;
+
+ if (rl->r_proxy)
+ return (rl); /* already a proxy */
+
+ ASSERT3U(rl->r_cnt, ==, 1);
+ ASSERT(rl->r_write_wanted == B_FALSE);
+ ASSERT(rl->r_read_wanted == B_FALSE);
+ avl_remove(tree, rl);
+ rl->r_cnt = 0;
+
+ /* create a proxy range lock */
+ proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+ proxy->r_off = rl->r_off;
+ proxy->r_len = rl->r_len;
+ proxy->r_cnt = 1;
+ proxy->r_type = RL_READER;
+ proxy->r_proxy = B_TRUE;
+ proxy->r_write_wanted = B_FALSE;
+ proxy->r_read_wanted = B_FALSE;
+ avl_add(tree, proxy);
+
+ return (proxy);
+}
+
+/*
+ * Split the range lock at the supplied offset
+ * returning the *front* proxy.
+ */
+static rl_t *
+zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
+{
+ rl_t *front, *rear;
+
+ ASSERT3U(rl->r_len, >, 1);
+ ASSERT3U(off, >, rl->r_off);
+ ASSERT3U(off, <, rl->r_off + rl->r_len);
+ ASSERT(rl->r_write_wanted == B_FALSE);
+ ASSERT(rl->r_read_wanted == B_FALSE);
+
+ /* create the rear proxy range lock */
+ rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+ rear->r_off = off;
+ rear->r_len = rl->r_off + rl->r_len - off;
+ rear->r_cnt = rl->r_cnt;
+ rear->r_type = RL_READER;
+ rear->r_proxy = B_TRUE;
+ rear->r_write_wanted = B_FALSE;
+ rear->r_read_wanted = B_FALSE;
+
+ front = zfs_range_proxify(tree, rl);
+ front->r_len = off - rl->r_off;
+
+ avl_insert_here(tree, rear, front, AVL_AFTER);
+ return (front);
+}
+
+/*
+ * Create and add a new proxy range lock for the supplied range.
+ */
+static void
+zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
+{
+ rl_t *rl;
+
+ ASSERT(len);
+ rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+ rl->r_off = off;
+ rl->r_len = len;
+ rl->r_cnt = 1;
+ rl->r_type = RL_READER;
+ rl->r_proxy = B_TRUE;
+ rl->r_write_wanted = B_FALSE;
+ rl->r_read_wanted = B_FALSE;
+ avl_add(tree, rl);
+}
+
+static void
+zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
+{
+ rl_t *next;
+ uint64_t off = new->r_off;
+ uint64_t len = new->r_len;
+
+ /*
+ * prev arrives either:
+ * - pointing to an entry at the same offset
+ * - pointing to the entry with the closest previous offset whose
+ * range may overlap with the new range
+ * - null, if there were no ranges starting before the new one
+ */
+ if (prev) {
+ if (prev->r_off + prev->r_len <= off) {
+ prev = NULL;
+ } else if (prev->r_off != off) {
+ /*
+ * convert to proxy if needed then
+ * split this entry and bump ref count
+ */
+ prev = zfs_range_split(tree, prev, off);
+ prev = AVL_NEXT(tree, prev); /* move to rear range */
+ }
+ }
+ ASSERT((prev == NULL) || (prev->r_off == off));
+
+ if (prev)
+ next = prev;
+ else
+ next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+
+ if (next == NULL || off + len <= next->r_off) {
+ /* no overlaps, use the original new rl_t in the tree */
+ avl_insert(tree, new, where);
+ return;
+ }
+
+ if (off < next->r_off) {
+ /* Add a proxy for initial range before the overlap */
+ zfs_range_new_proxy(tree, off, next->r_off - off);
+ }
+
+ new->r_cnt = 0; /* will use proxies in tree */
+ /*
+ * We now search forward through the ranges, until we go past the end
+ * of the new range. For each entry we make it a proxy if it
+ * isn't already, then bump its reference count. If there are any
+ * gaps between the ranges then we create a new proxy range.
+ */
+ for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
+ if (off + len <= next->r_off)
+ break;
+ if (prev && prev->r_off + prev->r_len < next->r_off) {
+ /* there's a gap */
+ ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
+ zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
+ next->r_off - (prev->r_off + prev->r_len));
+ }
+ if (off + len == next->r_off + next->r_len) {
+ /* exact overlap with end */
+ next = zfs_range_proxify(tree, next);
+ next->r_cnt++;
+ return;
+ }
+ if (off + len < next->r_off + next->r_len) {
+ /* new range ends in the middle of this block */
+ next = zfs_range_split(tree, next, off + len);
+ next->r_cnt++;
+ return;
+ }
+ ASSERT3U(off + len, >, next->r_off + next->r_len);
+ next = zfs_range_proxify(tree, next);
+ next->r_cnt++;
+ }
+
+ /* Add the remaining end range. */
+ zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
+ (off + len) - (prev->r_off + prev->r_len));
+}
+
+/*
+ * Check if a reader lock can be grabbed, or wait and recheck until available.
+ */
+static void
+zfs_range_lock_reader(znode_t *zp, rl_t *new)
+{
+ avl_tree_t *tree = &zp->z_range_avl;
+ rl_t *prev, *next;
+ avl_index_t where;
+ uint64_t off = new->r_off;
+ uint64_t len = new->r_len;
+
+ /*
+ * Look for any writer locks in the range.
+ */
+retry:
+ prev = avl_find(tree, new, &where);
+ if (prev == NULL)
+ prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
+
+ /*
+ * Check the previous range for a writer lock overlap.
+ */
+ if (prev && (off < prev->r_off + prev->r_len)) {
+ if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
+ if (!prev->r_read_wanted) {
+ cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
+ prev->r_read_wanted = B_TRUE;
+ }
+ cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
+ goto retry;
+ }
+ if (off + len < prev->r_off + prev->r_len)
+ goto got_lock;
+ }
+
+ /*
+ * Search through the following ranges to see if there's
+ * any write lock overlap.
+ */
+ if (prev)
+ next = AVL_NEXT(tree, prev);
+ else
+ next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+ for (; next; next = AVL_NEXT(tree, next)) {
+ if (off + len <= next->r_off)
+ goto got_lock;
+ if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
+ if (!next->r_read_wanted) {
+ cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
+ next->r_read_wanted = B_TRUE;
+ }
+ cv_wait(&next->r_rd_cv, &zp->z_range_lock);
+ goto retry;
+ }
+ if (off + len <= next->r_off + next->r_len)
+ goto got_lock;
+ }
+
+got_lock:
+ /*
+ * Add the read lock, which may involve splitting existing
+ * locks and bumping ref counts (r_cnt).
+ */
+ zfs_range_add_reader(tree, new, prev, where);
+}
+
+/*
+ * Lock a range (offset, length) as either shared (RL_READER)
+ * or exclusive (RL_WRITER). Returns the range lock structure
+ * for later unlocking or range reduction (if the entire file
+ * was previously locked as RL_WRITER).
+ */
+rl_t *
+zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
+{
+ rl_t *new;
+
+ ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
+
+ new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+ new->r_zp = zp;
+ new->r_off = off;
+ new->r_len = len;
+ new->r_cnt = 1; /* assume it's going to be in the tree */
+ new->r_type = type;
+ new->r_proxy = B_FALSE;
+ new->r_write_wanted = B_FALSE;
+ new->r_read_wanted = B_FALSE;
+
+ mutex_enter(&zp->z_range_lock);
+ if (type == RL_READER) {
+ /*
+ * First check for the usual case of no locks
+ */
+ if (avl_numnodes(&zp->z_range_avl) == 0)
+ avl_add(&zp->z_range_avl, new);
+ else
+ zfs_range_lock_reader(zp, new);
+ } else
+ zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
+ mutex_exit(&zp->z_range_lock);
+ return (new);
+}
+
+/*
+ * Unlock a reader lock
+ */
+static void
+zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
+{
+ avl_tree_t *tree = &zp->z_range_avl;
+ rl_t *rl, *next;
+ uint64_t len;
+
+ /*
+ * The common case is when the remove entry is in the tree
+ * (cnt == 1) meaning there have been no other reader locks overlapping
+ * with this one. Otherwise the remove entry will have been
+ * removed from the tree and replaced by proxies (one or
+ * more ranges mapping to the entire range).
+ */
+ if (remove->r_cnt == 1) {
+ avl_remove(tree, remove);
+ if (remove->r_write_wanted)
+ cv_broadcast(&remove->r_wr_cv);
+ if (remove->r_read_wanted)
+ cv_broadcast(&remove->r_rd_cv);
+ } else {
+ ASSERT3U(remove->r_cnt, ==, 0);
+ ASSERT3U(remove->r_write_wanted, ==, 0);
+ ASSERT3U(remove->r_read_wanted, ==, 0);
+ /*
+ * Find start proxy representing this reader lock,
+ * then decrement ref count on all proxies
+ * that make up this range, freeing them as needed.
+ */
+ rl = avl_find(tree, remove, NULL);
+ ASSERT(rl);
+ ASSERT(rl->r_cnt);
+ ASSERT(rl->r_type == RL_READER);
+ for (len = remove->r_len; len != 0; rl = next) {
+ len -= rl->r_len;
+ if (len) {
+ next = AVL_NEXT(tree, rl);
+ ASSERT(next);
+ ASSERT(rl->r_off + rl->r_len == next->r_off);
+ ASSERT(next->r_cnt);
+ ASSERT(next->r_type == RL_READER);
+ }
+ rl->r_cnt--;
+ if (rl->r_cnt == 0) {
+ avl_remove(tree, rl);
+ if (rl->r_write_wanted)
+ cv_broadcast(&rl->r_wr_cv);
+ if (rl->r_read_wanted)
+ cv_broadcast(&rl->r_rd_cv);
+ kmem_free(rl, sizeof (rl_t));
+ }
+ }
+ }
+ kmem_free(remove, sizeof (rl_t));
+}
+
+/*
+ * Unlock range and destroy range lock structure.
+ */
+void
+zfs_range_unlock(rl_t *rl)
+{
+ znode_t *zp = rl->r_zp;
+
+ ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
+ ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
+ ASSERT(!rl->r_proxy);
+
+ mutex_enter(&zp->z_range_lock);
+ if (rl->r_type == RL_WRITER) {
+ /* writer locks can't be shared or split */
+ avl_remove(&zp->z_range_avl, rl);
+ mutex_exit(&zp->z_range_lock);
+ if (rl->r_write_wanted) {
+ cv_broadcast(&rl->r_wr_cv);
+ cv_destroy(&rl->r_wr_cv);
+ }
+ if (rl->r_read_wanted) {
+ cv_broadcast(&rl->r_rd_cv);
+ cv_destroy(&rl->r_rd_cv);
+ }
+ kmem_free(rl, sizeof (rl_t));
+ } else {
+ /*
+ * lock may be shared, let zfs_range_unlock_reader()
+ * release the lock and free the rl_t
+ */
+ zfs_range_unlock_reader(zp, rl);
+ mutex_exit(&zp->z_range_lock);
+ }
+}
+
+/*
+ * Reduce range locked as RL_WRITER from whole file to specified range.
+ * Asserts the whole file is exclusively locked and so there's only one
+ * entry in the tree.
+ */
+void
+zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
+{
+ znode_t *zp = rl->r_zp;
+
+ /* Ensure there are no other locks */
+ ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
+ ASSERT(rl->r_off == 0);
+ ASSERT(rl->r_type == RL_WRITER);
+ ASSERT(!rl->r_proxy);
+ ASSERT3U(rl->r_len, ==, UINT64_MAX);
+ ASSERT3U(rl->r_cnt, ==, 1);
+
+ mutex_enter(&zp->z_range_lock);
+ rl->r_off = off;
+ rl->r_len = len;
+ mutex_exit(&zp->z_range_lock);
+ if (rl->r_write_wanted)
+ cv_broadcast(&rl->r_wr_cv);
+ if (rl->r_read_wanted)
+ cv_broadcast(&rl->r_rd_cv);
+}
+
+/*
+ * AVL comparison function used to order range locks
+ * Locks are ordered on the start offset of the range.
+ */
+int
+zfs_range_compare(const void *arg1, const void *arg2)
+{
+ const rl_t *rl1 = arg1;
+ const rl_t *rl2 = arg2;
+
+ if (rl1->r_off > rl2->r_off)
+ return (1);
+ if (rl1->r_off < rl2->r_off)
+ return (-1);
+ return (0);
+}
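As the interface comment at the top of this file outlines, a writer that may need to grow the block size first takes the whole file exclusively and later narrows the held range. A hedged usage sketch follows; the znode handling, blocksize growth and write itself are elided, and only the three zfs_range_* calls are the interface declared in zfs_rlock.h.

#include <sys/zfs_rlock.h>

static void
demo_grow_then_write(znode_t *zp, uint64_t off, uint64_t len)
{
	rl_t *rl;

	/* Lock the entire file exclusively while the blocksize may grow. */
	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);

	/* ... grow the blocksize here if required ... */

	/* Narrow the held range to just the region being written. */
	zfs_range_reduce(rl, off, len);

	/* ... perform the write ... */

	zfs_range_unlock(rl);
}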
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
new file mode 100644
index 000000000000..27e00c3578e9
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -0,0 +1,986 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/acl.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/varargs.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/dnlc.h>
+
+struct mtx atomic_mtx;
+MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF);
+
+struct mtx zfs_debug_mtx;
+MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
+SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
+int zfs_debug_level = 0;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
+ "Debug level");
+
+static int zfs_mount(vfs_t *vfsp, kthread_t *td);
+static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td);
+static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td);
+static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td);
+static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
+static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td);
+static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
+static void zfs_objset_close(zfsvfs_t *zfsvfs);
+static void zfs_freevfs(vfs_t *vfsp);
+
+static struct vfsops zfs_vfsops = {
+ .vfs_mount = zfs_mount,
+ .vfs_unmount = zfs_umount,
+ .vfs_root = zfs_root,
+ .vfs_statfs = zfs_statfs,
+ .vfs_vget = zfs_vget,
+ .vfs_sync = zfs_sync,
+ .vfs_fhtovp = zfs_fhtovp,
+};
+
+VFS_SET(zfs_vfsops, zfs, VFCF_JAIL);
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our module
+ * from being unloaded after a umount -f
+ */
+static uint32_t zfs_active_fs_count = 0;
+
+/*ARGSUSED*/
+static int
+zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td)
+{
+
+ /*
+ * Data integrity is job one. We don't want a compromised kernel
+ * writing to the storage pool, so we never sync during panic.
+ */
+ if (panicstr)
+ return (0);
+
+ if (vfsp != NULL) {
+ /*
+ * Sync a specific filesystem.
+ */
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ int error;
+
+ error = vfs_stdsync(vfsp, waitfor, td);
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+ if (zfsvfs->z_log != NULL)
+ zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
+ else
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ ZFS_EXIT(zfsvfs);
+ } else {
+ /*
+ * Sync all ZFS filesystems. This is what happens when you
+ * run sync(1M). Unlike other filesystems, ZFS honors the
+ * request by waiting for all pools to commit all dirty data.
+ */
+ spa_sync_allpools();
+ }
+
+ return (0);
+}
+
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == TRUE) {
+ zfsvfs->z_atime = TRUE;
+ zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
+ } else {
+ zfsvfs->z_atime = FALSE;
+ zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
+ }
+}
+
+static void
+xattr_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == TRUE) {
+ /* XXX locking on vfs_flag? */
+#ifdef TODO
+ zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
+#endif
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
+ } else {
+ /* XXX locking on vfs_flag? */
+#ifdef TODO
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
+#endif
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
+ }
+}
+
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval < SPA_MINBLOCKSIZE ||
+ newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
+ newval = SPA_MAXBLOCKSIZE;
+
+ zfsvfs->z_max_blksz = newval;
+ zfsvfs->z_vfs->vfs_bsize = newval;
+}
+
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval) {
+ /* XXX locking on vfs_flag? */
+ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
+ } else {
+ /* XXX locking on vfs_flag? */
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
+ }
+}
+
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == FALSE) {
+ zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
+ } else {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
+ }
+}
+
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == FALSE) {
+ zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
+ } else {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
+ }
+}
+
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_show_ctldir = newval;
+}
+
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_mode = newval;
+}
+
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_inherit = newval;
+}
+
+static int
+zfs_refresh_properties(vfs_t *vfsp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ /*
+ * Remount operations default to "rw" unless "ro" is explicitly
+ * specified.
+ */
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ } else {
+ if (!dmu_objset_is_snapshot(zfsvfs->z_os))
+ readonly_changed_cb(zfsvfs, B_FALSE);
+ else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
+ return (EROFS);
+ }
+
+ if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+ setuid_changed_cb(zfsvfs, B_FALSE);
+ } else {
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+ setuid_changed_cb(zfsvfs, B_FALSE);
+ else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+ setuid_changed_cb(zfsvfs, B_TRUE);
+ }
+
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+ exec_changed_cb(zfsvfs, B_FALSE);
+ else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+ exec_changed_cb(zfsvfs, B_TRUE);
+
+ if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
+ atime_changed_cb(zfsvfs, B_TRUE);
+ else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
+ atime_changed_cb(zfsvfs, B_FALSE);
+
+ if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
+ xattr_changed_cb(zfsvfs, B_TRUE);
+ else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL))
+ xattr_changed_cb(zfsvfs, B_FALSE);
+
+ return (0);
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+ struct dsl_dataset *ds = NULL;
+ objset_t *os = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+ int readonly, do_readonly = FALSE;
+ int setuid, do_setuid = FALSE;
+ int exec, do_exec = FALSE;
+ int xattr, do_xattr = FALSE;
+ int error = 0;
+
+ ASSERT(vfsp);
+ zfsvfs = vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ os = zfsvfs->z_os;
+
+ /*
+ * The act of registering our callbacks will destroy any mount
+ * options we may have. In order to enable temporary overrides
+ * of mount options, we stash away the current values and
+ * restore them after we register the callbacks.
+ */
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ readonly = B_TRUE;
+ do_readonly = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+ readonly = B_FALSE;
+ do_readonly = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+ setuid = B_FALSE;
+ do_setuid = B_TRUE;
+ } else {
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+ setuid = B_FALSE;
+ do_setuid = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+ setuid = B_TRUE;
+ do_setuid = B_TRUE;
+ }
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+ exec = B_FALSE;
+ do_exec = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+ exec = B_TRUE;
+ do_exec = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
+ xattr = B_FALSE;
+ do_xattr = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
+ xattr = B_TRUE;
+ do_xattr = B_TRUE;
+ }
+
+ /*
+ * Register property callbacks.
+ *
+ * It would probably be fine to just check for i/o error from
+ * the first prop_register(), but I guess I like to go
+ * overboard...
+ */
+ ds = dmu_objset_ds(os);
+ error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "xattr", xattr_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "recordsize", blksz_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "readonly", readonly_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "setuid", setuid_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "exec", exec_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "snapdir", snapdir_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "aclmode", acl_mode_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "aclinherit", acl_inherit_changed_cb, zfsvfs);
+ if (error)
+ goto unregister;
+
+ /*
+ * Invoke our callbacks to restore temporary mount options.
+ */
+ if (do_readonly)
+ readonly_changed_cb(zfsvfs, readonly);
+ if (do_setuid)
+ setuid_changed_cb(zfsvfs, setuid);
+ if (do_exec)
+ exec_changed_cb(zfsvfs, exec);
+ if (do_xattr)
+ xattr_changed_cb(zfsvfs, xattr);
+
+ return (0);
+
+unregister:
+ /*
+ * We may attempt to unregister some callbacks that are not
+ * registered, but this is OK; it will simply return ENOMSG,
+ * which we will ignore.
+ */
+ (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
+ zfsvfs);
+ return (error);
+
+}
+
+static int
+zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td)
+{
+ cred_t *cr = td->td_ucred;
+ uint64_t recordsize, readonly;
+ int error = 0;
+ int mode;
+ zfsvfs_t *zfsvfs;
+ znode_t *zp = NULL;
+
+ ASSERT(vfsp);
+ ASSERT(osname);
+
+ /*
+ * Initialize the zfs-specific filesystem structure.
+ * Should probably make this a kmem cache, shuffle fields,
+ * and just bzero up to z_hold_mtx[].
+ */
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+ zfsvfs->z_vfs = vfsp;
+ zfsvfs->z_parent = zfsvfs;
+ zfsvfs->z_assign = TXG_NOWAIT;
+ zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+ zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+ rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
+
+ if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
+ NULL))
+ goto out;
+ zfsvfs->z_vfs->vfs_bsize = recordsize;
+
+ vfsp->vfs_data = zfsvfs;
+ vfsp->mnt_flag |= MNT_LOCAL;
+ vfsp->mnt_kern_flag |= MNTK_MPSAFE;
+ vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
+
+ if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
+ goto out;
+
+ if (readonly)
+ mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
+ else
+ mode = DS_MODE_PRIMARY;
+
+ error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
+ if (error == EROFS) {
+ /*
+ * FreeBSD: In Solaris there is DS_MODE_PRIMARY instead of
+ * DS_MODE_STANDARD, but it doesn't work on FreeBSD and
+		 * I don't know why. It looks like the dataset is opened in
+		 * DS_MODE_PRIMARY mode on mount, and a snapshot cannot open
+		 * the same dataset in DS_MODE_PRIMARY mode again.
+ */
+ mode = DS_MODE_STANDARD | DS_MODE_READONLY;
+ error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
+ &zfsvfs->z_os);
+ }
+
+ if (error)
+ goto out;
+
+ if (error = zfs_init_fs(zfsvfs, &zp, cr))
+ goto out;
+
+ if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+ uint64_t xattr;
+
+ ASSERT(mode & DS_MODE_READONLY);
+ atime_changed_cb(zfsvfs, B_FALSE);
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL))
+ goto out;
+ xattr_changed_cb(zfsvfs, xattr);
+ zfsvfs->z_issnap = B_TRUE;
+ } else {
+ error = zfs_register_callbacks(vfsp);
+ if (error)
+ goto out;
+
+ zfs_unlinked_drain(zfsvfs);
+
+ /*
+ * Parse and replay the intent log.
+ */
+ zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
+ zfs_replay_vector);
+
+ if (!zil_disable)
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+ }
+
+ vfs_mountedfrom(vfsp, osname);
+
+ if (!zfsvfs->z_issnap)
+ zfsctl_create(zfsvfs);
+out:
+ if (error) {
+ if (zfsvfs->z_os)
+ dmu_objset_close(zfsvfs->z_os);
+ rw_destroy(&zfsvfs->z_um_lock);
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ } else {
+ atomic_add_32(&zfs_active_fs_count, 1);
+ }
+
+ return (error);
+
+}
+
+void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+ objset_t *os = zfsvfs->z_os;
+ struct dsl_dataset *ds;
+
+ /*
+ * Unregister properties.
+ */
+ if (!dmu_objset_is_snapshot(os)) {
+ ds = dmu_objset_ds(os);
+ VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "aclinherit",
+ acl_inherit_changed_cb, zfsvfs) == 0);
+ }
+}
+
+/*ARGSUSED*/
+static int
+zfs_mount(vfs_t *vfsp, kthread_t *td)
+{
+ char *from;
+ int error;
+
+ /* TODO: For now deny user mounts. */
+ if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
+ return (error);
+
+ /*
+ * When doing a remount, we simply refresh our temporary properties
+ * according to those options set in the current VFS options.
+ */
+ if (vfsp->vfs_flag & MS_REMOUNT)
+ return (zfs_refresh_properties(vfsp));
+
+ if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL))
+ return (EINVAL);
+
+ return (zfs_domount(vfsp, from, td));
+}
+
+static int
+zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ uint64_t refdbytes, availbytes, usedobjs, availobjs;
+
+ statp->f_version = STATFS_VERSION;
+
+ ZFS_ENTER(zfsvfs);
+
+ dmu_objset_space(zfsvfs->z_os,
+ &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+ /*
+ * The underlying storage pool actually uses multiple block sizes.
+ * We report the fragsize as the smallest block size we support,
+ * and we report our blocksize as the filesystem's maximum blocksize.
+ */
+ statp->f_bsize = zfsvfs->z_vfs->vfs_bsize;
+ statp->f_iosize = zfsvfs->z_vfs->vfs_bsize;
+
+ /*
+ * The following report "total" blocks of various kinds in the
+	 * file system, but reported in terms of f_bsize, the
+ * "fragment" size.
+ */
+
+ statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize;
+ statp->f_bfree = availbytes / statp->f_bsize;
+ statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+ /*
+ * statvfs() should really be called statufs(), because it assumes
+ * static metadata. ZFS doesn't preallocate files, so the best
+ * we can do is report the max that could possibly fit in f_files,
+ * and that minus the number actually used in f_ffree.
+	 * For f_ffree, report the smaller of the number of objects available
+ * and the number of blocks (each object will take at least a block).
+ */
+ statp->f_ffree = MIN(availobjs, statp->f_bfree);
+ statp->f_files = statp->f_ffree + usedobjs;
+
+ /*
+ * We're a zfs filesystem.
+ */
+ (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
+
+ strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
+ sizeof(statp->f_mntfromname));
+ strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
+ sizeof(statp->f_mntonname));
+
+ statp->f_namemax = ZFS_MAXNAMELEN;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static int
+zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *rootzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+ if (error == 0) {
+ *vpp = ZTOV(rootzp);
+ error = vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+ (*vpp)->v_vflag |= VV_ROOT;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ cred_t *cr = td->td_ucred;
+ int ret;
+
+ if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
+ return (ret);
+
+ (void) dnlc_purge_vfsp(vfsp, 0);
+
+ /*
+ * Unmount any snapshots mounted under .zfs before unmounting the
+ * dataset itself.
+ */
+ if (zfsvfs->z_ctldir != NULL) {
+ if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
+ return (ret);
+ ret = vflush(vfsp, 0, 0, td);
+ ASSERT(ret == EBUSY);
+ if (!(fflag & MS_FORCE)) {
+ if (zfsvfs->z_ctldir->v_count > 1)
+ return (EBUSY);
+ ASSERT(zfsvfs->z_ctldir->v_count == 1);
+ }
+ zfsctl_destroy(zfsvfs);
+ ASSERT(zfsvfs->z_ctldir == NULL);
+ }
+
+ /*
+ * Flush all the files.
+ */
+ ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
+ if (ret != 0) {
+ if (!zfsvfs->z_issnap) {
+ zfsctl_create(zfsvfs);
+ ASSERT(zfsvfs->z_ctldir != NULL);
+ }
+ return (ret);
+ }
+
+ if (fflag & MS_FORCE) {
+ MNT_ILOCK(vfsp);
+ vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
+ MNT_IUNLOCK(vfsp);
+ zfsvfs->z_unmounted1 = B_TRUE;
+
+ /*
+ * Wait for all zfs threads to leave zfs.
+ * Grabbing a rwlock as reader in all vops and
+		 * as writer here doesn't work because it is too easy to get
+ * multiple reader enters as zfs can re-enter itself.
+ * This can lead to deadlock if there is an intervening
+ * rw_enter as writer.
+		 * So a file system thread reference count (z_op_cnt) is used.
+ * A polling loop on z_op_cnt may seem inefficient, but
+ * - this saves all threads on exit from having to grab a
+ * mutex in order to cv_signal
+ * - only occurs on forced unmount in the rare case when
+ * there are outstanding threads within the file system.
+ */
+ while (zfsvfs->z_op_cnt) {
+ delay(1);
+ }
+ }
+
+ zfs_objset_close(zfsvfs);
+ VFS_RELE(vfsp);
+ zfs_freevfs(vfsp);
+
+ return (0);
+}
+
+static int
+zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *zp;
+ int err;
+
+ ZFS_ENTER(zfsvfs);
+ err = zfs_zget(zfsvfs, ino, &zp);
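+	/* Refuse to hand out znodes that are already on the unlinked (delete) queue. */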
+ if (err == 0 && zp->z_unlinked) {
+ VN_RELE(ZTOV(zp));
+ err = EINVAL;
+ }
+ if (err != 0)
+ *vpp = NULL;
+ else {
+ *vpp = ZTOV(zp);
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ }
+ ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+static int
+zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
+{
+ kthread_t *td = curthread;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *zp;
+ uint64_t object = 0;
+ uint64_t fid_gen = 0;
+ uint64_t gen_mask;
+ uint64_t zp_gen;
+ int i, err;
+
+ *vpp = NULL;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (fidp->fid_len == LONG_FID_LEN) {
+ zfid_long_t *zlfid = (zfid_long_t *)fidp;
+ uint64_t objsetid = 0;
+ uint64_t setgen = 0;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+ ZFS_EXIT(zfsvfs);
+
+ err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
+ if (err)
+ return (EINVAL);
+ ZFS_ENTER(zfsvfs);
+ }
+
+ if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+ zfid_short_t *zfid = (zfid_short_t *)fidp;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+ } else {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /* A zero fid_gen means we are in the .zfs control directories */
+ if (fid_gen == 0 &&
+ (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
+ *vpp = zfsvfs->z_ctldir;
+ ASSERT(*vpp != NULL);
+ if (object == ZFSCTL_INO_SNAPDIR) {
+ VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
+ 0, NULL, NULL) == 0);
+ } else {
+ VN_HOLD(*vpp);
+ }
+ ZFS_EXIT(zfsvfs);
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+ return (0);
+ }
+
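+	/*
+	 * 'i' is the number of generation bytes decoded above, so this mask
+	 * keeps only the generation bits actually stored in the fid.
+	 */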
+ gen_mask = -1ULL >> (64 - 8 * i);
+
+ dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
+ if (err = zfs_zget(zfsvfs, object, &zp)) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ zp_gen = zp->z_phys->zp_gen & gen_mask;
+ if (zp_gen == 0)
+ zp_gen = 1;
+ if (zp->z_unlinked || zp_gen != fid_gen) {
+ dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
+ VN_RELE(ZTOV(zp));
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ *vpp = ZTOV(zp);
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+ vnode_create_vobject(*vpp, zp->z_phys->zp_size, td);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static void
+zfs_objset_close(zfsvfs_t *zfsvfs)
+{
+ znode_t *zp, *nextzp;
+ objset_t *os = zfsvfs->z_os;
+
+ /*
+	 * For forced unmount, at this point all vops except zfs_inactive
+	 * are returning EIO. We now need to suspend zfs_inactive threads
+	 * while we are freeing dbufs, before switching zfs_inactive
+	 * to its behaviour without an objset.
+ */
+ rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
+
+ /*
+	 * Release all holds on dbufs.
+	 * Note, although we have stopped all other vop threads and
+	 * zfs_inactive(), the dmu can call back via znode_pageout_func(),
+	 * which can zfs_znode_free() the znode.
+	 * So we lock z_all_znodes; search the list for a held
+	 * dbuf; drop the lock (we know zp can't disappear while we hold
+	 * a dbuf lock); then regrab the lock and restart.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
+ nextzp = list_next(&zfsvfs->z_all_znodes, zp);
+ if (zp->z_dbuf_held) {
+ /* dbufs should only be held when force unmounting */
+ zp->z_dbuf_held = 0;
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ dmu_buf_rele(zp->z_dbuf, NULL);
+ /* Start again */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ nextzp = list_head(&zfsvfs->z_all_znodes);
+ }
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ /*
+ * Unregister properties.
+ */
+ if (!dmu_objset_is_snapshot(os))
+ zfs_unregister_callbacks(zfsvfs);
+
+ /*
+ * Switch zfs_inactive to behaviour without an objset.
+ * It just tosses cached pages and frees the znode & vnode.
+ * Then re-enable zfs_inactive threads in that new behaviour.
+ */
+ zfsvfs->z_unmounted2 = B_TRUE;
+ rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */
+
+ /*
+ * Close the zil. Can't close the zil while zfs_inactive
+ * threads are blocked as zil_close can call zfs_inactive.
+ */
+ if (zfsvfs->z_log) {
+ zil_close(zfsvfs->z_log);
+ zfsvfs->z_log = NULL;
+ }
+
+ /*
+ * Evict all dbufs so that cached znodes will be freed
+ */
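+	/*
+	 * If the first pass cannot evict everything, wait for the current
+	 * txg to sync and then make a second pass.
+	 */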
+ if (dmu_objset_evict_dbufs(os, 1)) {
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ (void) dmu_objset_evict_dbufs(os, 0);
+ }
+
+ /*
+ * Finally close the objset
+ */
+ dmu_objset_close(os);
+}
+
+static void
+zfs_freevfs(vfs_t *vfsp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ int i;
+
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+ rw_destroy(&zfsvfs->z_um_lock);
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+
+ atomic_add_32(&zfs_active_fs_count, -1);
+}
+
+void
+zfs_init(void)
+{
+
+ printf("ZFS filesystem version " ZFS_VERSION_STRING "\n");
+
+ /*
+ * Initialize .zfs directory structures
+ */
+ zfsctl_init();
+
+ /*
+ * Initialize znode cache, vnode ops, etc...
+ */
+ zfs_znode_init();
+}
+
+void
+zfs_fini(void)
+{
+ zfsctl_fini();
+ zfs_znode_fini();
+}
+
+int
+zfs_busy(void)
+{
+ return (zfs_active_fs_count != 0);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
new file mode 100644
index 000000000000..6769177a1714
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -0,0 +1,3227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/atomic.h>
+#include <sys/namei.h>
+#include <sys/mman.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/dirent.h>
+#include <sys/filio.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/dnlc.h>
+#include <sys/zfs_rlock.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/sf_buf.h>
+#include <sys/sched.h>
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work. To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait for the intent log to commit if it is a synchronous operation.
+ * Moreover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory. The example below illustrates the following Big Rules:
+ *
+ * (1) A check must be made in each zfs thread for a mounted file system.
+ *	This is done, avoiding races, by using ZFS_ENTER(zfsvfs).
+ * A ZFS_EXIT(zfsvfs) is needed before all returns.
+ *
+ * (2) VN_RELE() should always be the last thing except for zil_commit()
+ * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
+ * First, if it's the last reference, the vnode/znode
+ * can be freed, so the zp may point to freed memory. Second, the last
+ * reference will call zfs_zinactive(), which may induce a lot of work --
+ * pushing cached pages (which acquires range locks) and syncing out
+ * cached atime changes. Third, zfs_zinactive() may require a new tx,
+ * which could deadlock the system if you were already holding one.
+ *
+ * (3) All range locks must be grabbed before calling dmu_tx_assign(),
+ * as they can span dmu_tx_assign() calls.
+ *
+ * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
+ * In normal operation, this will be TXG_NOWAIT. During ZIL replay,
+ * it will be a specific txg. Either way, dmu_tx_assign() never blocks.
+ * This is critical because we don't want to block while holding locks.
+ * Note, in particular, that if a lock is sometimes acquired before
+ * the tx assigns, and sometimes after (e.g. z_lock), then failing to
+ * use a non-blocking assign can deadlock the system. The scenario:
+ *
+ * Thread A has grabbed a lock before calling dmu_tx_assign().
+ * Thread B is in an already-assigned tx, and blocks for this lock.
+ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ * forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ * then drop all locks, call dmu_tx_wait(), and try again.
+ *
+ * (5) If the operation succeeded, generate the intent log entry for it
+ * before dropping locks. This ensures that the ordering of events
+ * in the intent log matches the order in which they actually occurred.
+ *
+ * (6) At the end of each vnode op, the DMU tx must always commit,
+ * regardless of whether there were any errors.
+ *
+ * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid)
+ * to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ * ZFS_ENTER(zfsvfs); // exit if unmounted
+ * top:
+ * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
+ * rw_enter(...); // grab any other locks you need
+ * tx = dmu_tx_create(...); // get DMU tx
+ * dmu_tx_hold_*(); // hold each object you might modify
+ * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign
+ * if (error) {
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ * dmu_tx_wait(tx);
+ * dmu_tx_abort(tx);
+ * goto top;
+ * }
+ * dmu_tx_abort(tx); // abort DMU tx
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // really out of space
+ * }
+ * error = do_real_work(); // do whatever this VOP does
+ * if (error == 0)
+ * zfs_log_*(...); // on success, make ZIL entry
+ * dmu_tx_commit(tx); // commit DMU tx -- error or not
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * zil_commit(zilog, seq, foid); // synchronous when necessary
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // done, report error
+ */
+/* ARGSUSED */
+static int
+zfs_open(ap)
+ struct vop_open_args /* {
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ int flag = ap->a_mode;
+
+ /* Keep a count of the synchronous opens in the znode */
+ if (flag & FFSYNC) {
+ atomic_inc_32(&zp->z_sync_cnt);
+ ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
+ }
+
+ vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_close(ap)
+ struct vop_close_args /* {
+ struct vnode *a_vp;
+ int a_fflag;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ znode_t *zp = VTOZ(ap->a_vp);
+ int flag = ap->a_fflag;
+
+ /* Decrement the synchronous opens in the znode */
+ if (flag & FFSYNC) {
+ atomic_dec_32(&zp->z_sync_cnt);
+ ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
+ }
+
+ return (0);
+}
+
+/*
+ * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
+ * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
+ */
+static int
+zfs_holey(vnode_t *vp, int cmd, offset_t *off)
+{
+ znode_t *zp = VTOZ(vp);
+ uint64_t noff = (uint64_t)*off; /* new offset */
+ uint64_t file_sz;
+ int error;
+ boolean_t hole;
+
+ file_sz = zp->z_phys->zp_size;
+ if (noff >= file_sz) {
+ return (ENXIO);
+ }
+
+ if (cmd == FIOSEEKHOLE)
+ hole = B_TRUE;
+ else
+ hole = B_FALSE;
+
+ error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
+
+ /* end of file? */
+ if ((error == ESRCH) || (noff > file_sz)) {
+ /*
+ * Handle the virtual hole at the end of file.
+ */
+ if (hole) {
+ *off = file_sz;
+ return (0);
+ }
+ return (ENXIO);
+ }
+
+ if (noff < *off)
+ return (error);
+ *off = noff;
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_ioctl(ap)
+ struct vop_ioctl_args /* {
+ struct vnode *a_vp;
+ u_long a_command;
+ caddr_t a_data;
+ int fflag;
+ struct ucred *cred;
+ struct thread *td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ int com = ap->a_command;
+ caddr_t data = ap->a_data;
+ offset_t off;
+ zfsvfs_t *zfsvfs;
+ int error;
+
+ switch (com) {
+ case FIOSEEKDATA:
+ case FIOSEEKHOLE:
+ off = *(offset_t *)data;
+
+ zfsvfs = VTOZ(vp)->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+
+ /* offset parameter is in/out */
+ error = zfs_holey(vp, com, &off);
+ ZFS_EXIT(zfsvfs);
+ if (error)
+ return (error);
+ *(offset_t *)data = off;
+ return (0);
+ }
+ return (ENOTTY);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Write: If we find a memory mapped page, we write to *both*
+ * the page and the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ * the file is memory mapped.
+ */
+static int
+mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
+{
+ znode_t *zp = VTOZ(vp);
+ objset_t *os = zp->z_zfsvfs->z_os;
+ vm_object_t obj;
+ vm_page_t m;
+ struct sf_buf *sf;
+ int64_t start, off;
+ int len = nbytes;
+ int error = 0;
+
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+
+ start = uio->uio_loffset;
+ off = start & PAGEOFFSET;
+ VM_OBJECT_LOCK(obj);
+ for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+ uint64_t bytes = MIN(PAGESIZE - off, len);
+ uint64_t woff = uio->uio_loffset;
+
+again:
+ if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+ vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
+ caddr_t va;
+
+ if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
+ goto again;
+ vm_page_busy(m);
+ VM_OBJECT_UNLOCK(obj);
+ sched_pin();
+ sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
+ va = (caddr_t)sf_buf_kva(sf);
+ error = uiomove(va+off, bytes, UIO_WRITE, uio);
+ if (error == 0)
+ dmu_write(os, zp->z_id, woff, bytes, va+off, tx);
+ sf_buf_free(sf);
+ sched_unpin();
+ VM_OBJECT_LOCK(obj);
+ vm_page_wakeup(m);
+ } else {
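+			/* No valid resident page; write through the DMU only. */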
+ VM_OBJECT_UNLOCK(obj);
+ error = dmu_write_uio(os, zp->z_id, uio, bytes, tx);
+ VM_OBJECT_LOCK(obj);
+ }
+ len -= bytes;
+ off = 0;
+ if (error)
+ break;
+ }
+ VM_OBJECT_UNLOCK(obj);
+ return (error);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Read: We "read" preferentially from memory mapped pages,
+ * else we default from the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ * the file is memory mapped.
+ */
+static int
+mappedread(vnode_t *vp, int nbytes, uio_t *uio)
+{
+ znode_t *zp = VTOZ(vp);
+ objset_t *os = zp->z_zfsvfs->z_os;
+ vm_object_t obj;
+ vm_page_t m;
+ struct sf_buf *sf;
+ int64_t start, off;
+ int len = nbytes;
+ int error = 0;
+
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+
+ start = uio->uio_loffset;
+ off = start & PAGEOFFSET;
+ VM_OBJECT_LOCK(obj);
+ for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+ uint64_t bytes = MIN(PAGESIZE - off, len);
+
+again:
+ if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+ vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
+ caddr_t va;
+
+ if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
+ goto again;
+ vm_page_busy(m);
+ VM_OBJECT_UNLOCK(obj);
+ sched_pin();
+ sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
+ va = (caddr_t)sf_buf_kva(sf);
+ error = uiomove(va + off, bytes, UIO_READ, uio);
+ sf_buf_free(sf);
+ sched_unpin();
+ VM_OBJECT_LOCK(obj);
+ vm_page_wakeup(m);
+ } else {
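+			/* No valid resident page; read the data from the DMU instead. */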
+ VM_OBJECT_UNLOCK(obj);
+ error = dmu_read_uio(os, zp->z_id, uio, bytes);
+ VM_OBJECT_LOCK(obj);
+ }
+ len -= bytes;
+ off = 0;
+ if (error)
+ break;
+ }
+ VM_OBJECT_UNLOCK(obj);
+ return (error);
+}
+
+offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ * IN: vp - vnode of file to be read from.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * ioflag - SYNC flags; used to provide FRSYNC semantics.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Side Effects:
+ * vp - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+static int
+zfs_read(ap)
+ struct vop_read_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ uio_t *uio = ap->a_uio;
+ int ioflag = ap->a_ioflag;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ ssize_t n, nbytes;
+ int error;
+ rl_t *rl;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Validate file offset
+ */
+ if (uio->uio_loffset < (offset_t)0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * Fasttrack empty reads
+ */
+ if (uio->uio_resid == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /*
+	 * If we're in FRSYNC mode (IO_SYNC on FreeBSD), sync out this znode
+	 * before reading it.
+ */
+ if (ioflag & IO_SYNC)
+ zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
+
+ /*
+ * Lock the range against changes.
+ */
+ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
+
+ /*
+ * If we are reading past end-of-file we can skip
+ * to the end; but we might still need to set atime.
+ */
+ if (uio->uio_loffset >= zp->z_phys->zp_size) {
+ error = 0;
+ goto out;
+ }
+
+ ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
+ n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
+
+ while (n > 0) {
+ nbytes = MIN(n, zfs_read_chunk_size -
+ P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
+
+ if (vn_has_cached_data(vp))
+ error = mappedread(vp, nbytes, uio);
+ else
+ error = dmu_read_uio(os, zp->z_id, uio, nbytes);
+ if (error)
+ break;
+ n -= nbytes;
+ }
+
+out:
+ zfs_range_unlock(rl);
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Fault in the pages of the first n bytes specified by the uio structure.
+ * 1 byte in each page is touched and the uio struct is unmodified.
+ * Any error will exit this routine as this is only a best
+ * attempt to get the pages resident. This is a copy of ufs_trans_touch().
+ */
+static void
+zfs_prefault_write(ssize_t n, struct uio *uio)
+{
+ struct iovec *iov;
+ ulong_t cnt, incr;
+ caddr_t p;
+
+ if (uio->uio_segflg != UIO_USERSPACE)
+ return;
+
+ iov = uio->uio_iov;
+
+ while (n) {
+ cnt = MIN(iov->iov_len, n);
+ if (cnt == 0) {
+ /* empty iov entry */
+ iov++;
+ continue;
+ }
+ n -= cnt;
+ /*
+ * touch each page in this segment.
+ */
+ p = iov->iov_base;
+ while (cnt) {
+ if (fubyte(p) == -1)
+ return;
+ incr = MIN(cnt, PAGESIZE);
+ p += incr;
+ cnt -= incr;
+ }
+ /*
+ * touch the last byte in case it straddles a page.
+ */
+ p--;
+ if (fubyte(p) == -1)
+ return;
+ iov++;
+ }
+}
+
+/*
+ * Write the bytes to a file.
+ *
+ * IN: vp - vnode of file to be written to.
+ * uio - structure supplying write location, range info,
+ * and data buffer.
+ * ioflag - IO_APPEND flag set if in append mode.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated if byte count > 0
+ */
+/* ARGSUSED */
+static int
+zfs_write(ap)
+ struct vop_write_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ uio_t *uio = ap->a_uio;
+ int ioflag = ap->a_ioflag;
+ cred_t *cr = ap->a_cred;
+ znode_t *zp = VTOZ(vp);
+ ssize_t start_resid = uio->uio_resid;
+ ssize_t tx_bytes;
+ uint64_t end_size;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ offset_t woff;
+ ssize_t n, nbytes;
+ rl_t *rl;
+ int max_blksz = zfsvfs->z_max_blksz;
+ int error;
+
+ /*
+ * Fasttrack empty write
+ */
+ n = start_resid;
+ if (n == 0)
+ return (0);
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
+	 * don't hold up the txg.
+ */
+ zfs_prefault_write(n, uio);
+
+ /*
+ * If in append mode, set the io offset pointer to eof.
+ */
+ if (ioflag & IO_APPEND) {
+ /*
+ * Range lock for a file append:
+ * The value for the start of range will be determined by
+ * zfs_range_lock() (to guarantee append semantics).
+ * If this write will cause the block size to increase,
+ * zfs_range_lock() will lock the entire file, so we must
+ * later reduce the range after we grow the block size.
+ */
+ rl = zfs_range_lock(zp, 0, n, RL_APPEND);
+ if (rl->r_len == UINT64_MAX) {
+ /* overlocked, zp_size can't change */
+ woff = uio->uio_loffset = zp->z_phys->zp_size;
+ } else {
+ woff = uio->uio_loffset = rl->r_off;
+ }
+ } else {
+ woff = uio->uio_loffset;
+ /*
+ * Validate file offset
+ */
+ if (woff < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * If we need to grow the block size then zfs_range_lock()
+ * will lock a wider range than we request here.
+ * Later after growing the block size we reduce the range.
+ */
+ rl = zfs_range_lock(zp, woff, n, RL_WRITER);
+ }
+
+ end_size = MAX(zp->z_phys->zp_size, woff + n);
+
+ /*
+ * Write the file in reasonable size chunks. Each chunk is written
+ * in a separate transaction; this keeps the intent log records small
+ * and allows us to do more fine-grained space accounting.
+ */
+ while (n > 0) {
+ /*
+ * Start a transaction.
+ */
+ woff = uio->uio_loffset;
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ if (error == ERESTART &&
+ zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ continue;
+ }
+ dmu_tx_abort(tx);
+ break;
+ }
+
+ /*
+ * If zfs_range_lock() over-locked we grow the blocksize
+ * and then reduce the lock range. This will only happen
+ * on the first iteration since zfs_range_reduce() will
+ * shrink down r_len to the appropriate size.
+ */
+ if (rl->r_len == UINT64_MAX) {
+ uint64_t new_blksz;
+
+ if (zp->z_blksz > max_blksz) {
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+ } else {
+ new_blksz = MIN(end_size, max_blksz);
+ }
+ zfs_grow_blocksize(zp, new_blksz, tx);
+ zfs_range_reduce(rl, woff, n);
+ }
+
+ /*
+ * XXX - should we really limit each write to z_max_blksz?
+ * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+ */
+ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+ rw_enter(&zp->z_map_lock, RW_READER);
+
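+		/*
+		 * Remember uio_resid before the copy so we can tell how many
+		 * bytes this transaction actually wrote.
+		 */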
+ tx_bytes = uio->uio_resid;
+ if (woff + nbytes > zp->z_phys->zp_size)
+ vnode_pager_setsize(vp, woff + nbytes);
+
+ if (vn_has_cached_data(vp)) {
+ rw_exit(&zp->z_map_lock);
+ error = mappedwrite(vp, nbytes, uio, tx);
+ } else {
+ error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
+ uio, nbytes, tx);
+ rw_exit(&zp->z_map_lock);
+ }
+ tx_bytes -= uio->uio_resid;
+
+ /*
+ * If we made no progress, we're done. If we made even
+ * partial progress, update the znode and ZIL accordingly.
+ */
+ if (tx_bytes == 0) {
+ dmu_tx_commit(tx);
+ ASSERT(error != 0);
+ break;
+ }
+
+ /*
+ * Clear Set-UID/Set-GID bits on successful write if not
+		 * privileged and at least one of the execute bits is set.
+ *
+		 * It would be nice to do this after all writes have
+ * been done, but that would still expose the ISUID/ISGID
+ * to another app after the partial write is committed.
+ */
+ mutex_enter(&zp->z_acl_lock);
+ if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
+ (S_IXUSR >> 6))) != 0 &&
+ (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(cr,
+ (zp->z_phys->zp_mode & S_ISUID) != 0 &&
+ zp->z_phys->zp_uid == 0) != 0) {
+ zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ /*
+ * Update time stamp. NOTE: This marks the bonus buffer as
+ * dirty, so we don't have to do it again for zp_size.
+ */
+ zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+
+ /*
+ * Update the file size (zp_size) if it has changed;
+ * account for possible concurrent updates.
+ */
+ while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
+ (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+ uio->uio_loffset);
+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
+ dmu_tx_commit(tx);
+
+ if (error != 0)
+ break;
+ ASSERT(tx_bytes == nbytes);
+ n -= nbytes;
+ }
+
+ zfs_range_unlock(rl);
+
+ /*
+ * If we're in replay mode, or we made no progress, return error.
+ * Otherwise, it's at least a partial write, so it's successful.
+ */
+ if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (ioflag & IO_SYNC)
+ zil_commit(zilog, zp->z_last_itx, zp->z_id);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+void
+zfs_get_done(dmu_buf_t *db, void *vzgd)
+{
+ zgd_t *zgd = (zgd_t *)vzgd;
+ rl_t *rl = zgd->zgd_rl;
+ vnode_t *vp = ZTOV(rl->r_zp);
+ int vfslocked;
+
+ vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
+ dmu_buf_rele(db, vzgd);
+ zfs_range_unlock(rl);
+ VN_RELE(vp);
+ zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
+ kmem_free(zgd, sizeof (zgd_t));
+ VFS_UNLOCK_GIANT(vfslocked);
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+ zfsvfs_t *zfsvfs = arg;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *zp;
+ uint64_t off = lr->lr_offset;
+ dmu_buf_t *db;
+ rl_t *rl;
+ zgd_t *zgd;
+ int dlen = lr->lr_length; /* length of user data */
+ int error = 0;
+
+ ASSERT(zio);
+ ASSERT(dlen != 0);
+
+ /*
+ * Nothing to do if the file has been removed
+ */
+ if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
+ return (ENOENT);
+ if (zp->z_unlinked) {
+ VN_RELE(ZTOV(zp));
+ return (ENOENT);
+ }
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ rl = zfs_range_lock(zp, off, dlen, RL_READER);
+ /* test for truncation needs to be done while range locked */
+ if (off >= zp->z_phys->zp_size) {
+ error = ENOENT;
+ goto out;
+ }
+ VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
+ } else { /* indirect write */
+ uint64_t boff; /* block starting offset */
+
+ /*
+ * Have to lock the whole block to ensure when it's
+		 * written out and its checksum is being calculated
+ * that no one can change the data. We need to re-check
+ * blocksize after we get the lock in case it's changed!
+ */
+ for (;;) {
+ if (ISP2(zp->z_blksz)) {
+ boff = P2ALIGN_TYPED(off, zp->z_blksz,
+ uint64_t);
+ } else {
+ boff = 0;
+ }
+ dlen = zp->z_blksz;
+ rl = zfs_range_lock(zp, boff, dlen, RL_READER);
+ if (zp->z_blksz == dlen)
+ break;
+ zfs_range_unlock(rl);
+ }
+ /* test for truncation needs to be done while range locked */
+ if (off >= zp->z_phys->zp_size) {
+ error = ENOENT;
+ goto out;
+ }
+ zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_rl = rl;
+ zgd->zgd_zilog = zfsvfs->z_log;
+ zgd->zgd_bp = &lr->lr_blkptr;
+ VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
+ ASSERT(boff == db->db_offset);
+ lr->lr_blkoff = off - boff;
+ error = dmu_sync(zio, db, &lr->lr_blkptr,
+ lr->lr_common.lrc_txg, zfs_get_done, zgd);
+ ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz);
+ if (error == 0) {
+ zil_add_vdev(zfsvfs->z_log,
+ DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
+ }
+ /*
+ * If we get EINPROGRESS, then we need to wait for a
+ * write IO initiated by dmu_sync() to complete before
+ * we can release this dbuf. We will finish everything
+ * up in the zfs_get_done() callback.
+ */
+ if (error == EINPROGRESS)
+ return (0);
+ dmu_buf_rele(db, zgd);
+ kmem_free(zgd, sizeof (zgd_t));
+ }
+out:
+ zfs_range_unlock(rl);
+ VN_RELE(ZTOV(zp));
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+zfs_access(ap)
+ struct vop_access_args /* {
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ error = zfs_zaccess_rwx(zp, ap->a_mode, ap->a_cred);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ * IN: dvp - vnode of directory to search.
+ * nm - name of entry to lookup.
+ * pnp - full pathname to lookup [UNUSED].
+ * flags - LOOKUP_XATTR set if looking for an attribute.
+ * rdir - root directory vnode [UNUSED].
+ * cr - credentials of caller.
+ *
+ * OUT: vpp - vnode of located entry, NULL if not found.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * NA
+ */
+/* ARGSUSED */
+int
+zfs_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ kthread_t *td = ap->a_cnp->cn_thread;
+ struct componentname *cnp = ap->a_cnp;
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ u_long flags = cnp->cn_flags;
+ cred_t *cr = cnp->cn_cred;
+ int nameiop = cnp->cn_nameiop;
+ char nm[NAME_MAX + 1];
+ znode_t *zdp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+ int error;
+
+ ASSERT(cnp->cn_namelen < sizeof(nm));
+ strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
+
+ ZFS_ENTER(zfsvfs);
+
+ *vpp = NULL;
+
+#ifdef TODO
+ if (flags & LOOKUP_XATTR) {
+ /*
+ * If the xattr property is off, refuse the lookup request.
+ */
+ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+		 * We don't allow recursive attributes.
+ * Maybe someday we will.
+ */
+ if (zdp->z_phys->zp_flags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Do we have permission to get into attribute directory?
+ */
+
+ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
+ VN_RELE(*vpp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+#endif /* TODO */
+
+ if (dvp->v_type != VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (ENOTDIR);
+ }
+
+ /*
+ * Check accessibility of directory.
+ */
+
+ if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_dirlook(zdp, nm, vpp);
+
+ ZFS_EXIT(zfsvfs);
+
+ /* Translate errors and add SAVENAME when needed. */
+ if (cnp->cn_flags & ISLASTCN) {
+ switch (nameiop) {
+ case CREATE:
+ case RENAME:
+ if (error == ENOENT) {
+ error = EJUSTRETURN;
+ cnp->cn_flags |= SAVENAME;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DELETE:
+ if (error == 0)
+ cnp->cn_flags |= SAVENAME;
+ break;
+ }
+ }
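+	/*
+	 * Lock the vnode we are about to return. For ".." the target is
+	 * the parent of dvp, so dvp is unlocked first to preserve the
+	 * top-down (parent before child) locking order.
+	 */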
+ if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
+ if (flags & ISDOTDOT)
+ VOP_UNLOCK(dvp, 0, td);
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (flags & ISDOTDOT)
+ vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+ }
+
+#ifdef FREEBSD_NAMECACHE
+ /*
+ * Insert name into cache (as non-existent) if appropriate.
+ */
+ if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
+ cache_enter(dvp, *vpp, cnp);
+ /*
+ * Insert name into cache if appropriate.
+ */
+ if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+ if (!(cnp->cn_flags & ISLASTCN) ||
+ (nameiop != DELETE && nameiop != RENAME)) {
+ cache_enter(dvp, *vpp, cnp);
+ }
+ }
+#endif
+
+ return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory. If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error. Return the vp of the created or trunc'd file.
+ *
+ * IN: dvp - vnode of directory to put new file entry in.
+ * name - name of new file entry.
+ * vap - attributes of new file.
+ * excl - flag indicating exclusive or non-exclusive mode.
+ * mode - mode to open file with.
+ * cr - credentials of caller.
+ * flag - large file flag [UNUSED].
+ *
+ * OUT: vpp - vnode of created or trunc'd entry.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated if new entry created
+ * vp - ctime|mtime always, atime if new
+ */
+/* ARGSUSED */
+static int
+zfs_create(ap)
+ struct vop_create_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ char *name = ap->a_cnp->cn_nameptr;
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ objset_t *os = zfsvfs->z_os;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ uint64_t zoid;
+ int mode;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ vattr_init_mask(vap);
+ mode = vap->va_mode & ALLPERMS;
+
+ ZFS_ENTER(zfsvfs);
+
+top:
+ *vpp = NULL;
+
+ if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
+ vap->va_mode &= ~VSVTX;
+
+ if (*name == '\0') {
+ /*
+ * Null component name refers to the directory itself.
+ */
+ VN_HOLD(dvp);
+ zp = dzp;
+ dl = NULL;
+ error = 0;
+ } else {
+ /* possible VN_HOLD(zp) */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
+ if (strcmp(name, "..") == 0)
+ error = EISDIR;
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ zoid = zp ? zp->z_id : -1ULL;
+
+ if (zp == NULL) {
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+ goto out;
+ }
+
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
+ if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
+ (vap->va_type != VREG)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART &&
+ zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+ ASSERT(zp->z_id == zoid);
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+ zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
+ dmu_tx_commit(tx);
+ } else {
+ /*
+ * A directory entry already exists for this name.
+ */
+
+ /*
+ * Can't open a directory for writing.
+ */
+ if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
+ error = EISDIR;
+ goto out;
+ }
+ /*
+ * Verify requested access to file.
+ */
+ if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
+ goto out;
+ }
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_seq++;
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * Truncate regular files if requested.
+ */
+ if ((ZTOV(zp)->v_type == VREG) &&
+ (zp->z_phys->zp_size != 0) &&
+ (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
+ error = zfs_freesp(zp, 0, 0, mode, TRUE);
+ if (error == ERESTART &&
+ zfsvfs->z_assign == TXG_NOWAIT) {
+ /* NB: we already did dmu_tx_wait() */
+ zfs_dirent_unlock(dl);
+ VN_RELE(ZTOV(zp));
+ goto top;
+ }
+ }
+ }
+out:
+
+ if (error == 0) {
+ *vpp = ZTOV(zp);
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
+ }
+
+ if (dl)
+ zfs_dirent_unlock(dl);
+
+ if (error) {
+ if (zp)
+ VN_RELE(ZTOV(zp));
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ * IN: dvp - vnode of directory to remove entry from.
+ * name - name of entry to remove.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime
+ * vp - ctime (if nlink > 0)
+ */
+static int
+zfs_remove(ap)
+ struct vop_remove_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ char *name = ap->a_cnp->cn_nameptr;
+ znode_t *zp, *dzp = VTOZ(dvp);
+#if 0
+ znode_t *xzp = NULL;
+#endif
+ vnode_t *vp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t xattr_obj;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ boolean_t unlinked;
+ int error;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ ZFS_ENTER(zfsvfs);
+
+top:
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ vp = ZTOV(zp);
+ ASSERT(vp == ap->a_vp);
+ ASSERT(vp->v_type != VDIR);
+ /* Drop an extra reference from zfs_dirent_lock(). */
+ VN_RELE(vp);
+ ASSERT(vp->v_count > 0);
+
+ if (error = zfs_zaccess_delete(dzp, zp, cr))
+ goto out;
+
+ dnlc_remove(dvp, name);
+
+ /*
+ * We may delete the znode now, or we may put it in the unlinked set;
+ * it depends on whether we're the last link, and on whether there are
+ * other holds on the vnode. So we dmu_tx_hold() the right things to
+ * allow for either case.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+
+ /* are there any extended attributes? */
+ if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
+ /* XXX - do we need this if we are deleting? */
+ dmu_tx_hold_bonus(tx, xattr_obj);
+ }
+
+ /* charge as an update -- would be nice not to charge at all */
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Remove the directory entry.
+ */
+ error = zfs_link_destroy(dl, zp, tx, 0, &unlinked);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ if (unlinked)
+ zfs_unlinked_add(zp, tx);
+
+ zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
+
+ dmu_tx_commit(tx);
+out:
+ zfs_dirent_unlock(dl);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Create a new directory and insert it into dvp using the name
+ * provided. Return a pointer to the inserted directory.
+ *
+ * IN: dvp - vnode of directory to add subdir to.
+ * dirname - name of new directory.
+ * vap - attributes of new directory.
+ * cr - credentials of caller.
+ *
+ * OUT: vpp - vnode of created directory.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ * vp - ctime|mtime|atime updated
+ */
+static int
+zfs_mkdir(ap)
+ struct vop_mkdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ char *dirname = ap->a_cnp->cn_nameptr;
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ zfs_dirlock_t *dl;
+ uint64_t zoid = 0;
+ dmu_tx_t *tx;
+ int error;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+ ASSERT(vap->va_type == VDIR);
+ vattr_init_mask(vap);
+
+ ZFS_ENTER(zfsvfs);
+
+ if (dzp->z_phys->zp_flags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+top:
+ *vpp = NULL;
+
+ /*
+ * First make sure the new directory doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Add a new entry to the directory.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create new node.
+ */
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+
+ /*
+ * Now put new name in parent dir.
+ */
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+
+ *vpp = ZTOV(zp);
+
+ zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
+ dmu_tx_commit(tx);
+
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
+
+ zfs_dirent_unlock(dl);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Remove a directory subdir entry. If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ * IN: dvp - vnode of directory to remove from.
+ * name - name of directory to be removed.
+ * cwd - vnode of current working directory.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+static int
+zfs_rmdir(ap)
+ struct vop_rmdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ char *name = ap->a_cnp->cn_nameptr;
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp;
+ vnode_t *vp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ ZFS_ENTER(zfsvfs);
+top:
+ zp = NULL;
+
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ vp = ZTOV(zp);
+ ASSERT(vp == ap->a_vp);
+ ASSERT(vp->v_type == VDIR);
+ /* Drop an extra reference from zfs_dirent_lock(). */
+ VN_RELE(vp);
+ ASSERT(vp->v_count > 0);
+
+ if (error = zfs_zaccess_delete(dzp, zp, cr)) {
+ goto out;
+ }
+
+ /*
+ * Grab a lock on the directory to make sure that noone is
+ * trying to add (or lookup) entries while we are removing it.
+ */
+ rw_enter(&zp->z_name_lock, RW_WRITER);
+
+ /*
+ * Grab a lock on the parent pointer to make sure we play well
+ * with the treewalk and directory rename code.
+ */
+ rw_enter(&zp->z_parent_lock, RW_WRITER);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ rw_exit(&zp->z_parent_lock);
+ rw_exit(&zp->z_name_lock);
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+#ifdef FREEBSD_NAMECACHE
+ cache_purge(dvp);
+#endif
+
+ error = zfs_link_destroy(dl, zp, tx, 0, NULL);
+
+ if (error == 0)
+ zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
+
+ dmu_tx_commit(tx);
+
+ rw_exit(&zp->z_parent_lock);
+ rw_exit(&zp->z_name_lock);
+#ifdef FREEBSD_NAMECACHE
+ cache_purge(vp);
+#endif
+out:
+ zfs_dirent_unlock(dl);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Read as many directory entries as will fit into the provided
+ * buffer from the given directory cursor position (specified in
+ * the uio structure).
+ *
+ * IN: vp - vnode of directory to read.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ * eofp - set to true if end-of-file detected.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by the zap are always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+static int
+zfs_readdir(ap)
+ struct vop_readdir_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *ncookies;
+ u_long **a_cookies;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ uio_t *uio = ap->a_uio;
+ int *eofp = ap->a_eofflag;
+ znode_t *zp = VTOZ(vp);
+ iovec_t *iovp;
+ dirent64_t *odp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os;
+ caddr_t outbuf;
+ size_t bufsize;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ uint_t bytes_wanted;
+ uint64_t offset; /* must be unsigned; checks for < 1 */
+ int local_eof;
+ int outcount;
+ int error;
+ uint8_t prefetch;
+ uint8_t type;
+ int ncookies;
+ u_long *cookies = NULL;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * If we are not given an eof variable,
+ * use a local one.
+ */
+ if (eofp == NULL)
+ eofp = &local_eof;
+
+ /*
+ * Check for valid iov_len.
+ */
+ if (uio->uio_iov->iov_len <= 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+	 * Quit if directory has been removed (POSIX).
+ */
+ if ((*eofp = zp->z_unlinked) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ error = 0;
+ os = zfsvfs->z_os;
+ offset = uio->uio_loffset;
+ prefetch = zp->z_zn_prefetch;
+
+ /*
+ * Initialize the iterator cursor.
+ */
+ if (offset <= 3) {
+ /*
+ * Start iteration from the beginning of the directory.
+ */
+ zap_cursor_init(&zc, os, zp->z_id);
+ } else {
+ /*
+ * The offset is a serialized cursor.
+ */
+ zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+ }
+
+ /*
+ * Get space to change directory entries into fs independent format.
+ */
+ iovp = uio->uio_iov;
+ bytes_wanted = iovp->iov_len;
+ if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
+ bufsize = bytes_wanted;
+ outbuf = kmem_alloc(bufsize, KM_SLEEP);
+ odp = (struct dirent64 *)outbuf;
+ } else {
+ bufsize = bytes_wanted;
+ odp = (struct dirent64 *)iovp->iov_base;
+ }
+
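+	/*
+	 * FreeBSD callers (e.g. the NFS server) may request seek cookies,
+	 * one per directory entry returned.
+	 */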
+ if (ap->a_ncookies) {
+ /*
+		 * Minimum entry size is the dirent size plus 1 byte for
+		 * the file name.
+		 */
+		ncookies = uio->uio_resid / (sizeof(struct dirent) -
+		    sizeof(((struct dirent *)NULL)->d_name) + 1);
+ cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK);
+ *ap->a_cookies = cookies;
+ *ap->a_ncookies = ncookies;
+ }
+
+ /*
+ * Transform to file-system independent format
+ */
+ outcount = 0;
+ while (outcount < bytes_wanted) {
+ ino64_t objnum;
+ ushort_t reclen;
+
+ /*
+ * Special case `.', `..', and `.zfs'.
+ */
+		if (offset == 0) {
+			(void) strcpy(zap.za_name, ".");
+			objnum = zp->z_id;
+			type = DT_DIR;
+		} else if (offset == 1) {
+			(void) strcpy(zap.za_name, "..");
+			objnum = zp->z_phys->zp_parent;
+			type = DT_DIR;
+		} else if (offset == 2 && zfs_show_ctldir(zp)) {
+			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+			objnum = ZFSCTL_INO_ROOT;
+			type = DT_DIR;
+ } else {
+ /*
+ * Grab next entry.
+ */
+ if (error = zap_cursor_retrieve(&zc, &zap)) {
+ if ((*eofp = (error == ENOENT)) != 0)
+ break;
+ else
+ goto update;
+ }
+
+ if (zap.za_integer_length != 8 ||
+ zap.za_num_integers != 1) {
+ cmn_err(CE_WARN, "zap_readdir: bad directory "
+ "entry, obj = %lld, offset = %lld\n",
+ (u_longlong_t)zp->z_id,
+ (u_longlong_t)offset);
+ error = ENXIO;
+ goto update;
+ }
+
+ objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+			/*
+			 * The entry type is encoded in the high bits of
+			 * the ZAP value; extract it for d_type.
+			 */
+ type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+ }
+ reclen = DIRENT64_RECLEN(strlen(zap.za_name));
+
+ /*
+ * Will this entry fit in the buffer?
+ */
+ if (outcount + reclen > bufsize) {
+ /*
+ * Did we manage to fit anything in the buffer?
+ */
+ if (!outcount) {
+ error = EINVAL;
+ goto update;
+ }
+ break;
+ }
+ /*
+ * Add this entry:
+ */
+ odp->d_ino = objnum;
+ odp->d_reclen = reclen;
+ odp->d_namlen = strlen(zap.za_name);
+ (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
+ odp->d_type = type;
+ outcount += reclen;
+ odp = (dirent64_t *)((intptr_t)odp + reclen);
+
+ ASSERT(outcount <= bufsize);
+
+ /* Prefetch znode */
+ if (prefetch)
+ dmu_prefetch(os, objnum, 0, 0);
+
+ /*
+ * Move to the next entry, fill in the previous offset.
+ */
+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+ zap_cursor_advance(&zc);
+ offset = zap_cursor_serialize(&zc);
+ } else {
+ offset += 1;
+ }
+
+ if (cookies != NULL) {
+ *cookies++ = offset;
+ ncookies--;
+ KASSERT(ncookies >= 0, ("ncookies=%d", ncookies));
+ }
+ }
+ zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+ /* Subtract unused cookies */
+ if (ap->a_ncookies)
+ *ap->a_ncookies -= ncookies;
+
+ if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
+ iovp->iov_base += outcount;
+ iovp->iov_len -= outcount;
+ uio->uio_resid -= outcount;
+ } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
+ /*
+ * Reset the pointer.
+ */
+ offset = uio->uio_loffset;
+ }
+
+update:
+ zap_cursor_fini(&zc);
+ if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
+ kmem_free(outbuf, bufsize);
+
+ if (error == ENOENT)
+ error = 0;
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+ uio->uio_loffset = offset;
+ ZFS_EXIT(zfsvfs);
+	if (error != 0 && ap->a_ncookies != NULL) {
+ free(*ap->a_cookies, M_TEMP);
+ *ap->a_cookies = NULL;
+ *ap->a_ncookies = 0;
+ }
+ return (error);
+}
+
+static int
+zfs_fsync(struct vop_fsync_args *ap)
+{
+ znode_t *zp = VTOZ(ap->a_vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
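+	/* Flush dirty buffers for this vnode through the standard path. */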
+ vop_stdfsync(ap);
+
+ ZFS_ENTER(zfsvfs);
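+	/*
+	 * Push any outstanding intent log records for this file out to
+	 * stable storage.
+	 */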
+ zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Get the requested file attributes and place them in the provided
+ * vattr structure.
+ *
+ * IN: vp - vnode of file.
+ * vap - va_mask identifies requested attributes.
+ * flags - [UNUSED]
+ * cr - credentials of caller.
+ *
+ * OUT: vap - attribute values.
+ *
+ * RETURN: 0 (always succeeds)
+ */
+/* ARGSUSED */
+static int
+zfs_getattr(ap)
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cr = ap->a_cred;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_phys_t *pzp = zp->z_phys;
+ uint32_t blksize;
+ u_longlong_t nblocks;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Return all attributes. It's cheaper to provide the answer
+ * than to determine whether we were asked the question.
+ */
+ mutex_enter(&zp->z_lock);
+
+ vap->va_type = IFTOVT(pzp->zp_mode);
+ vap->va_mode = pzp->zp_mode & ~S_IFMT;
+ vap->va_uid = zp->z_phys->zp_uid;
+ vap->va_gid = zp->z_phys->zp_gid;
+ vap->va_nodeid = zp->z_id;
+ vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */
+ vap->va_size = pzp->zp_size;
+ vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+ vap->va_seq = zp->z_seq;
+ vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
+
+ ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
+ ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
+ ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
+
+ /*
+	 * If the ACL is trivial, don't bother looking for ACE_READ_ATTRIBUTES.
+	 * Also, if we are the owner, don't bother, since the owner should
+	 * always be allowed to read the basic attributes of the file.
+ */
+ if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) &&
+ (zp->z_phys->zp_uid != crgetuid(cr))) {
+ if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
+ mutex_exit(&zp->z_lock);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ mutex_exit(&zp->z_lock);
+
+ dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
+ vap->va_blksize = blksize;
+ vap->va_bytes = nblocks << 9; /* nblocks * 512 */
+
+ if (zp->z_blksz == 0) {
+ /*
+ * Block size hasn't been set; suggest maximal I/O transfers.
+ */
+ vap->va_blksize = zfsvfs->z_max_blksz;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ * IN: vp - vnode of file to be modified.
+ * vap - new attribute values.
+ * flags - ATTR_UTIME set if non-default time values provided.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+static int
+zfs_setattr(ap)
+ struct vop_setattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cr = ap->a_cred;
+ znode_t *zp = VTOZ(vp);
+ znode_phys_t *pzp = zp->z_phys;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ dmu_tx_t *tx;
+ vattr_t oldva;
+ uint_t mask;
+ uint_t saved_mask;
+ int trim_mask = 0;
+ uint64_t new_mode;
+ znode_t *attrzp;
+ int need_policy = FALSE;
+ int flags = 0;
+ int err;
+
+ /* No support for FreeBSD's chflags(2). */
+ if (vap->va_flags != VNOVAL)
+ return (EOPNOTSUPP);
+
+ vattr_init_mask(vap);
+ mask = vap->va_mask;
+
+ if (mask == 0)
+ return (0);
+
+ if (mask & AT_SIZE && vp->v_type == VDIR)
+ return (EISDIR);
+
+ if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
+ return (EINVAL);
+
+ ZFS_ENTER(zfsvfs);
+
+top:
+ attrzp = NULL;
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ ZFS_EXIT(zfsvfs);
+ return (EROFS);
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & AT_SIZE) {
+ err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
+ */
+ do {
+ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
+ /* NB: we already did dmu_tx_wait() if necessary */
+ } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ }
+
+ if (mask & (AT_ATIME|AT_MTIME))
+ need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
+
+ if (mask & (AT_UID|AT_GID)) {
+ int idmask = (mask & (AT_UID|AT_GID));
+ int take_owner;
+ int take_group;
+
+ /*
+ * NOTE: even if a new mode is being set,
+ * we may clear S_ISUID/S_ISGID bits.
+ */
+
+ if (!(mask & AT_MODE))
+ vap->va_mode = pzp->zp_mode;
+
+ /*
+ * Take ownership or chgrp to group we are a member of
+ */
+
+ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
+ take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
+
+ /*
+ * If both AT_UID and AT_GID are set then take_owner and
+ * take_group must both be set in order to allow taking
+ * ownership.
+ *
+		 * Otherwise, send the check through secpolicy_vnode_setattr().
+		 */
+
+ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
+ ((idmask == AT_UID) && take_owner) ||
+ ((idmask == AT_GID) && take_group)) {
+ if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
+ /*
+ * Remove setuid/setgid for non-privileged users
+ */
+ secpolicy_setid_clear(vap, cr);
+ trim_mask = (mask & (AT_UID|AT_GID));
+ } else {
+ need_policy = TRUE;
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ mutex_enter(&zp->z_lock);
+ oldva.va_mode = pzp->zp_mode;
+ oldva.va_uid = zp->z_phys->zp_uid;
+ oldva.va_gid = zp->z_phys->zp_gid;
+ mutex_exit(&zp->z_lock);
+
+ if (mask & AT_MODE) {
+ if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) {
+ err = secpolicy_setid_setsticky_clear(vp, vap,
+ &oldva, cr);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ trim_mask |= AT_MODE;
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ if (need_policy) {
+ /*
+		 * If trim_mask is set then taking ownership has been granted,
+		 * or write_acl is present and the user has the ability to
+		 * modify the mode.  In that case remove UID|GID and/or MODE
+		 * from the mask so that secpolicy_vnode_setattr() doesn't
+		 * revoke it.
+ */
+
+ if (trim_mask) {
+ saved_mask = vap->va_mask;
+ vap->va_mask &= ~trim_mask;
+		}
+ err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
+ (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ if (trim_mask)
+ vap->va_mask |= saved_mask;
+ }
+
+ /*
+	 * secpolicy_vnode_setattr() or taking ownership may have
+	 * changed va_mask.
+ */
+ mask = vap->va_mask;
+
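+	/*
+	 * All permission checks passed; set up the transaction for the
+	 * attribute update.
+	 */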
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+
+ if (mask & AT_MODE) {
+ uint64_t pmode = pzp->zp_mode;
+
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ if (zp->z_phys->zp_acl.z_acl_extern_obj)
+ dmu_tx_hold_write(tx,
+ pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
+ else
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
+ }
+
+ if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) {
+ err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp);
+ if (err) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ dmu_tx_hold_bonus(tx, attrzp->z_id);
+ }
+
+ err = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (err) {
+ if (attrzp)
+ VN_RELE(ZTOV(attrzp));
+ if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ /*
+ * Set each attribute requested.
+ * We group settings according to the locks they need to acquire.
+ *
+ * Note: you cannot set ctime directly, although it will be
+ * updated as a side-effect of calling this function.
+ */
+
+ mutex_enter(&zp->z_lock);
+
+ if (mask & AT_MODE) {
+ err = zfs_acl_chmod_setattr(zp, new_mode, tx);
+ ASSERT3U(err, ==, 0);
+ }
+
+ if (attrzp)
+ mutex_enter(&attrzp->z_lock);
+
+ if (mask & AT_UID) {
+ zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
+ if (attrzp) {
+ attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid;
+ }
+ }
+
+ if (mask & AT_GID) {
+ zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
+ if (attrzp)
+ attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid;
+ }
+
+ if (attrzp)
+ mutex_exit(&attrzp->z_lock);
+
+ if (mask & AT_ATIME)
+ ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+
+ if (mask & AT_MTIME)
+ ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+
+ if (mask & AT_SIZE)
+ zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
+ else if (mask != 0)
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+
+ if (mask != 0)
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask);
+
+ mutex_exit(&zp->z_lock);
+
+ if (attrzp)
+ VN_RELE(ZTOV(attrzp));
+
+ dmu_tx_commit(tx);
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+typedef struct zfs_zlock {
+ krwlock_t *zl_rwlock; /* lock we acquired */
+ znode_t *zl_znode; /* znode we held */
+ struct zfs_zlock *zl_next; /* next in list */
+} zfs_zlock_t;
+
+/*
+ * Drop locks and release vnodes that were held by zfs_rename_lock().
+ */
+static void
+zfs_rename_unlock(zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *zl;
+
+ while ((zl = *zlpp) != NULL) {
+ if (zl->zl_znode != NULL)
+ VN_RELE(ZTOV(zl->zl_znode));
+ rw_exit(zl->zl_rwlock);
+ *zlpp = zl->zl_next;
+ kmem_free(zl, sizeof (*zl));
+ }
+}
+
+/*
+ * Search back through the directory tree, using the ".." entries.
+ * Lock each directory in the chain to prevent concurrent renames.
+ * Fail any attempt to move a directory into one of its own descendants.
+ * XXX - z_parent_lock can overlap with map or grow locks
+ */
+static int
+zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *zl;
+ znode_t *zp = tdzp;
+ uint64_t rootid = zp->z_zfsvfs->z_root;
+ uint64_t *oidp = &zp->z_id;
+ krwlock_t *rwlp = &szp->z_parent_lock;
+ krw_t rw = RW_WRITER;
+
+ /*
+ * First pass write-locks szp and compares to zp->z_id.
+ * Later passes read-lock zp and compare to zp->z_parent.
+ */
+ do {
+ if (!rw_tryenter(rwlp, rw)) {
+ /*
+ * Another thread is renaming in this path.
+ * Note that if we are a WRITER, we don't have any
+ * parent_locks held yet.
+ */
+ if (rw == RW_READER && zp->z_id > szp->z_id) {
+ /*
+ * Drop our locks and restart
+ */
+ zfs_rename_unlock(&zl);
+ *zlpp = NULL;
+ zp = tdzp;
+ oidp = &zp->z_id;
+ rwlp = &szp->z_parent_lock;
+ rw = RW_WRITER;
+ continue;
+ } else {
+ /*
+ * Wait for other thread to drop its locks
+ */
+ rw_enter(rwlp, rw);
+ }
+ }
+
+ zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
+ zl->zl_rwlock = rwlp;
+ zl->zl_znode = NULL;
+ zl->zl_next = *zlpp;
+ *zlpp = zl;
+
+ if (*oidp == szp->z_id) /* We're a descendant of szp */
+ return (EINVAL);
+
+ if (*oidp == rootid) /* We've hit the top */
+ return (0);
+
+ if (rw == RW_READER) { /* i.e. not the first pass */
+ int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
+ if (error)
+ return (error);
+ zl->zl_znode = zp;
+ }
+ oidp = &zp->z_phys->zp_parent;
+ rwlp = &zp->z_parent_lock;
+ rw = RW_READER;
+
+ } while (zp->z_id != sdzp->z_id);
+
+ return (0);
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN: sdvp - Source directory containing the "old entry".
+ * snm - Old entry name.
+ * tdvp - Target directory to contain the "new entry".
+ * tnm - New entry name.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * sdvp,tdvp - ctime|mtime updated
+ */
+static int
+zfs_rename(ap)
+ struct vop_rename_args /* {
+ struct vnode *a_fdvp;
+ struct vnode *a_fvp;
+ struct componentname *a_fcnp;
+ struct vnode *a_tdvp;
+ struct vnode *a_tvp;
+ struct componentname *a_tcnp;
+ } */ *ap;
+{
+ vnode_t *svp = ap->a_fvp;
+ vnode_t *tvp = ap->a_tvp;
+ vnode_t *sdvp = ap->a_fdvp;
+ vnode_t *tdvp = ap->a_tdvp;
+ char *snm = ap->a_fcnp->cn_nameptr;
+ char *tnm = ap->a_tcnp->cn_nameptr;
+ cred_t *cr = ap->a_fcnp->cn_cred;
+ znode_t *tdzp, *szp, *tzp;
+ znode_t *sdzp = VTOZ(sdvp);
+ zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ zfs_dirlock_t *sdl, *tdl;
+ dmu_tx_t *tx;
+ zfs_zlock_t *zl;
+ int serr, terr, error;
+
+ ASSERT(ap->a_fcnp->cn_flags & SAVENAME);
+ ASSERT(ap->a_tcnp->cn_flags & SAVENAME);
+
+ ZFS_ENTER(zfsvfs);
+
+ if (tdvp->v_vfsp != sdvp->v_vfsp) {
+ error = EXDEV;
+abort:
+ ZFS_EXIT(zfsvfs);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ if (tvp)
+ vput(tvp);
+ vrele(sdvp);
+ vrele(svp);
+ return (error);
+ }
+
+ tdzp = VTOZ(tdvp);
+top:
+ szp = NULL;
+ tzp = NULL;
+ zl = NULL;
+
+ /*
+ * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/out of an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
+ (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
+ error = EINVAL;
+ goto abort;
+ }
+
+ /*
+ * POSIX: "If the old argument and the new argument
+ * both refer to links to the same existing file,
+ * the rename() function shall return successfully
+ * and perform no other action."
+ * This case is already handled in kern_rename().
+ */
+ ASSERT(svp != tvp);
+
+ /*
+ * Lock source and target directory entries. To prevent deadlock,
+ * a lock ordering must be defined. We lock the directory with
+ * the smallest object id first, or if it's a tie, the one with
+ * the lexically first name.
+ */
+ if (sdzp->z_id < tdzp->z_id) {
+ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
+ terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
+ } else /* if (sdzp->z_id > tdzp->z_id) */ {
+ terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
+ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
+ }
+ if (!terr && tzp != NULL) {
+ if (tvp)
+ vrele(tvp);
+ else {
+ ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
+ tvp = ZTOV(tzp);
+ vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ }
+ ASSERT(tvp->v_count > 0);
+ }
+
+ if (serr || svp != ZTOV(szp)) {
+ /*
+ * Source entry invalid or not there.
+ */
+ if (!serr)
+ zfs_dirent_unlock(sdl);
+ if (!terr)
+ zfs_dirent_unlock(tdl);
+ if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
+ serr = EINVAL;
+ error = serr;
+ goto abort;
+ }
+ if (terr) {
+ zfs_dirent_unlock(sdl);
+ if (strcmp(tnm, "..") == 0)
+ terr = EINVAL;
+ error = terr;
+ goto abort;
+ }
+ /* Remove an extra reference from zfs_dirent_lock(). */
+ vrele(svp);
+ ASSERT(svp->v_count > 0);
+
+ /*
+ * Must have write access at the source to remove the old entry
+ * and write access at the target to create the new entry.
+ * Note that if target and source are the same, this can be
+ * done in a single check.
+ */
+
+ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
+ goto out;
+
+ if (svp->v_type == VDIR) {
+ /*
+ * Check to make sure rename is valid.
+ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+ */
+ if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
+ goto out;
+ }
+
+ /*
+ * Does target exist?
+ */
+ if (tzp) {
+ /*
+ * Source and target must be the same type.
+ */
+ if (svp->v_type == VDIR) {
+ if (tvp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ } else {
+ if (tvp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+ }
+ /*
+ * POSIX dictates that when the source and target
+ * entries refer to the same file object, rename
+ * must do nothing and exit without error.
+ */
+ if (szp->z_id == tzp->z_id) {
+ error = 0;
+ goto out;
+ }
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
+ dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp)
+ dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
+ if (tzp)
+ dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
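+	/* An existing target may have to be placed on the unlinked set. */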
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ if (ap->a_tvp == NULL && tvp != NULL) {
+ vput(tvp);
+ tvp = NULL;
+ }
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ goto abort;
+ }
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ if (error == 0) {
+ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ ASSERT(error == 0);
+ zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+ sdl->dl_name, tdzp, tdl->dl_name, szp);
+ }
+#ifdef FREEBSD_NAMECACHE
+ if (error == 0) {
+ cache_purge(sdvp);
+ cache_purge(tdvp);
+ }
+#endif
+ }
+
+ dmu_tx_commit(tx);
+out:
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ if (tzp)
+ vput(tvp);
+ vput(tdvp);
+ vrele(sdvp);
+ vrele(svp);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN: dvp - Directory to contain new symbolic link.
+ *	name	- Name for new symlink entry.
+ * vap - Attributes of new entry.
+ * target - Target path of new symlink.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+static int
+zfs_symlink(ap)
+ struct vop_symlink_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ char *a_target;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ char *name = ap->a_cnp->cn_nameptr;
+ char *link = ap->a_target;
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t zoid;
+ int len = strlen(link);
+ int error;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
+ vattr_init_mask(vap);
+
+ ZFS_ENTER(zfsvfs);
+top:
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (ENAMETOOLONG);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+
+ /*
+ * Create a new object for the symlink.
+	 * Put the link content into the bonus buffer if it will fit;
+ * otherwise, store it just like any other file data.
+ */
+ zoid = 0;
+ if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
+ if (len != 0)
+ bcopy(link, zp->z_phys + 1, len);
+ } else {
+ dmu_buf_t *dbp;
+
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+
+ /*
+		 * Nothing can access the znode yet, so no locking is needed
+ * for growing the znode's blocksize.
+ */
+ zfs_grow_blocksize(zp, len, tx);
+
+ VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
+ dmu_buf_will_dirty(dbp, tx);
+
+ ASSERT3U(len, <=, dbp->db_size);
+ bcopy(link, dbp->db_data, len);
+ dmu_buf_rele(dbp, FTAG);
+ }
+ zp->z_phys->zp_size = len;
+
+ /*
+ * Insert the new object into the directory.
+ */
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+
+ if (error == 0) {
+ zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link);
+
+ *ap->a_vpp = ZTOV(zp);
+
+ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY,
+ ap->a_cnp->cn_thread);
+ }
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ * IN: vp - vnode of symbolic link.
+ * uoip - structure to contain the link path.
+ *	uio	- structure to contain the link path.
+ *
+ * OUT: uio - structure to contain the link path.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(ap)
+ struct vop_readlink_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ uio_t *uio = ap->a_uio;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ size_t bufsz;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ bufsz = (size_t)zp->z_phys->zp_size;
+ if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
+ error = uiomove(zp->z_phys + 1,
+ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+ } else {
+ dmu_buf_t *dbp;
+ error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ error = uiomove(dbp->db_data,
+ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+ dmu_buf_rele(dbp, FTAG);
+ }
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert a new entry into directory tdvp referencing svp.
+ *
+ * IN: tdvp - Directory to contain new entry.
+ * svp - vnode of new entry.
+ * name - name of new entry.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * tdvp - ctime|mtime updated
+ * svp - ctime updated
+ */
+/* ARGSUSED */
+static int
+zfs_link(ap)
+ struct vop_link_args /* {
+ struct vnode *a_tdvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ vnode_t *svp = ap->a_vp;
+ vnode_t *tdvp = ap->a_tdvp;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ char *name = ap->a_cnp->cn_nameptr;
+ znode_t *dzp = VTOZ(tdvp);
+ znode_t *tzp, *szp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+ ASSERT(tdvp->v_type == VDIR);
+
+ ZFS_ENTER(zfsvfs);
+
+ if (svp->v_vfsp != tdvp->v_vfsp) {
+ ZFS_EXIT(zfsvfs);
+ return (EXDEV);
+ }
+
+ szp = VTOZ(svp);
+top:
+ /*
+ * We do not support links between attributes and non-attributes
+ * because of the potential security risk of creating links
+ * into "normal" file space in order to circumvent restrictions
+ * imposed in attribute space.
+ */
+ if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
+ (dzp->z_phys->zp_flags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * POSIX dictates that we return EPERM here.
+ * Better choices include ENOTSUP or EISDIR.
+ */
+ if (svp->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (EPERM);
+ }
+
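+	/*
+	 * Hard-linking a file owned by somebody else requires privilege
+	 * (checked by secpolicy_basic_link()).
+	 */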
+ if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
+ secpolicy_basic_link(cr) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EPERM);
+ }
+
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, szp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_create(dl, szp, tx, 0);
+
+ if (error == 0)
+ zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static int
+zfs_inactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ /*
+ * Destroy the vm object and flush associated pages.
+ */
+ vnode_destroy_vobject(vp);
+
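+	/*
+	 * If the file system has already been torn down by a forced
+	 * unmount, just drop the znode and the vfs reference.
+	 */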
+ rw_enter(&zfsvfs->z_um_lock, RW_READER);
+ if (zfsvfs->z_unmounted2) {
+ ASSERT(zp->z_dbuf_held == 0);
+
+ mutex_enter(&zp->z_lock);
+ VI_LOCK(vp);
+ vp->v_count = 0; /* count arrives as 1 */
+ VI_UNLOCK(vp);
+ if (zp->z_dbuf == NULL) {
+ mutex_exit(&zp->z_lock);
+ zfs_znode_free(zp);
+ } else {
+ mutex_exit(&zp->z_lock);
+ }
+ rw_exit(&zfsvfs->z_um_lock);
+ VFS_RELE(zfsvfs->z_vfs);
+ return (0);
+ }
+
+ if (zp->z_atime_dirty && zp->z_unlinked == 0) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ mutex_enter(&zp->z_lock);
+ zp->z_atime_dirty = 0;
+ mutex_exit(&zp->z_lock);
+ dmu_tx_commit(tx);
+ }
+ }
+
+ zfs_zinactive(zp);
+ rw_exit(&zfsvfs->z_um_lock);
+ return (0);
+}
+
+static int
+zfs_reclaim(ap)
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+
+ if (zp != NULL)
+ mutex_enter(&zp->z_lock);
+#if 0 /*
+ * We do it from zfs_inactive(), because after zfs_inactive() we can't
+ * VOP_WRITE() to the vnode.
+ */
+ /*
+ * Destroy the vm object and flush associated pages.
+ */
+ vnode_destroy_vobject(vp);
+#endif
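+	/* Detach the znode from the vnode and drop the extra hold. */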
+ VI_LOCK(vp);
+ vp->v_data = NULL;
+ if (zp != NULL)
+ zp->z_vnode = NULL;
+ ASSERT(vp->v_holdcnt > 1);
+ vdropl(vp);
+ if (zp != NULL)
+ mutex_exit(&zp->z_lock);
+ return (0);
+}
+
+CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
+CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
+
+static int
+zfs_fid(ap)
+ struct vop_fid_args /* {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ fid_t *fidp = (void *)ap->a_fid;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint32_t gen = (uint32_t)zp->z_phys->zp_gen;
+ uint64_t object = zp->z_id;
+ zfid_short_t *zfid;
+ int size, i;
+
+ ZFS_ENTER(zfsvfs);
+
+ size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
+ fidp->fid_len = size;
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = size;
+
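+	/* Pack the object number, least significant byte first. */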
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* Must have a non-zero generation number to distinguish from .zfs */
+ if (gen == 0)
+ gen = 1;
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+ if (size == LONG_FID_LEN) {
+ uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+ zfid_long_t *zlfid;
+
+ zlfid = (zfid_long_t *)fidp;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+ /* XXX - this should be the generation number for the objset */
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ zlfid->zf_setgen[i] = 0;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static int
+zfs_pathconf(ap)
+ struct vop_pathconf_args /* {
+ struct vnode *a_vp;
+ int a_name;
+ int *a_retval;
+ } */ *ap;
+{
+ int cmd = ap->a_name;
+ int *valp = ap->a_retval;
+#if 0
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp, *xzp;
+ zfsvfs_t *zfsvfs;
+ zfs_dirlock_t *dl;
+ int error;
+#endif
+
+ switch (cmd) {
+ case _PC_LINK_MAX:
+ *valp = INT_MAX;
+ return (0);
+
+ case _PC_FILESIZEBITS:
+ *valp = 64;
+ return (0);
+
+#if 0
+ case _PC_XATTR_EXISTS:
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+ *valp = 0;
+ error = zfs_dirent_lock(&dl, zp, "", &xzp,
+ ZXATTR | ZEXISTS | ZSHARED);
+ if (error == 0) {
+ zfs_dirent_unlock(dl);
+ if (!zfs_dirempty(xzp))
+ *valp = 1;
+ VN_RELE(ZTOV(xzp));
+ } else if (error == ENOENT) {
+ /*
+ * If there aren't extended attributes, it's the
+ * same as having zero of them.
+ */
+ error = 0;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+#endif
+
+ case _PC_ACL_EXTENDED:
+ *valp = 0; /* TODO */
+ return (0);
+
+ case _PC_MIN_HOLE_SIZE:
+ *valp = (int)SPA_MINBLOCKSIZE;
+ return (0);
+
+ default:
+ return (vop_stdpathconf(ap));
+ }
+}
+
+#ifdef TODO
+/*ARGSUSED*/
+static int
+zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ error = zfs_getacl(zp, vsecp, cr);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+#endif /* TODO */
+
+#ifdef TODO
+/*ARGSUSED*/
+static int
+zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ error = zfs_setacl(zp, vsecp, cr);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+#endif /* TODO */
+
+/*
+ * Advisory record locking support
+ */
+static int
+zfs_advlock(ap)
+ struct vop_advlock_args /* {
+ struct vnode *a_vp;
+ caddr_t a_id;
+ int a_op;
+ struct flock *a_fl;
+ int a_flags;
+ } */ *ap;
+{
+ znode_t *zp = VTOZ(ap->a_vp);
+
+ return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size));
+}
+
+struct vop_vector zfs_vnodeops;
+struct vop_vector zfs_fifoops;
+
+struct vop_vector zfs_vnodeops = {
+ .vop_default = &default_vnodeops,
+ .vop_inactive = zfs_inactive,
+ .vop_reclaim = zfs_reclaim,
+ .vop_access = zfs_access,
+#ifdef FREEBSD_NAMECACHE
+ .vop_lookup = vfs_cache_lookup,
+ .vop_cachedlookup = zfs_lookup,
+#else
+ .vop_lookup = zfs_lookup,
+#endif
+ .vop_getattr = zfs_getattr,
+ .vop_setattr = zfs_setattr,
+ .vop_create = zfs_create,
+ .vop_mknod = zfs_create,
+ .vop_mkdir = zfs_mkdir,
+ .vop_readdir = zfs_readdir,
+ .vop_fsync = zfs_fsync,
+ .vop_open = zfs_open,
+ .vop_close = zfs_close,
+ .vop_rmdir = zfs_rmdir,
+ .vop_ioctl = zfs_ioctl,
+ .vop_link = zfs_link,
+ .vop_symlink = zfs_symlink,
+ .vop_readlink = zfs_readlink,
+ .vop_read = zfs_read,
+ .vop_write = zfs_write,
+ .vop_remove = zfs_remove,
+ .vop_rename = zfs_rename,
+ .vop_advlock = zfs_advlock,
+ .vop_pathconf = zfs_pathconf,
+ .vop_bmap = VOP_EOPNOTSUPP,
+ .vop_fid = zfs_fid,
+};
+
+struct vop_vector zfs_fifoops = {
+ .vop_default = &fifo_specops,
+ .vop_fsync = VOP_PANIC,
+ .vop_access = zfs_access,
+ .vop_getattr = zfs_getattr,
+ .vop_inactive = zfs_inactive,
+ .vop_read = VOP_PANIC,
+ .vop_reclaim = zfs_reclaim,
+ .vop_setattr = zfs_setattr,
+ .vop_write = VOP_PANIC,
+ .vop_fid = zfs_fid,
+};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
new file mode 100644
index 000000000000..d2806b9af279
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -0,0 +1,1061 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/mntent.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/atomic.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_rlock.h>
+#include <sys/fs/zfs.h>
+#endif /* _KERNEL */
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+
+/*
+ * Functions needed for userland (i.e., libzpool) are not put under
+ * #ifdef _KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+struct kmem_cache *znode_cache = NULL;
+
+/*ARGSUSED*/
+static void
+znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
+{
+ znode_t *zp = user_ptr;
+ vnode_t *vp = ZTOV(zp);
+
+ mutex_enter(&zp->z_lock);
+ if (vp == NULL) {
+ mutex_exit(&zp->z_lock);
+ zfs_znode_free(zp);
+ } else if (vp->v_count == 0) {
+ ZTOV(zp) = NULL;
+ mutex_exit(&zp->z_lock);
+ vhold(vp);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ vrecycle(vp, curthread);
+ VOP_UNLOCK(vp, 0, curthread);
+ vdrop(vp);
+ zfs_znode_free(zp);
+ } else {
+ /* signal force unmount that this znode can be freed */
+ zp->z_dbuf = NULL;
+ mutex_exit(&zp->z_lock);
+ }
+}
+
+extern struct vop_vector zfs_vnodeops;
+extern struct vop_vector zfs_fifoops;
+
+/*
+ * XXX: We cannot use this function as a cache constructor, because
+ * there is one global cache for all file systems and we need
+ * to pass vfsp here, which is not possible, because argument
+ * 'cdrarg' is defined at kmem_cache_create() time.
+ */
+static int
+zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ znode_t *zp = buf;
+ vfs_t *vfsp = cdrarg;
+ int error;
+
+ if (cdrarg != NULL) {
+ error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &zp->z_vnode);
+ ASSERT(error == 0);
+ zp->z_vnode->v_data = (caddr_t)zp;
+ vhold(zp->z_vnode);
+ } else {
+ zp->z_vnode = NULL;
+ }
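+	/* Initialize the locks protecting the znode's in-core state. */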
+ mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&zp->z_range_avl, zfs_range_compare,
+ sizeof (rl_t), offsetof(rl_t, r_node));
+
+ zp->z_dbuf_held = 0;
+ zp->z_dirlocks = 0;
+ zp->z_lockf = NULL;
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *cdarg)
+{
+ znode_t *zp = buf;
+
+ ASSERT(zp->z_dirlocks == 0);
+ mutex_destroy(&zp->z_lock);
+ rw_destroy(&zp->z_map_lock);
+ rw_destroy(&zp->z_parent_lock);
+ rw_destroy(&zp->z_name_lock);
+ mutex_destroy(&zp->z_acl_lock);
+ mutex_destroy(&zp->z_range_lock);
+ avl_destroy(&zp->z_range_avl);
+
+ ASSERT(zp->z_dbuf_held == 0);
+}
+
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache
+ */
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+zfs_znode_fini(void)
+{
+ /*
+ * Cleanup zcache
+ */
+ if (znode_cache)
+ kmem_cache_destroy(znode_cache);
+ znode_cache = NULL;
+}
+
+/*
+ * zfs_init_fs - Initialize the zfsvfs struct and the file system
+ * incore "master" object. Verify version compatibility.
+ */
+int
+zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
+{
+ objset_t *os = zfsvfs->z_os;
+ uint64_t version = ZPL_VERSION;
+ int i, error;
+ dmu_object_info_t doi;
+ uint64_t fsid_guid;
+
+ *zpp = NULL;
+
+ /*
+ * XXX - hack to auto-create the pool root filesystem at
+ * the first attempted mount.
+ */
+ if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ ASSERT3U(error, ==, 0);
+ zfs_create_fs(os, cr, tx);
+ dmu_tx_commit(tx);
+ }
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1,
+ &version);
+ if (error) {
+ return (error);
+ } else if (version != ZPL_VERSION) {
+ (void) printf("Mismatched versions: File system "
+ "is version %lld on-disk format, which is "
+ "incompatible with this software version %lld!",
+ (u_longlong_t)version, ZPL_VERSION);
+ return (ENOTSUP);
+ }
+
+ /*
+ * The fsid is 64 bits, composed of an 8-bit fs type, which
+ * separates our fsid from any other filesystem types, and a
+ * 56-bit objset unique ID. The objset unique ID is unique to
+ * all objsets open on this system, provided by unique_create().
+ * The 8-bit fs type must be put in the low bits of fsid[1]
+ * because that's where other Solaris filesystems put it.
+ */
+ fsid_guid = dmu_objset_fsid_guid(os);
+ ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
+ zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
+ zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
+ zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF;
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+ &zfsvfs->z_root);
+ if (error)
+ return (error);
+ ASSERT(zfsvfs->z_root != 0);
+
+ /*
+ * Create the per mount vop tables.
+ */
+
+ /*
+	 * Initialize zget mutexes.
+ */
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
+ if (error)
+ return (error);
+ ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+ &zfsvfs->z_unlinkedobj);
+ if (error)
+ return (error);
+
+ return (0);
+}
+
+/*
+ * Define a couple of values we need available
+ * for both 64-bit and 32-bit environments.
+ */
+#ifndef NBITSMINOR64
+#define NBITSMINOR64 32
+#endif
+#ifndef MAXMAJ64
+#define MAXMAJ64 0xffffffffUL
+#endif
+#ifndef MAXMIN64
+#define MAXMIN64 0xffffffffUL
+#endif
+
+/*
+ * Create special expldev for ZFS private use.
+ * Can't use standard expldev since it doesn't do
+ * what we want. The standard expldev() takes a
+ * dev32_t in LP64 and expands it to a long dev_t.
+ * We need an interface that takes a dev32_t in ILP32
+ * and expands it to a long dev_t.
+ */
+static uint64_t
+zfs_expldev(dev_t dev)
+{
+ return ((uint64_t)0);
+}
+/*
+ * Special cmpldev for ZFS private use.
+ * Can't use standard cmpldev since it takes
+ * a long dev_t and compresses it to dev32_t in
+ * LP64. We need to do a compaction of a long dev_t
+ * to a dev32_t in ILP32.
+ */
+dev_t
+zfs_cmpldev(uint64_t dev)
+{
+ return ((dev_t)0);
+}
+
+/*
+ * Construct a new znode/vnode and initialize.
+ *
+ * This does not do a call to dmu_set_user(); that is
+ * up to the caller to do, in case you don't want to
+ * return the znode.
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
+{
+ znode_t *zp;
+ vnode_t *vp;
+ int error;
+
+ zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0);
+
+ ASSERT(zp->z_dirlocks == NULL);
+
+ zp->z_phys = db->db_data;
+ zp->z_zfsvfs = zfsvfs;
+ zp->z_unlinked = 0;
+ zp->z_atime_dirty = 0;
+ zp->z_dbuf_held = 0;
+ zp->z_mapcnt = 0;
+ zp->z_last_itx = 0;
+ zp->z_dbuf = db;
+ zp->z_id = obj_num;
+ zp->z_blksz = blksz;
+ zp->z_seq = 0x7A4653;
+ zp->z_sync_cnt = 0;
+
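+	/*
+	 * Put the znode on the per-filesystem list so that a forced
+	 * unmount can find and release it.
+	 */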
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ vp = ZTOV(zp);
+ if (vp == NULL)
+ return (zp);
+
+ error = insmntque(vp, zfsvfs->z_vfs);
+ KASSERT(error == 0, ("insmntque() failed: error %d", error));
+
+ vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
+ switch (vp->v_type) {
+ case VDIR:
+ zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
+ break;
+ case VFIFO:
+ vp->v_op = &zfs_fifoops;
+ break;
+ }
+
+ return (zp);
+}
+
+static void
+zfs_znode_dmu_init(znode_t *zp)
+{
+ znode_t *nzp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_buf_t *db = zp->z_dbuf;
+
+ mutex_enter(&zp->z_lock);
+
+ nzp = dmu_buf_set_user(db, zp, &zp->z_phys, znode_pageout_func);
+
+ /*
+	 * There should be no concurrent zgets on this object.
+ */
+ ASSERT3P(nzp, ==, NULL);
+
+ /*
+ * Slap on VROOT if we are the root znode
+ */
+ if (zp->z_id == zfsvfs->z_root) {
+ ZTOV(zp)->v_flag |= VROOT;
+ }
+
+ ASSERT(zp->z_dbuf_held == 0);
+ zp->z_dbuf_held = 1;
+ VFS_HOLD(zfsvfs->z_vfs);
+ mutex_exit(&zp->z_lock);
+}
+
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ * IN: dzp - parent directory for new znode
+ * vap - file attributes for new znode
+ * tx - dmu transaction id for zap operations
+ * cr - credentials of caller
+ * flag - flags:
+ * IS_ROOT_NODE - new object will be root
+ * IS_XATTR - new object is an attribute
+ * IS_REPLAY - intent log replay
+ *
+ * OUT:	oid	- ID of created object
+ *	zpp	- new znode, if requested
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
+ uint_t flag, znode_t **zpp, int bonuslen)
+{
+ dmu_buf_t *dbp;
+ znode_phys_t *pzp;
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ timestruc_t now;
+ uint64_t gen;
+ int err;
+
+ ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
+
+ if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
+ *oid = vap->va_nodeid;
+ flag |= IS_REPLAY;
+ now = vap->va_ctime; /* see zfs_replay_create() */
+ gen = vap->va_nblocks; /* ditto */
+ } else {
+ *oid = 0;
+ gethrestime(&now);
+ gen = dmu_tx_get_txg(tx);
+ }
+
+ /*
+ * Create a new DMU object.
+ */
+ /*
+ * There's currently no mechanism for pre-reading the blocks that will
+	 * be needed to allocate a new object, so we accept the small chance
+ * that there will be an i/o error and we will fail one of the
+ * assertions below.
+ */
+ if (vap->va_type == VDIR) {
+ if (flag & IS_REPLAY) {
+ err = zap_create_claim(zfsvfs->z_os, *oid,
+ DMU_OT_DIRECTORY_CONTENTS,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ ASSERT3U(err, ==, 0);
+ } else {
+ *oid = zap_create(zfsvfs->z_os,
+ DMU_OT_DIRECTORY_CONTENTS,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ }
+ } else {
+ if (flag & IS_REPLAY) {
+ err = dmu_object_claim(zfsvfs->z_os, *oid,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ ASSERT3U(err, ==, 0);
+ } else {
+ *oid = dmu_object_alloc(zfsvfs->z_os,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ }
+ }
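+	/* Hold the bonus buffer (the znode_phys_t) and mark it dirty. */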
+ VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
+ dmu_buf_will_dirty(dbp, tx);
+
+ /*
+ * Initialize the znode physical data to zero.
+ */
+ ASSERT(dbp->db_size >= sizeof (znode_phys_t));
+ bzero(dbp->db_data, dbp->db_size);
+ pzp = dbp->db_data;
+
+ /*
+ * If this is the root, fix up the half-initialized parent pointer
+ * to reference the just-allocated physical data area.
+ */
+ if (flag & IS_ROOT_NODE) {
+ dzp->z_phys = pzp;
+ dzp->z_id = *oid;
+ }
+
+ /*
+ * If parent is an xattr, so am I.
+ */
+ if (dzp->z_phys->zp_flags & ZFS_XATTR)
+ flag |= IS_XATTR;
+
+ if (vap->va_type == VBLK || vap->va_type == VCHR) {
+ pzp->zp_rdev = zfs_expldev(vap->va_rdev);
+ }
+
+ if (vap->va_type == VDIR) {
+ pzp->zp_size = 2; /* contents ("." and "..") */
+ pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ }
+
+ pzp->zp_parent = dzp->z_id;
+ if (flag & IS_XATTR)
+ pzp->zp_flags |= ZFS_XATTR;
+
+ pzp->zp_gen = gen;
+
+ ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
+ ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
+
+ if (vap->va_mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+ } else {
+ ZFS_TIME_ENCODE(&now, pzp->zp_atime);
+ }
+
+ if (vap->va_mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+ } else {
+ ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
+ }
+
+ pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);
+
+ zfs_perm_init(zp, dzp, flag, vap, tx, cr);
+
+ if (zpp) {
+ kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);
+
+ mutex_enter(hash_mtx);
+ zfs_znode_dmu_init(zp);
+ mutex_exit(hash_mtx);
+
+ *zpp = zp;
+ } else {
+ if (ZTOV(zp) != NULL)
+ ZTOV(zp)->v_count = 0;
+ dmu_buf_rele(dbp, NULL);
+ zfs_znode_free(zp);
+ }
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ znode_t *zp;
+ vnode_t *vp;
+ int err;
+
+ *zpp = NULL;
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ doi.doi_bonus_size < sizeof (znode_phys_t)) {
+ dmu_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (EINVAL);
+ }
+
+ ASSERT(db->db_object == obj_num);
+ ASSERT(db->db_offset == -1);
+ ASSERT(db->db_data != NULL);
+
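+	/* See if an in-core znode is already attached to this buffer. */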
+ zp = dmu_buf_get_user(db);
+
+ if (zp != NULL) {
+ mutex_enter(&zp->z_lock);
+
+ ASSERT3U(zp->z_id, ==, obj_num);
+ if (zp->z_unlinked) {
+ dmu_buf_rele(db, NULL);
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (ENOENT);
+ } else if (zp->z_dbuf_held) {
+ dmu_buf_rele(db, NULL);
+ } else {
+ zp->z_dbuf_held = 1;
+ VFS_HOLD(zfsvfs->z_vfs);
+ }
+
+ if (ZTOV(zp) != NULL)
+ VN_HOLD(ZTOV(zp));
+ else {
+ err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops,
+ &zp->z_vnode);
+ ASSERT(err == 0);
+ vp = ZTOV(zp);
+ vp->v_data = (caddr_t)zp;
+ vhold(vp);
+ vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
+ if (vp->v_type == VDIR)
+ zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
+ err = insmntque(vp, zfsvfs->z_vfs);
+ KASSERT(err == 0, ("insmntque() failed: error %d", err));
+ }
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ *zpp = zp;
+ return (0);
+ }
+
+ /*
+	 * Not found; create a new znode/vnode.
+ */
+ zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
+ ASSERT3U(zp->z_id, ==, obj_num);
+ zfs_znode_dmu_init(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ *zpp = zp;
+ return (0);
+}
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
+ if (zp->z_phys->zp_acl.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ zp->z_phys->zp_acl.z_acl_extern_obj, tx);
+ ASSERT3U(error, ==, 0);
+ }
+ error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
+ ASSERT3U(error, ==, 0);
+ zp->z_dbuf_held = 0;
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
+ dmu_buf_rele(zp->z_dbuf, NULL);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+ vnode_t *vp = ZTOV(zp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t z_id = zp->z_id;
+
+ ASSERT(zp->z_dbuf_held && zp->z_phys);
+
+ /*
+	 * Don't allow a zfs_zget() while we're trying to release this znode.
+ */
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
+
+ mutex_enter(&zp->z_lock);
+ VI_LOCK(vp);
+ if (vp->v_count > 0) {
+ /*
+ * If the hold count is greater than zero, somebody has
+ * obtained a new reference on this znode while we were
+ * processing it here, so we are done.
+ */
+ VI_UNLOCK(vp);
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ return;
+ }
+ VI_UNLOCK(vp);
+
+ /*
+ * If this was the last reference to a file with no links,
+ * remove the file from the file system.
+ */
+ if (zp->z_unlinked) {
+ ZTOV(zp) = NULL;
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ ASSERT(vp->v_count == 0);
+ vrecycle(vp, curthread);
+ zfs_rmnode(zp);
+ VFS_RELE(zfsvfs->z_vfs);
+ return;
+ }
+ ASSERT(zp->z_phys);
+ ASSERT(zp->z_dbuf_held);
+
+ zp->z_dbuf_held = 0;
+ mutex_exit(&zp->z_lock);
+ dmu_buf_rele(zp->z_dbuf, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ VFS_RELE(zfsvfs->z_vfs);
+}
+
+/*
+ * FreeBSD: Should be called from ->vop_reclaim().
+ */
+void
+zfs_znode_free(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ kmem_cache_free(znode_cache, zp);
+}
+
+void
+zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+{
+ timestruc_t now;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+
+ gethrestime(&now);
+
+ if (tx) {
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ zp->z_atime_dirty = 0;
+ zp->z_seq++;
+ } else {
+ zp->z_atime_dirty = 1;
+ }
+
+ if (flag & AT_ATIME)
+ ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
+
+ if (flag & AT_MTIME)
+ ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
+
+ if (flag & AT_CTIME)
+ ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
+}
+
+/*
+ * Update the requested znode timestamps with the current time.
+ * If we are in a transaction, then go ahead and mark the znode
+ * dirty in the transaction so the timestamps will go to disk.
+ * Otherwise, the timestamps will be pushed the next time the znode
+ * is updated in a transaction, or when this znode eventually goes inactive.
+ *
+ * Why is this OK?
+ * 1 - Only the ACCESS time is ever updated outside of a transaction.
+ * 2 - Multiple consecutive updates will be collapsed into a single
+ * znode update by the transaction grouping semantics of the DMU.
+ */
+void
+zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+{
+ mutex_enter(&zp->z_lock);
+ zfs_time_stamper_locked(zp, flag, tx);
+ mutex_exit(&zp->z_lock);
+}
+
+/*
+ * Grow the block size for a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * size - requested block size
+ * tx - open transaction.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+ int error;
+ u_longlong_t dummy;
+
+ if (size <= zp->z_blksz)
+ return;
+ /*
+ * If the file size is already greater than the current blocksize,
+ * we will not grow. If there is more than one block in a file,
+ * the blocksize cannot change.
+ */
+ if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
+ return;
+
+ error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
+ size, 0, tx);
+ if (error == ENOTSUP)
+ return;
+ ASSERT3U(error, ==, 0);
+
+ /* What blocksize did we actually get? */
+ dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
+}
+
+/*
+ * Free space in a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of section to free.
+ * len - length of section to free (0 => to EOF).
+ * flag - current file open mode flags.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+ vnode_t *vp = ZTOV(zp);
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ rl_t *rl;
+ uint64_t end = off + len;
+ uint64_t size, new_blksz;
+ int error;
+
+ if (ZTOV(zp)->v_type == VFIFO)
+ return (0);
+
+ /*
+ * If we will change zp_size then lock the whole file,
+ * otherwise just lock the range being freed.
+ */
+ if (len == 0 || off + len > zp->z_phys->zp_size) {
+ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
+ } else {
+ rl = zfs_range_lock(zp, off, len, RL_WRITER);
+ /* recheck, in case zp_size changed */
+ if (off + len > zp->z_phys->zp_size) {
+ /* lost race: file size changed, lock whole file */
+ zfs_range_unlock(rl);
+ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
+ }
+ }
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ size = zp->z_phys->zp_size;
+ if (len == 0 && size == off) {
+ zfs_range_unlock(rl);
+ return (0);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ new_blksz = 0;
+ if (end > size &&
+ (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+ /*
+ * We are growing the file past the current block size.
+ */
+ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
+ } else {
+ new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+ }
+ dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
+ } else if (off < size) {
+ /*
+ * If len == 0, we are truncating the file.
+ */
+ dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
+ }
+
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ zfs_range_unlock(rl);
+ return (error);
+ }
+
+ if (new_blksz)
+ zfs_grow_blocksize(zp, new_blksz, tx);
+
+ if (end > size || len == 0)
+ zp->z_phys->zp_size = end;
+
+ if (off < size) {
+ objset_t *os = zfsvfs->z_os;
+ uint64_t rlen = len;
+
+ if (len == 0)
+ rlen = -1;
+ else if (end > size)
+ rlen = size - off;
+ VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
+ }
+
+ if (log) {
+ zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+ }
+
+ zfs_range_unlock(rl);
+
+ dmu_tx_commit(tx);
+
+ /*
+ * Clear any mapped pages in the truncated region. This has to
+ * happen outside of the transaction to avoid the possibility of
+ * a deadlock with someone trying to push a page that we are
+ * about to invalidate.
+ */
+ rw_enter(&zp->z_map_lock, RW_WRITER);
+ if (end > size)
+ vnode_pager_setsize(vp, end);
+ else if (len == 0) {
+#if 0
+ error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE);
+#else
+ error = vinvalbuf(vp, V_SAVE, curthread, 0, 0);
+ vnode_pager_setsize(vp, end);
+#endif
+ }
+ rw_exit(&zp->z_map_lock);
+
+ return (0);
+}
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
+{
+ zfsvfs_t zfsvfs;
+ uint64_t moid, doid, roid = 0;
+ uint64_t version = ZPL_VERSION;
+ int error;
+ znode_t *rootzp = NULL;
+ vattr_t vattr;
+
+ /*
+ * First attempt to create master node.
+ */
+ /*
+ * In an empty objset, there are no blocks to read and thus
+ * there can be no i/o errors (which we assert below).
+ */
+ moid = MASTER_NODE_OBJ;
+ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Set starting attributes.
+ */
+
+ error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create a delete queue.
+ */
+ doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create root znode. Create minimal znode/vnode/zfsvfs
+ * to allow zfs_mknode to work.
+ */
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0755;
+ vattr.va_uid = UID_ROOT;
+ vattr.va_gid = GID_WHEEL;
+
+ rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ zfs_znode_cache_constructor(rootzp, NULL, 0);
+ rootzp->z_zfsvfs = &zfsvfs;
+ rootzp->z_unlinked = 0;
+ rootzp->z_atime_dirty = 0;
+ rootzp->z_dbuf_held = 0;
+
+ bzero(&zfsvfs, sizeof (zfsvfs_t));
+
+ zfsvfs.z_os = os;
+ zfsvfs.z_assign = TXG_NOWAIT;
+ zfsvfs.z_parent = &zfsvfs;
+
+ mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+
+ zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
+ ASSERT3U(rootzp->z_id, ==, roid);
+ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
+ ASSERT(error == 0);
+
+ kmem_cache_free(znode_cache, rootzp);
+}
+#endif /* _KERNEL */
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
+{
+ dmu_buf_t *db;
+ dmu_object_info_t doi;
+ znode_phys_t *zp;
+ int error;
+
+ if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
+ return (error);
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ doi.doi_bonus_size < sizeof (znode_phys_t)) {
+ dmu_buf_rele(db, FTAG);
+ return (EINVAL);
+ }
+
+ zp = db->db_data;
+ *pobjp = zp->zp_parent;
+ *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
+ S_ISDIR(zp->zp_mode);
+ dmu_buf_rele(db, FTAG);
+
+ return (0);
+}
+
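+/*
+ * Given an object number, reconstruct its full path by walking zp_parent
+ * links toward the root, writing each "/component" backwards from the end
+ * of the caller-supplied buffer and finally sliding the result to the
+ * front. For example (with purely illustrative names), an object whose
+ * lookups yield "b" and then "a" ends up as the string "/a/b" in buf.
+ */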
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+ char *path = buf + len - 1;
+ int error;
+
+ *path = '\0';
+
+ for (;;) {
+ uint64_t pobj;
+ char component[MAXNAMELEN + 2];
+ size_t complen;
+ int is_xattrdir;
+
+ if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
+ &is_xattrdir)) != 0)
+ break;
+
+ if (pobj == obj) {
+ if (path[0] != '/')
+ *--path = '/';
+ break;
+ }
+
+ component[0] = '/';
+ if (is_xattrdir) {
+ (void) sprintf(component + 1, "<xattrdir>");
+ } else {
+ error = zap_value_search(osp, pobj, obj, component + 1);
+ if (error != 0)
+ break;
+ }
+
+ complen = strlen(component);
+ path -= complen;
+ ASSERT(path >= buf);
+ bcopy(component, path, complen);
+ obj = pobj;
+ }
+
+ if (error == 0)
+ (void) memmove(buf, path, buf + len - path);
+ return (error);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c
new file mode 100644
index 000000000000..6b52b55f5a05
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c
@@ -0,0 +1,1607 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/arc.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vdev.h>
+#include <sys/dmu_tx.h>
+
+/*
+ * The zfs intent log (ZIL) saves transaction records of system calls
+ * that change the file system in memory with enough information
+ * to be able to replay them. These are stored in memory until
+ * either the DMU transaction group (txg) commits them to the stable pool
+ * and they can be discarded, or they are flushed to the stable log
+ * (also in the pool) due to an fsync, O_DSYNC, or other synchronous
+ * requirement. In the event of a panic or power failure, those log
+ * records (transactions) are replayed.
+ *
+ * There is one ZIL per file system. Its on-disk (pool) format consists
+ * of 3 parts:
+ *
+ * - ZIL header
+ * - ZIL blocks
+ * - ZIL records
+ *
+ * A log record holds a system call transaction. Log blocks can
+ * hold many log records and the blocks are chained together.
+ * Each ZIL block contains a block pointer (blkptr_t) to the next
+ * ZIL block in the chain. The ZIL header points to the first
+ * block in the chain. Note there is not a fixed place in the pool
+ * to hold blocks. They are dynamically allocated and freed as
+ * needed from the blocks available. The sketch below illustrates the
+ * ZIL structure.
+ */
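+
+/*
+ * The ZIL structure, roughly (field names as used in this file):
+ *
+ *	zil_header_t            ZIL block              ZIL block
+ *	+---------------+      +----------------+     +----------------+
+ *	| ...           |      | log records    |     | log records    |
+ *	| zh_log  ------+----->| ...            |  +->| ...            |
+ *	+---------------+      | trailer:       |  |  | trailer:       |
+ *	                       |  zit_next_blk -+--+  |  zit_next_blk  |
+ *	                       +----------------+     +----------------+
+ */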
+
+/*
+ * This global ZIL switch affects all pools
+ */
+int zil_disable = 0; /* disable intent logging */
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.zil_disable", &zil_disable);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_disable, CTLFLAG_RDTUN, &zil_disable, 0,
+ "Disable ZFS Intent Log (ZIL)");
+
+/*
+ * Tunable parameter for debugging or performance analysis. Setting
+ * zfs_nocacheflush will cause corruption on power loss if a volatile
+ * out-of-order write cache is enabled.
+ */
+boolean_t zfs_nocacheflush = B_FALSE;
+TUNABLE_INT("vfs.zfs.cache_flush_disable", &zfs_nocacheflush);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
+ &zfs_nocacheflush, 0, "Disable cache flush");
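+
+/*
+ * Both knobs above are boot-time tunables (CTLFLAG_RDTUN). As an
+ * illustration only, an administrator who is willing to give up the
+ * synchronous-write guarantees described above could disable the ZIL by
+ * adding the following to /boot/loader.conf:
+ *
+ *	vfs.zfs.zil_disable="1"
+ */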
+
+static kmem_cache_t *zil_lwb_cache;
+
+static int
+zil_dva_compare(const void *x1, const void *x2)
+{
+ const dva_t *dva1 = x1;
+ const dva_t *dva2 = x2;
+
+ if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
+ return (-1);
+ if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
+ return (1);
+
+ if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
+ return (-1);
+ if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
+ return (1);
+
+ return (0);
+}
+
+static void
+zil_dva_tree_init(avl_tree_t *t)
+{
+ avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
+ offsetof(zil_dva_node_t, zn_node));
+}
+
+static void
+zil_dva_tree_fini(avl_tree_t *t)
+{
+ zil_dva_node_t *zn;
+ void *cookie = NULL;
+
+ while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(zn, sizeof (zil_dva_node_t));
+
+ avl_destroy(t);
+}
+
+static int
+zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
+{
+ zil_dva_node_t *zn;
+ avl_index_t where;
+
+ if (avl_find(t, dva, &where) != NULL)
+ return (EEXIST);
+
+ zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
+ zn->zn_dva = *dva;
+ avl_insert(t, zn, where);
+
+ return (0);
+}
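+
+/*
+ * The DVA tree above is used while walking a log chain to remember which
+ * block addresses have already been processed; a second attempt to add the
+ * same DVA returns EEXIST, and the claim/free callbacks below simply skip
+ * such duplicates.
+ */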
+
+static zil_header_t *
+zil_header_in_syncing_context(zilog_t *zilog)
+{
+ return ((zil_header_t *)zilog->zl_header);
+}
+
+static void
+zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
+{
+ zio_cksum_t *zc = &bp->blk_cksum;
+
+ zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
+ zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
+ zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
+ zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
+}
+
+/*
+ * Read a log block, make sure it's valid, and byteswap it if necessary.
+ */
+static int
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
+{
+ blkptr_t blk = *bp;
+ zbookmark_t zb;
+ uint32_t aflags = ARC_WAIT;
+ int error;
+
+ zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+ *abufpp = NULL;
+
+ error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array,
+ arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
+
+ if (error == 0) {
+ char *data = (*abufpp)->b_data;
+ uint64_t blksz = BP_GET_LSIZE(bp);
+ zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
+ zio_cksum_t cksum = bp->blk_cksum;
+
+ /*
+ * Sequence numbers should be... sequential. The checksum
+ * verifier for the next block should be bp's checksum plus 1.
+ */
+ cksum.zc_word[ZIL_ZC_SEQ]++;
+
+ if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)))
+ error = ESTALE;
+ else if (BP_IS_HOLE(&ztp->zit_next_blk))
+ error = ENOENT;
+ else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))
+ error = EOVERFLOW;
+
+ if (error) {
+ VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
+ *abufpp = NULL;
+ }
+ }
+
+ dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
+
+ return (error);
+}
+
+/*
+ * Parse the intent log, and call parse_func for each valid record within.
+ * Return the highest sequence number.
+ */
+uint64_t
+zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ uint64_t claim_seq = zh->zh_claim_seq;
+ uint64_t seq = 0;
+ uint64_t max_seq = 0;
+ blkptr_t blk = zh->zh_log;
+ arc_buf_t *abuf;
+ char *lrbuf, *lrp;
+ zil_trailer_t *ztp;
+ int reclen, error;
+
+ if (BP_IS_HOLE(&blk))
+ return (max_seq);
+
+ /*
+ * Starting at the block pointed to by zh_log we read the log chain.
+ * For each block in the chain we strongly check that block to
+ * ensure its validity. We stop when an invalid block is found.
+ * For each block pointer in the chain we call parse_blk_func().
+ * For each record in each valid block we call parse_lr_func().
+ * If the log has been claimed, stop if we encounter a sequence
+ * number greater than the highest claimed sequence number.
+ */
+ zil_dva_tree_init(&zilog->zl_dva_tree);
+ for (;;) {
+ seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+ if (claim_seq != 0 && seq > claim_seq)
+ break;
+
+ ASSERT(max_seq < seq);
+ max_seq = seq;
+
+ error = zil_read_log_block(zilog, &blk, &abuf);
+
+ if (parse_blk_func != NULL)
+ parse_blk_func(zilog, &blk, arg, txg);
+
+ if (error)
+ break;
+
+ lrbuf = abuf->b_data;
+ ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
+ blk = ztp->zit_next_blk;
+
+ if (parse_lr_func == NULL) {
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ continue;
+ }
+
+ for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
+ lr_t *lr = (lr_t *)lrp;
+ reclen = lr->lrc_reclen;
+ ASSERT3U(reclen, >=, sizeof (lr_t));
+ parse_lr_func(zilog, lr, arg, txg);
+ }
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ }
+ zil_dva_tree_fini(&zilog->zl_dva_tree);
+
+ return (max_seq);
+}
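+
+/*
+ * For illustration, the claim path below drives this walker roughly as
+ *
+ *	zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
+ *	    zil_claim_log_record, tx, first_txg);
+ *
+ * where the block callback claims each log block and the record callback
+ * claims any block pointers embedded in TX_WRITE records.
+ */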
+
+/* ARGSUSED */
+static void
+zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+{
+ spa_t *spa = zilog->zl_spa;
+ int err;
+
+ /*
+ * Claim log block if not already committed and not already claimed.
+ */
+ if (bp->blk_birth >= first_txg &&
+ zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
+ err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL));
+ ASSERT(err == 0);
+ }
+}
+
+static void
+zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+ if (lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
+ }
+}
+
+/* ARGSUSED */
+static void
+zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
+{
+ zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
+}
+
+static void
+zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
+{
+ /*
+ * If we previously claimed it, we need to free it.
+ */
+ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ if (bp->blk_birth >= claim_txg &&
+ !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
+ (void) arc_free(NULL, zilog->zl_spa,
+ dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
+ }
+ }
+}
+
+/*
+ * Create an on-disk intent log.
+ */
+static void
+zil_create(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ lwb_t *lwb;
+ uint64_t txg = 0;
+ dmu_tx_t *tx = NULL;
+ blkptr_t blk;
+ int error = 0;
+
+ /*
+ * Wait for any previous destroy to complete.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+ ASSERT(zh->zh_claim_txg == 0);
+ ASSERT(zh->zh_replay_seq == 0);
+
+ blk = zh->zh_log;
+
+ /*
+ * If we don't already have an initial log block, allocate one now.
+ */
+ if (BP_IS_HOLE(&blk)) {
+ tx = dmu_tx_create(zilog->zl_os);
+ (void) dmu_tx_assign(tx, TXG_WAIT);
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
+ NULL, txg);
+
+ if (error == 0)
+ zil_init_log_chain(zilog, &blk);
+ }
+
+ /*
+ * Allocate a log write buffer (lwb) for the first log block.
+ */
+ if (error == 0) {
+ lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ lwb->lwb_zilog = zilog;
+ lwb->lwb_blk = blk;
+ lwb->lwb_nused = 0;
+ lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
+ lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
+ lwb->lwb_max_txg = txg;
+ lwb->lwb_zio = NULL;
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_lwb_list, lwb);
+ mutex_exit(&zilog->zl_lock);
+ }
+
+ /*
+ * If we just allocated the first log block, commit our transaction
+ * and wait for zil_sync() to stuff the block pointer into zh_log.
+ * (zh is part of the MOS, so we cannot modify it in open context.)
+ */
+ if (tx != NULL) {
+ dmu_tx_commit(tx);
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ }
+
+ ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+}
+
+/*
+ * In one tx, free all log blocks and clear the log header.
+ * If keep_first is set, then we're replaying a log with no content.
+ * We want to keep the first block, however, so that the first
+ * synchronous transaction doesn't require a txg_wait_synced()
+ * in zil_create(). We don't need to txg_wait_synced() here either
+ * when keep_first is set, because both zil_create() and zil_destroy()
+ * will wait for any in-progress destroys to complete.
+ */
+void
+zil_destroy(zilog_t *zilog, boolean_t keep_first)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ lwb_t *lwb;
+ dmu_tx_t *tx;
+ uint64_t txg;
+
+ /*
+ * Wait for any previous destroy to complete.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+ if (BP_IS_HOLE(&zh->zh_log))
+ return;
+
+ tx = dmu_tx_create(zilog->zl_os);
+ (void) dmu_tx_assign(tx, TXG_WAIT);
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&zilog->zl_lock);
+
+ ASSERT3U(zilog->zl_destroy_txg, <, txg);
+ zilog->zl_destroy_txg = txg;
+ zilog->zl_keep_first = keep_first;
+
+ if (!list_is_empty(&zilog->zl_lwb_list)) {
+ ASSERT(zh->zh_claim_txg == 0);
+ ASSERT(!keep_first);
+ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+ list_remove(&zilog->zl_lwb_list, lwb);
+ if (lwb->lwb_buf != NULL)
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+ kmem_cache_free(zil_lwb_cache, lwb);
+ }
+ } else {
+ if (!keep_first) {
+ (void) zil_parse(zilog, zil_free_log_block,
+ zil_free_log_record, tx, zh->zh_claim_txg);
+ }
+ }
+ mutex_exit(&zilog->zl_lock);
+
+ dmu_tx_commit(tx);
+
+ if (keep_first) /* no need to wait in this case */
+ return;
+
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ ASSERT(BP_IS_HOLE(&zh->zh_log));
+}
+
+int
+zil_claim(char *osname, void *txarg)
+{
+ dmu_tx_t *tx = txarg;
+ uint64_t first_txg = dmu_tx_get_txg(tx);
+ zilog_t *zilog;
+ zil_header_t *zh;
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os);
+ if (error) {
+ cmn_err(CE_WARN, "can't process intent log for %s", osname);
+ return (0);
+ }
+
+ zilog = dmu_objset_zil(os);
+ zh = zil_header_in_syncing_context(zilog);
+
+ /*
+ * Claim all log blocks if we haven't already done so, and remember
+ * the highest claimed sequence number. This ensures that if we can
+ * read only part of the log now (e.g. due to a missing device),
+ * but we can read the entire log later, we will not try to replay
+ * or destroy beyond the last block we successfully claimed.
+ */
+ ASSERT3U(zh->zh_claim_txg, <=, first_txg);
+ if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
+ zh->zh_claim_txg = first_txg;
+ zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
+ zil_claim_log_record, tx, first_txg);
+ dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ }
+
+ ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
+ dmu_objset_close(os);
+ return (0);
+}
+
+void
+zil_add_vdev(zilog_t *zilog, uint64_t vdev)
+{
+ zil_vdev_t *zv, *new;
+ uint64_t bmap_sz = sizeof (zilog->zl_vdev_bmap) << 3;
+ uchar_t *cp;
+
+ if (zfs_nocacheflush)
+ return;
+
+ if (vdev < bmap_sz) {
+ cp = zilog->zl_vdev_bmap + (vdev / 8);
+ atomic_or_8(cp, 1 << (vdev % 8));
+ } else {
+ /*
+ * insert into ordered list
+ */
+ mutex_enter(&zilog->zl_lock);
+ for (zv = list_head(&zilog->zl_vdev_list); zv != NULL;
+ zv = list_next(&zilog->zl_vdev_list, zv)) {
+ if (zv->vdev == vdev) {
+ /* duplicate found - just return */
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ if (zv->vdev > vdev) {
+ /* insert before this entry */
+ new = kmem_alloc(sizeof (zil_vdev_t),
+ KM_SLEEP);
+ new->vdev = vdev;
+ list_insert_before(&zilog->zl_vdev_list,
+ zv, new);
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ }
+ /* ran off end of list, insert at the end */
+ ASSERT(zv == NULL);
+ new = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
+ new->vdev = vdev;
+ list_insert_tail(&zilog->zl_vdev_list, new);
+ mutex_exit(&zilog->zl_lock);
+ }
+}
+
+/* start an async flush of the write cache for this vdev */
+void
+zil_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio)
+{
+ vdev_t *vd;
+
+ if (*zio == NULL)
+ *zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ vd = vdev_lookup_top(spa, vdev);
+ ASSERT(vd);
+
+ (void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+}
+
+void
+zil_flush_vdevs(zilog_t *zilog)
+{
+ zil_vdev_t *zv;
+ zio_t *zio = NULL;
+ spa_t *spa = zilog->zl_spa;
+ uint64_t vdev;
+ uint8_t b;
+ int i, j;
+
+ ASSERT(zilog->zl_writer);
+
+ for (i = 0; i < sizeof (zilog->zl_vdev_bmap); i++) {
+ b = zilog->zl_vdev_bmap[i];
+ if (b == 0)
+ continue;
+ for (j = 0; j < 8; j++) {
+ if (b & (1 << j)) {
+ vdev = (i << 3) + j;
+ zil_flush_vdev(spa, vdev, &zio);
+ }
+ }
+ zilog->zl_vdev_bmap[i] = 0;
+ }
+
+ while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
+ zil_flush_vdev(spa, zv->vdev, &zio);
+ list_remove(&zilog->zl_vdev_list, zv);
+ kmem_free(zv, sizeof (zil_vdev_t));
+ }
+ /*
+ * Wait for all the flushes to complete. Not all devices actually
+ * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
+ */
+ if (zio)
+ (void) zio_wait(zio);
+}
+
+/*
+ * Function called when a log block write completes
+ */
+static void
+zil_lwb_write_done(zio_t *zio)
+{
+ lwb_t *lwb = zio->io_private;
+ zilog_t *zilog = lwb->lwb_zilog;
+
+ /*
+ * Now that we've written this log block, we have a stable pointer
+ * to the next block in the chain, so it's OK to let the txg in
+ * which we allocated the next block sync.
+ */
+ txg_rele_to_sync(&lwb->lwb_txgh);
+
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ mutex_enter(&zilog->zl_lock);
+ lwb->lwb_buf = NULL;
+ if (zio->io_error) {
+ zilog->zl_log_error = B_TRUE;
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ mutex_exit(&zilog->zl_lock);
+}
+
+/*
+ * Initialize the io for a log block.
+ *
+ * Note, we should not initialize the IO until we are about
+ * to use it, since zio_rewrite() does a spa_config_enter().
+ */
+static void
+zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
+{
+ zbookmark_t zb;
+
+ zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+ if (zilog->zl_root_zio == NULL) {
+ zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ }
+ if (lwb->lwb_zio == NULL) {
+ lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
+ ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf,
+ lwb->lwb_sz, zil_lwb_write_done, lwb,
+ ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ }
+}
+
+/*
+ * Start a log block write and advance to the next log block.
+ * Calls are serialized.
+ */
+static lwb_t *
+zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
+{
+ lwb_t *nlwb;
+ zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
+ spa_t *spa = zilog->zl_spa;
+ blkptr_t *bp = &ztp->zit_next_blk;
+ uint64_t txg;
+ uint64_t zil_blksz;
+ int error;
+
+ ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
+
+ /*
+ * Allocate the next block and save its address in this block
+ * before writing it in order to establish the log chain.
+ * Note that if the allocation of nlwb synced before we wrote
+ * the block that points at it (lwb), we'd leak it if we crashed.
+ * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
+ */
+ txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
+ txg_rele_to_quiesce(&lwb->lwb_txgh);
+
+ /*
+ * Pick a ZIL blocksize. We request a size that is the
+ * maximum of the previous used size, the current used size and
+ * the amount waiting in the queue.
+ */
+ zil_blksz = MAX(zilog->zl_prev_used,
+ zilog->zl_cur_used + sizeof (*ztp));
+ zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
+ zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
+ if (zil_blksz > ZIL_MAX_BLKSZ)
+ zil_blksz = ZIL_MAX_BLKSZ;
+
+ BP_ZERO(bp);
+ /* pass the old blkptr in order to spread log blocks across devs */
+ error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
+ if (error) {
+ dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);
+
+ /*
+ * We dirty the dataset to ensure that zil_sync() will
+ * be called to remove this lwb from our zl_lwb_list.
+ * Failing to do so, may leave an lwb with a NULL lwb_buf
+ * hanging around on the zl_lwb_list.
+ */
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ dmu_tx_commit(tx);
+
+ /*
+ * We've just experienced an allocation failure, so we
+ * terminate the current lwb and send it on its way.
+ */
+ ztp->zit_pad = 0;
+ ztp->zit_nused = lwb->lwb_nused;
+ ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
+ zio_nowait(lwb->lwb_zio);
+
+ /*
+ * By returning NULL we cause the caller to call txg_wait_synced().
+ */
+ return (NULL);
+ }
+
+ ASSERT3U(bp->blk_birth, ==, txg);
+ ztp->zit_pad = 0;
+ ztp->zit_nused = lwb->lwb_nused;
+ ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
+ bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+
+ /*
+ * Allocate a new log write buffer (lwb).
+ */
+ nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+
+ nlwb->lwb_zilog = zilog;
+ nlwb->lwb_blk = *bp;
+ nlwb->lwb_nused = 0;
+ nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
+ nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
+ nlwb->lwb_max_txg = txg;
+ nlwb->lwb_zio = NULL;
+
+ /*
+ * Put new lwb at the end of the log chain
+ */
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_lwb_list, nlwb);
+ mutex_exit(&zilog->zl_lock);
+
+ /* Record the vdev for later flushing */
+ zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk))));
+
+ /*
+ * kick off the write for the old log block
+ */
+ dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
+ ASSERT(lwb->lwb_zio);
+ zio_nowait(lwb->lwb_zio);
+
+ return (nlwb);
+}
+
+static lwb_t *
+zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
+{
+ lr_t *lrc = &itx->itx_lr; /* common log record */
+ lr_write_t *lr = (lr_write_t *)lrc;
+ uint64_t txg = lrc->lrc_txg;
+ uint64_t reclen = lrc->lrc_reclen;
+ uint64_t dlen;
+
+ if (lwb == NULL)
+ return (NULL);
+ ASSERT(lwb->lwb_buf != NULL);
+
+ if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
+ dlen = P2ROUNDUP_TYPED(
+ lr->lr_length, sizeof (uint64_t), uint64_t);
+ else
+ dlen = 0;
+
+ zilog->zl_cur_used += (reclen + dlen);
+
+ zil_lwb_write_init(zilog, lwb);
+
+ /*
+ * If this record won't fit in the current log block, start a new one.
+ */
+ if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+ lwb = zil_lwb_write_start(zilog, lwb);
+ if (lwb == NULL)
+ return (NULL);
+ zil_lwb_write_init(zilog, lwb);
+ ASSERT(lwb->lwb_nused == 0);
+ if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ return (lwb);
+ }
+ }
+
+ /*
+ * Update lrc_seq to be the log record sequence number (see zil.h).
+ * Then copy the record to the log buffer.
+ */
+ lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
+ bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
+
+ /*
+ * If it's a write, fetch the data or get its blkptr as appropriate.
+ */
+ if (lrc->lrc_txtype == TX_WRITE) {
+ if (txg > spa_freeze_txg(zilog->zl_spa))
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ if (itx->itx_wr_state != WR_COPIED) {
+ char *dbuf;
+ int error;
+
+ /* alignment is guaranteed */
+ lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
+ if (dlen) {
+ ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+ dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
+ lr->lr_common.lrc_reclen += dlen;
+ } else {
+ ASSERT(itx->itx_wr_state == WR_INDIRECT);
+ dbuf = NULL;
+ }
+ error = zilog->zl_get_data(
+ itx->itx_private, lr, dbuf, lwb->lwb_zio);
+ if (error) {
+ ASSERT(error == ENOENT || error == EEXIST ||
+ error == EALREADY);
+ return (lwb);
+ }
+ }
+ }
+
+ lwb->lwb_nused += reclen + dlen;
+ lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
+ ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
+ ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
+
+ return (lwb);
+}
+
+itx_t *
+zil_itx_create(int txtype, size_t lrsize)
+{
+ itx_t *itx;
+
+ lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
+
+ itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
+ itx->itx_lr.lrc_txtype = txtype;
+ itx->itx_lr.lrc_reclen = lrsize;
+ itx->itx_lr.lrc_seq = 0; /* defensive */
+
+ return (itx);
+}
+
+uint64_t
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+ uint64_t seq;
+
+ ASSERT(itx->itx_lr.lrc_seq == 0);
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_itx_list, itx);
+ zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen;
+ itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+ itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
+ mutex_exit(&zilog->zl_lock);
+
+ return (seq);
+}
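+
+/*
+ * A sketch of the usual itx lifecycle, with lr_xxx_t standing in for one
+ * of the per-txtype record structures:
+ *
+ *	itx_t *itx = zil_itx_create(txtype, sizeof (lr_xxx_t));
+ *	(fill in the record body at &itx->itx_lr)
+ *	seq = zil_itx_assign(zilog, itx, tx);	(within an open dmu tx)
+ *	...
+ *	zil_commit(zilog, seq, foid);		(e.g. on fsync)
+ */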
+
+/*
+ * Free up all in-memory intent log transactions that have now been synced.
+ */
+static void
+zil_itx_clean(zilog_t *zilog)
+{
+ uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
+ uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
+ list_t clean_list;
+ itx_t *itx;
+
+ list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
+
+ mutex_enter(&zilog->zl_lock);
+ /* wait for a log writer to finish walking list */
+ while (zilog->zl_writer) {
+ cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+ }
+
+ /*
+ * Move the sync'd log transactions to a separate list so we can call
+ * kmem_free without holding the zl_lock.
+ *
+ * There is no need to set zl_writer as we don't drop zl_lock here
+ */
+ while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
+ itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
+ list_remove(&zilog->zl_itx_list, itx);
+ zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen;
+ list_insert_tail(&clean_list, itx);
+ }
+ cv_broadcast(&zilog->zl_cv_writer);
+ mutex_exit(&zilog->zl_lock);
+
+ /* destroy sync'd log transactions */
+ while ((itx = list_head(&clean_list)) != NULL) {
+ list_remove(&clean_list, itx);
+ kmem_free(itx, offsetof(itx_t, itx_lr)
+ + itx->itx_lr.lrc_reclen);
+ }
+ list_destroy(&clean_list);
+}
+
+/*
+ * If there are any in-memory intent log transactions which have now been
+ * synced then start up a taskq to free them.
+ */
+void
+zil_clean(zilog_t *zilog)
+{
+ itx_t *itx;
+
+ mutex_enter(&zilog->zl_lock);
+ itx = list_head(&zilog->zl_itx_list);
+ if ((itx != NULL) &&
+ (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
+ (void) taskq_dispatch(zilog->zl_clean_taskq,
+ (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
+ }
+ mutex_exit(&zilog->zl_lock);
+}
+
+void
+zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
+{
+ uint64_t txg;
+ uint64_t reclen;
+ uint64_t commit_seq = 0;
+ itx_t *itx, *itx_next = (itx_t *)-1;
+ lwb_t *lwb;
+ spa_t *spa;
+
+ zilog->zl_writer = B_TRUE;
+ zilog->zl_root_zio = NULL;
+ spa = zilog->zl_spa;
+
+ if (zilog->zl_suspend) {
+ lwb = NULL;
+ } else {
+ lwb = list_tail(&zilog->zl_lwb_list);
+ if (lwb == NULL) {
+ /*
+ * Return if there's nothing to flush before we
+ * dirty the fs by calling zil_create()
+ */
+ if (list_is_empty(&zilog->zl_itx_list)) {
+ zilog->zl_writer = B_FALSE;
+ return;
+ }
+ mutex_exit(&zilog->zl_lock);
+ zil_create(zilog);
+ mutex_enter(&zilog->zl_lock);
+ lwb = list_tail(&zilog->zl_lwb_list);
+ }
+ }
+
+ /* Loop through in-memory log transactions filling log blocks. */
+ DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
+ for (;;) {
+ /*
+ * Find the next itx to push:
+ * Push all transactions related to specified foid and all
+ * other transactions except TX_WRITE, TX_TRUNCATE,
+ * TX_SETATTR and TX_ACL for all other files.
+ */
+ if (itx_next != (itx_t *)-1)
+ itx = itx_next;
+ else
+ itx = list_head(&zilog->zl_itx_list);
+ for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
+ if (foid == 0) /* push all foids? */
+ break;
+ if (itx->itx_sync) /* push all O_[D]SYNC */
+ break;
+ switch (itx->itx_lr.lrc_txtype) {
+ case TX_SETATTR:
+ case TX_WRITE:
+ case TX_TRUNCATE:
+ case TX_ACL:
+ /* lr_foid is same offset for these records */
+ if (((lr_write_t *)&itx->itx_lr)->lr_foid
+ != foid) {
+ continue; /* skip this record */
+ }
+ }
+ break;
+ }
+ if (itx == NULL)
+ break;
+
+ reclen = itx->itx_lr.lrc_reclen;
+ if ((itx->itx_lr.lrc_seq > seq) &&
+ ((lwb == NULL) || (lwb->lwb_nused == 0) ||
+ (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)))) {
+ break;
+ }
+
+ /*
+ * Save the next pointer. Even though we soon drop
+ * zl_lock all threads that may change the list
+ * (another writer or zil_itx_clean) can't do so until
+ * they have zl_writer.
+ */
+ itx_next = list_next(&zilog->zl_itx_list, itx);
+ list_remove(&zilog->zl_itx_list, itx);
+ mutex_exit(&zilog->zl_lock);
+ txg = itx->itx_lr.lrc_txg;
+ ASSERT(txg);
+
+ if (txg > spa_last_synced_txg(spa) ||
+ txg > spa_freeze_txg(spa))
+ lwb = zil_lwb_commit(zilog, itx, lwb);
+ kmem_free(itx, offsetof(itx_t, itx_lr)
+ + itx->itx_lr.lrc_reclen);
+ mutex_enter(&zilog->zl_lock);
+ zilog->zl_itx_list_sz -= reclen;
+ }
+ DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
+ /* determine commit sequence number */
+ itx = list_head(&zilog->zl_itx_list);
+ if (itx)
+ commit_seq = itx->itx_lr.lrc_seq;
+ else
+ commit_seq = zilog->zl_itx_seq;
+ mutex_exit(&zilog->zl_lock);
+
+ /* write the last block out */
+ if (lwb != NULL && lwb->lwb_zio != NULL)
+ lwb = zil_lwb_write_start(zilog, lwb);
+
+ zilog->zl_prev_used = zilog->zl_cur_used;
+ zilog->zl_cur_used = 0;
+
+ /*
+ * Wait if necessary for the log blocks to be on stable storage.
+ */
+ if (zilog->zl_root_zio) {
+ DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
+ (void) zio_wait(zilog->zl_root_zio);
+ DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
+ if (!zfs_nocacheflush)
+ zil_flush_vdevs(zilog);
+ }
+
+ if (zilog->zl_log_error || lwb == NULL) {
+ zilog->zl_log_error = 0;
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ }
+
+ mutex_enter(&zilog->zl_lock);
+ zilog->zl_writer = B_FALSE;
+
+ ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
+ zilog->zl_commit_seq = commit_seq;
+}
+
+/*
+ * Push zfs transactions to stable storage up to the supplied sequence number.
+ * If foid is 0 push out all transactions, otherwise push only those
+ * for that file or that might have been used to create that file.
+ */
+void
+zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
+{
+ if (zilog == NULL || seq == 0)
+ return;
+
+ mutex_enter(&zilog->zl_lock);
+
+ seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */
+
+ while (zilog->zl_writer) {
+ cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+ if (seq < zilog->zl_commit_seq) {
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ }
+ zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
+ /* wake up others waiting on the commit */
+ cv_broadcast(&zilog->zl_cv_writer);
+ mutex_exit(&zilog->zl_lock);
+}
+
+/*
+ * Called in syncing context to free committed log blocks and update log header.
+ */
+void
+zil_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+ zil_header_t *zh = zil_header_in_syncing_context(zilog);
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = zilog->zl_spa;
+ lwb_t *lwb;
+
+ mutex_enter(&zilog->zl_lock);
+
+ ASSERT(zilog->zl_stop_sync == 0);
+
+ zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
+
+ if (zilog->zl_destroy_txg == txg) {
+ blkptr_t blk = zh->zh_log;
+
+ ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+ ASSERT(spa_sync_pass(spa) == 1);
+
+ bzero(zh, sizeof (zil_header_t));
+ bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
+
+ if (zilog->zl_keep_first) {
+ /*
+ * If this block was part of a log chain that couldn't
+ * be claimed because a device was missing during
+ * zil_claim(), but that device later returns,
+ * then this block could erroneously appear valid.
+ * To guard against this, assign a new GUID to the new
+ * log chain so it doesn't matter what blk points to.
+ */
+ zil_init_log_chain(zilog, &blk);
+ zh->zh_log = blk;
+ }
+ }
+
+ for (;;) {
+ lwb = list_head(&zilog->zl_lwb_list);
+ if (lwb == NULL) {
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ zh->zh_log = lwb->lwb_blk;
+ if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
+ break;
+ list_remove(&zilog->zl_lwb_list, lwb);
+ zio_free_blk(spa, &lwb->lwb_blk, txg);
+ kmem_cache_free(zil_lwb_cache, lwb);
+
+ /*
+ * If we don't have anything left in the lwb list then
+ * we've had an allocation failure and we need to zero
+ * out the zil_header blkptr so that we don't end
+ * up freeing the same block twice.
+ */
+ if (list_head(&zilog->zl_lwb_list) == NULL)
+ BP_ZERO(&zh->zh_log);
+ }
+ mutex_exit(&zilog->zl_lock);
+}
+
+void
+zil_init(void)
+{
+ zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
+ sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+zil_fini(void)
+{
+ kmem_cache_destroy(zil_lwb_cache);
+}
+
+zilog_t *
+zil_alloc(objset_t *os, zil_header_t *zh_phys)
+{
+ zilog_t *zilog;
+
+ zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
+
+ zilog->zl_header = zh_phys;
+ zilog->zl_os = os;
+ zilog->zl_spa = dmu_objset_spa(os);
+ zilog->zl_dmu_pool = dmu_objset_pool(os);
+ zilog->zl_destroy_txg = TXG_INITIAL - 1;
+
+ mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
+ cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+
+ list_create(&zilog->zl_itx_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+
+ list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
+ offsetof(lwb_t, lwb_node));
+
+ list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t),
+ offsetof(zil_vdev_t, vdev_seq_node));
+
+ return (zilog);
+}
+
+void
+zil_free(zilog_t *zilog)
+{
+ lwb_t *lwb;
+ zil_vdev_t *zv;
+
+ zilog->zl_stop_sync = 1;
+
+ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+ list_remove(&zilog->zl_lwb_list, lwb);
+ if (lwb->lwb_buf != NULL)
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ kmem_cache_free(zil_lwb_cache, lwb);
+ }
+ list_destroy(&zilog->zl_lwb_list);
+
+ while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
+ list_remove(&zilog->zl_vdev_list, zv);
+ kmem_free(zv, sizeof (zil_vdev_t));
+ }
+ list_destroy(&zilog->zl_vdev_list);
+
+ ASSERT(list_head(&zilog->zl_itx_list) == NULL);
+ list_destroy(&zilog->zl_itx_list);
+ cv_destroy(&zilog->zl_cv_suspend);
+ cv_destroy(&zilog->zl_cv_writer);
+ mutex_destroy(&zilog->zl_lock);
+
+ kmem_free(zilog, sizeof (zilog_t));
+}
+
+/*
+ * return true if the initial log block is not valid
+ */
+static int
+zil_empty(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ arc_buf_t *abuf = NULL;
+
+ if (BP_IS_HOLE(&zh->zh_log))
+ return (1);
+
+ if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
+ return (1);
+
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ return (0);
+}
+
+/*
+ * Open an intent log.
+ */
+zilog_t *
+zil_open(objset_t *os, zil_get_data_t *get_data)
+{
+ zilog_t *zilog = dmu_objset_zil(os);
+
+ zilog->zl_get_data = get_data;
+ zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
+ 2, 2, TASKQ_PREPOPULATE);
+
+ return (zilog);
+}
+
+/*
+ * Close an intent log.
+ */
+void
+zil_close(zilog_t *zilog)
+{
+ /*
+ * If the log isn't already committed, mark the objset dirty
+ * (so zil_sync() will be called) and wait for that txg to sync.
+ */
+ if (!zil_is_committed(zilog)) {
+ uint64_t txg;
+ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
+ (void) dmu_tx_assign(tx, TXG_WAIT);
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+ dmu_tx_commit(tx);
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ }
+
+ taskq_destroy(zilog->zl_clean_taskq);
+ zilog->zl_clean_taskq = NULL;
+ zilog->zl_get_data = NULL;
+
+ zil_itx_clean(zilog);
+ ASSERT(list_head(&zilog->zl_itx_list) == NULL);
+}
+
+/*
+ * Suspend an intent log. While in suspended mode, we still honor
+ * synchronous semantics, but we rely on txg_wait_synced() to do it.
+ * We suspend the log briefly when taking a snapshot so that the snapshot
+ * contains all the data it's supposed to, and has an empty intent log.
+ */
+int
+zil_suspend(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+
+ mutex_enter(&zilog->zl_lock);
+ if (zh->zh_claim_txg != 0) { /* unplayed log */
+ mutex_exit(&zilog->zl_lock);
+ return (EBUSY);
+ }
+ if (zilog->zl_suspend++ != 0) {
+ /*
+ * Someone else already began a suspend.
+ * Just wait for them to finish.
+ */
+ while (zilog->zl_suspending)
+ cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
+ ASSERT(BP_IS_HOLE(&zh->zh_log));
+ mutex_exit(&zilog->zl_lock);
+ return (0);
+ }
+ zilog->zl_suspending = B_TRUE;
+ mutex_exit(&zilog->zl_lock);
+
+ zil_commit(zilog, UINT64_MAX, 0);
+
+ /*
+ * Wait for any in-flight log writes to complete.
+ */
+ mutex_enter(&zilog->zl_lock);
+ while (zilog->zl_writer)
+ cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+ mutex_exit(&zilog->zl_lock);
+
+ zil_destroy(zilog, B_FALSE);
+
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(BP_IS_HOLE(&zh->zh_log));
+ zilog->zl_suspending = B_FALSE;
+ cv_broadcast(&zilog->zl_cv_suspend);
+ mutex_exit(&zilog->zl_lock);
+
+ return (0);
+}
+
+void
+zil_resume(zilog_t *zilog)
+{
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(zilog->zl_suspend != 0);
+ zilog->zl_suspend--;
+ mutex_exit(&zilog->zl_lock);
+}
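+
+/*
+ * As a sketch of the snapshot use case described above, a caller would
+ * bracket its work roughly as
+ *
+ *	if (zil_suspend(zilog) == 0) {
+ *		(take the snapshot)
+ *		zil_resume(zilog);
+ *	}
+ *
+ * so that the snapshot carries an empty intent log.
+ */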
+
+typedef struct zil_replay_arg {
+ objset_t *zr_os;
+ zil_replay_func_t **zr_replay;
+ void *zr_arg;
+ uint64_t *zr_txgp;
+ boolean_t zr_byteswap;
+ char *zr_lrbuf;
+} zil_replay_arg_t;
+
+static void
+zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
+{
+ zil_replay_arg_t *zr = zra;
+ const zil_header_t *zh = zilog->zl_header;
+ uint64_t reclen = lr->lrc_reclen;
+ uint64_t txtype = lr->lrc_txtype;
+ char *name;
+ int pass, error, sunk;
+
+ if (zilog->zl_stop_replay)
+ return;
+
+ if (lr->lrc_txg < claim_txg) /* already committed */
+ return;
+
+ if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
+ return;
+
+ /*
+ * Make a copy of the data so we can revise and extend it.
+ */
+ bcopy(lr, zr->zr_lrbuf, reclen);
+
+ /*
+ * The log block containing this lr may have been byteswapped
+ * so that we can easily examine common fields like lrc_txtype.
+ * However, the log is a mix of different data types, and only the
+ * replay vectors know how to byteswap their records. Therefore, if
+ * the lr was byteswapped, undo it before invoking the replay vector.
+ */
+ if (zr->zr_byteswap)
+ byteswap_uint64_array(zr->zr_lrbuf, reclen);
+
+ /*
+ * If this is a TX_WRITE with a blkptr, suck in the data.
+ */
+ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
+ lr_write_t *lrw = (lr_write_t *)lr;
+ blkptr_t *wbp = &lrw->lr_blkptr;
+ uint64_t wlen = lrw->lr_length;
+ char *wbuf = zr->zr_lrbuf + reclen;
+
+ if (BP_IS_HOLE(wbp)) { /* compressed to a hole */
+ bzero(wbuf, wlen);
+ } else {
+ /*
+ * A subsequent write may have overwritten this block,
+ * in which case wbp may have been freed and
+ * reallocated, and our read of wbp may fail with a
+ * checksum error. We can safely ignore this because
+ * the later write will provide the correct data.
+ */
+ zbookmark_t zb;
+
+ zb.zb_objset = dmu_objset_id(zilog->zl_os);
+ zb.zb_object = lrw->lr_foid;
+ zb.zb_level = -1;
+ zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);
+
+ (void) zio_wait(zio_read(NULL, zilog->zl_spa,
+ wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
+ (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
+ }
+ }
+
+ /*
+ * We must now do two things atomically: replay this log record,
+ * and update the log header to reflect the fact that we did so.
+ * We use the DMU's ability to assign into a specific txg to do this.
+ */
+ for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
+ uint64_t replay_txg;
+ dmu_tx_t *replay_tx;
+
+ replay_tx = dmu_tx_create(zr->zr_os);
+ error = dmu_tx_assign(replay_tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(replay_tx);
+ break;
+ }
+
+ replay_txg = dmu_tx_get_txg(replay_tx);
+
+ if (txtype == 0 || txtype >= TX_MAX_TYPE) {
+ error = EINVAL;
+ } else {
+ /*
+ * On the first pass, arrange for the replay vector
+ * to fail its dmu_tx_assign(). That's the only way
+ * to ensure that those code paths remain well tested.
+ */
+ *zr->zr_txgp = replay_txg - (pass == 1);
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
+ zr->zr_byteswap);
+ *zr->zr_txgp = TXG_NOWAIT;
+ }
+
+ if (error == 0) {
+ dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
+ zilog->zl_replay_seq[replay_txg & TXG_MASK] =
+ lr->lrc_seq;
+ }
+
+ dmu_tx_commit(replay_tx);
+
+ if (!error)
+ return;
+
+ /*
+ * The DMU's dnode layer doesn't see removes until the txg
+ * commits, so a subsequent claim can spuriously fail with
+ * EEXIST. So if we receive any error other than ERESTART
+ * we try syncing out any removes and then retry the
+ * transaction.
+ */
+ if (error != ERESTART && !sunk) {
+ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ sunk = B_TRUE;
+ continue; /* retry */
+ }
+
+ if (error != ERESTART)
+ break;
+
+ if (pass != 1)
+ txg_wait_open(spa_get_dsl(zilog->zl_spa),
+ replay_txg + 1);
+
+ dprintf("pass %d, retrying\n", pass);
+ }
+
+ ASSERT(error && error != ERESTART);
+ name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+ dmu_objset_name(zr->zr_os, name);
+ cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+ "dataset %s, seq 0x%llx, txtype %llu\n",
+ error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
+ zilog->zl_stop_replay = 1;
+ kmem_free(name, MAXNAMELEN);
+}
+
+/* ARGSUSED */
+static void
+zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ zilog->zl_replay_blks++;
+}
+
+/*
+ * If this dataset has a non-empty intent log, replay it and destroy it.
+ */
+void
+zil_replay(objset_t *os, void *arg, uint64_t *txgp,
+ zil_replay_func_t *replay_func[TX_MAX_TYPE])
+{
+ zilog_t *zilog = dmu_objset_zil(os);
+ const zil_header_t *zh = zilog->zl_header;
+ zil_replay_arg_t zr;
+
+ if (zil_empty(zilog)) {
+ zil_destroy(zilog, B_TRUE);
+ return;
+ }
+ //printf("ZFS: Replaying ZIL on %s...\n", os->os->os_spa->spa_name);
+
+ zr.zr_os = os;
+ zr.zr_replay = replay_func;
+ zr.zr_arg = arg;
+ zr.zr_txgp = txgp;
+ zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
+ zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+
+ /*
+ * Wait for in-progress removes to sync before starting replay.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+
+ zilog->zl_stop_replay = 0;
+ zilog->zl_replay_time = lbolt;
+ ASSERT(zilog->zl_replay_blks == 0);
+ (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
+ zh->zh_claim_txg);
+ kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
+
+ zil_destroy(zilog, B_FALSE);
+ //printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name);
+}
+
+/*
+ * Report whether all transactions are committed
+ */
+int
+zil_is_committed(zilog_t *zilog)
+{
+ lwb_t *lwb;
+ int ret;
+
+ mutex_enter(&zilog->zl_lock);
+ while (zilog->zl_writer)
+ cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+
+ /* recent unpushed intent log transactions? */
+ if (!list_is_empty(&zilog->zl_itx_list)) {
+ ret = B_FALSE;
+ goto out;
+ }
+
+ /* intent log never used? */
+ lwb = list_head(&zilog->zl_lwb_list);
+ if (lwb == NULL) {
+ ret = B_TRUE;
+ goto out;
+ }
+
+ /*
+ * more than 1 log buffer means zil_sync() hasn't yet freed
+ * entries after a txg has committed
+ */
+ if (list_next(&zilog->zl_lwb_list, lwb)) {
+ ret = B_FALSE;
+ goto out;
+ }
+
+ ASSERT(zil_empty(zilog));
+ ret = B_TRUE;
+out:
+ cv_broadcast(&zilog->zl_cv_writer);
+ mutex_exit(&zilog->zl_lock);
+ return (ret);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c
new file mode 100644
index 000000000000..6bc4a361646e
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -0,0 +1,1853 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * ==========================================================================
+ * I/O priority table
+ * ==========================================================================
+ */
+uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
+ 0, /* ZIO_PRIORITY_NOW */
+ 0, /* ZIO_PRIORITY_SYNC_READ */
+ 0, /* ZIO_PRIORITY_SYNC_WRITE */
+ 6, /* ZIO_PRIORITY_ASYNC_READ */
+ 4, /* ZIO_PRIORITY_ASYNC_WRITE */
+ 4, /* ZIO_PRIORITY_FREE */
+ 0, /* ZIO_PRIORITY_CACHE_FILL */
+ 0, /* ZIO_PRIORITY_LOG_WRITE */
+ 10, /* ZIO_PRIORITY_RESILVER */
+ 20, /* ZIO_PRIORITY_SCRUB */
+};
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+char *zio_type_name[ZIO_TYPES] = {
+ "null", "read", "write", "free", "claim", "ioctl" };
+
+/* At or above this size, force gang blocking - for testing */
+uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
+
+/* Force an allocation failure when non-zero */
+uint16_t zio_zil_fail_shift = 0;
+
+typedef struct zio_sync_pass {
+ int zp_defer_free; /* defer frees after this pass */
+ int zp_dontcompress; /* don't compress after this pass */
+ int zp_rewrite; /* rewrite new bps after this pass */
+} zio_sync_pass_t;
+
+zio_sync_pass_t zio_sync_pass = {
+ 1, /* zp_defer_free */
+ 4, /* zp_dontcompress */
+ 1, /* zp_rewrite */
+};
+
+#ifdef ZIO_USE_UMA
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+#endif
+
+#ifdef _KERNEL
+extern vmem_t *zio_alloc_arena;
+#endif
+
+void
+zio_init(void)
+{
+#ifdef ZIO_USE_UMA
+ size_t c;
+#endif
+#if 0
+ vmem_t *data_alloc_arena = NULL;
+
+#ifdef _KERNEL
+ data_alloc_arena = zio_alloc_arena;
+#endif
+#endif
+
+#ifdef ZIO_USE_UMA
+ /*
+ * For small buffers, we want a cache for each multiple of
+ * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
+ * for each quarter-power of 2. For large buffers, we want
+ * a cache for each multiple of PAGESIZE.
+ */
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
+ size_t p2 = size;
+ size_t align = 0;
+
+ while (p2 & (p2 - 1))
+ p2 &= p2 - 1;
+
+ if (size <= 4 * SPA_MINBLOCKSIZE) {
+ align = SPA_MINBLOCKSIZE;
+ } else if (P2PHASE(size, PAGESIZE) == 0) {
+ align = PAGESIZE;
+ } else if (P2PHASE(size, p2 >> 2) == 0) {
+ align = p2 >> 2;
+ }
+
+ if (align != 0) {
+ char name[36];
+ (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
+ zio_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
+
+ (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
+ zio_data_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, data_alloc_arena,
+ KMC_NODEBUG);
+
+ dprintf("creating cache for size %5lx align %5lx\n",
+ size, align);
+ }
+ }
+
+ while (--c != 0) {
+ ASSERT(zio_buf_cache[c] != NULL);
+ if (zio_buf_cache[c - 1] == NULL)
+ zio_buf_cache[c - 1] = zio_buf_cache[c];
+
+ ASSERT(zio_data_buf_cache[c] != NULL);
+ if (zio_data_buf_cache[c - 1] == NULL)
+ zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
+ }
+#endif
+
+ zio_inject_init();
+}
+
+void
+zio_fini(void)
+{
+#ifdef ZIO_USE_UMA
+ size_t c;
+ kmem_cache_t *last_cache = NULL;
+ kmem_cache_t *last_data_cache = NULL;
+
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ if (zio_buf_cache[c] != last_cache) {
+ last_cache = zio_buf_cache[c];
+ kmem_cache_destroy(zio_buf_cache[c]);
+ }
+ zio_buf_cache[c] = NULL;
+
+ if (zio_data_buf_cache[c] != last_data_cache) {
+ last_data_cache = zio_data_buf_cache[c];
+ kmem_cache_destroy(zio_data_buf_cache[c]);
+ }
+ zio_data_buf_cache[c] = NULL;
+ }
+#endif
+
+ zio_inject_fini();
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free I/O buffers
+ * ==========================================================================
+ */
+
+/*
+ * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
+ * crashdump if the kernel panics, so use it judiciously. Obviously, it's
+ * useful to inspect ZFS metadata, but if possible, we should avoid keeping
+ * excess / transient data in-core during a crashdump.
+ */
+void *
+zio_buf_alloc(size_t size)
+{
+#ifdef ZIO_USE_UMA
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
+#else
+ return (kmem_alloc(size, KM_SLEEP));
+#endif
+}
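+
+/*
+ * As an illustration of the cache indexing in the ZIO_USE_UMA case above
+ * (assuming the usual SPA_MINBLOCKSHIFT of 9, i.e. a 512-byte minimum
+ * block): a 12K request computes c = (12288 - 1) >> 9 = 23 and is served
+ * from the cache created for size (23 + 1) << 9 = 12288 bytes.
+ */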
+
+/*
+ * Use zio_data_buf_alloc to allocate data. The data will not appear in a
+ * crashdump if the kernel panics. This exists so that we limit the amount
+ * of ZFS data that shows up in a kernel crashdump, thus reducing the amount
+ * of kernel heap dumped to disk when the kernel panics.
+ */
+void *
+zio_data_buf_alloc(size_t size)
+{
+#ifdef ZIO_USE_UMA
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
+#else
+ return (kmem_alloc(size, KM_SLEEP));
+#endif
+}
+
+void
+zio_buf_free(void *buf, size_t size)
+{
+#ifdef ZIO_USE_UMA
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ kmem_cache_free(zio_buf_cache[c], buf);
+#else
+ kmem_free(buf, size);
+#endif
+}
+
+void
+zio_data_buf_free(void *buf, size_t size)
+{
+#ifdef ZIO_USE_UMA
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ kmem_cache_free(zio_data_buf_cache[c], buf);
+#else
+ kmem_free(buf, size);
+#endif
+}
+
+/*
+ * ==========================================================================
+ * Push and pop I/O transform buffers
+ * ==========================================================================
+ */
+static void
+zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
+{
+ zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+
+ zt->zt_data = data;
+ zt->zt_size = size;
+ zt->zt_bufsize = bufsize;
+
+ zt->zt_next = zio->io_transform_stack;
+ zio->io_transform_stack = zt;
+
+ zio->io_data = data;
+ zio->io_size = size;
+}
+
+static void
+zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
+{
+ zio_transform_t *zt = zio->io_transform_stack;
+
+ *data = zt->zt_data;
+ *size = zt->zt_size;
+ *bufsize = zt->zt_bufsize;
+
+ zio->io_transform_stack = zt->zt_next;
+ kmem_free(zt, sizeof (zio_transform_t));
+
+ if ((zt = zio->io_transform_stack) != NULL) {
+ zio->io_data = zt->zt_data;
+ zio->io_size = zt->zt_size;
+ }
+}
+
+static void
+zio_clear_transform_stack(zio_t *zio)
+{
+ void *data;
+ uint64_t size, bufsize;
+
+ ASSERT(zio->io_transform_stack != NULL);
+
+ zio_pop_transform(zio, &data, &size, &bufsize);
+ while (zio->io_transform_stack != NULL) {
+ zio_buf_free(data, bufsize);
+ zio_pop_transform(zio, &data, &size, &bufsize);
+ }
+}
+
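[Editor's note: the transform stack just pushed and popped above is a singly linked LIFO; each push records a buffer and its sizes and repoints io_data/io_size at the newest layer, and each pop restores the previous one. A toy, self-contained version of the same idea follows, using plain malloc/free and a bare struct in place of zio_t, purely to show the ordering.]

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for zio_transform_t and the zio fields it shadows. */
typedef struct xform {
	void *xt_data;
	size_t xt_size;
	struct xform *xt_next;
} xform_t;

typedef struct toy_io {
	void *io_data;
	size_t io_size;
	xform_t *io_stack;
} toy_io_t;

static void
push_transform(toy_io_t *io, void *data, size_t size)
{
	xform_t *xt = malloc(sizeof (*xt));

	xt->xt_data = data;
	xt->xt_size = size;
	xt->xt_next = io->io_stack;
	io->io_stack = xt;
	io->io_data = data;		/* the newest layer becomes current */
	io->io_size = size;
}

static void
pop_transform(toy_io_t *io, void **data, size_t *size)
{
	xform_t *xt = io->io_stack;

	*data = xt->xt_data;
	*size = xt->xt_size;
	io->io_stack = xt->xt_next;
	free(xt);
	if (io->io_stack != NULL) {	/* restore the previous layer */
		io->io_data = io->io_stack->xt_data;
		io->io_size = io->io_stack->xt_size;
	}
}

int
main(void)
{
	toy_io_t io = { 0 };
	char caller_buf[4096], compress_buf[1024];
	void *d;
	size_t s;

	push_transform(&io, caller_buf, sizeof (caller_buf));
	push_transform(&io, compress_buf, sizeof (compress_buf));
	pop_transform(&io, &d, &s);	/* pops compress_buf first ... */
	printf("popped %zu bytes, current size %zu\n", s, io.io_size);
	pop_transform(&io, &d, &s);	/* ... then caller_buf */
	printf("popped %zu bytes\n", s);
	return (0);
}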
+/*
+ * ==========================================================================
+ * Create the various types of I/O (read, write, free)
+ * ==========================================================================
+ */
+static zio_t *
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ void *data, uint64_t size, zio_done_func_t *done, void *private,
+ zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
+{
+ zio_t *zio;
+
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+
+ zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
+ zio->io_parent = pio;
+ zio->io_spa = spa;
+ zio->io_txg = txg;
+ if (bp != NULL) {
+ zio->io_bp = bp;
+ zio->io_bp_copy = *bp;
+ zio->io_bp_orig = *bp;
+ }
+ zio->io_done = done;
+ zio->io_private = private;
+ zio->io_type = type;
+ zio->io_priority = priority;
+ zio->io_stage = stage;
+ zio->io_pipeline = pipeline;
+ zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
+ zio->io_timestamp = lbolt64;
+ zio->io_flags = flags;
+ mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
+ zio_push_transform(zio, data, size, size);
+
+ /*
+ * Note on config lock:
+ *
+ * If CONFIG_HELD is set, then the caller already has the config
+ * lock, so we don't need it for this io.
+ *
+ * We set CONFIG_GRABBED to indicate that we have grabbed the
+ * config lock on behalf of this io, so it should be released
+ * in zio_done.
+ *
+ * Unless CONFIG_HELD is set, we will grab the config lock for
+ * any top-level (parent-less) io, *except* NULL top-level ios.
+ * The NULL top-level ios rarely have any children, so we delay
+ * grabbing the lock until the first child is added (but it is
+ * still grabbed on behalf of the top-level i/o, so additional
+ * children don't need to also grab it). This greatly reduces
+ * contention on the config lock.
+ */
+ if (pio == NULL) {
+ if (type != ZIO_TYPE_NULL &&
+ !(flags & ZIO_FLAG_CONFIG_HELD)) {
+ spa_config_enter(zio->io_spa, RW_READER, zio);
+ zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
+ }
+ zio->io_root = zio;
+ } else {
+ zio->io_root = pio->io_root;
+ if (!(flags & ZIO_FLAG_NOBOOKMARK))
+ zio->io_logical = pio->io_logical;
+ mutex_enter(&pio->io_lock);
+ if (pio->io_parent == NULL &&
+ pio->io_type == ZIO_TYPE_NULL &&
+ !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
+ !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
+ pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
+ spa_config_enter(zio->io_spa, RW_READER, pio);
+ }
+ if (stage < ZIO_STAGE_READY)
+ pio->io_children_notready++;
+ pio->io_children_notdone++;
+ zio->io_sibling_next = pio->io_child;
+ zio->io_sibling_prev = NULL;
+ if (pio->io_child != NULL)
+ pio->io_child->io_sibling_prev = zio;
+ pio->io_child = zio;
+ zio->io_ndvas = pio->io_ndvas;
+ mutex_exit(&pio->io_lock);
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
+ int flags)
+{
+ zio_t *zio;
+
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+ ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
+ ZIO_WAIT_FOR_CHILDREN_PIPELINE);
+
+ return (zio);
+}
+
+zio_t *
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
+{
+ return (zio_null(NULL, spa, done, private, flags));
+}
+
+zio_t *
+zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+ uint64_t size, zio_done_func_t *done, void *private,
+ int priority, int flags, zbookmark_t *zb)
+{
+ zio_t *zio;
+
+ ASSERT3U(size, ==, BP_GET_LSIZE(bp));
+
+ zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
+ ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
+ ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+ zio->io_bookmark = *zb;
+
+ zio->io_logical = zio;
+
+ /*
+ * Work off our copy of the bp so the caller can free it.
+ */
+ zio->io_bp = &zio->io_bp_copy;
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ uint64_t csize = BP_GET_PSIZE(bp);
+ void *cbuf = zio_buf_alloc(csize);
+
+ zio_push_transform(zio, cbuf, csize, csize);
+ zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
+ }
+
+ if (BP_IS_GANG(bp)) {
+ uint64_t gsize = SPA_GANGBLOCKSIZE;
+ void *gbuf = zio_buf_alloc(gsize);
+
+ zio_push_transform(zio, gbuf, gsize, gsize);
+ zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
+ int flags, zbookmark_t *zb)
+{
+ zio_t *zio;
+
+ ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
+ checksum < ZIO_CHECKSUM_FUNCTIONS);
+
+ ASSERT(compress >= ZIO_COMPRESS_OFF &&
+ compress < ZIO_COMPRESS_FUNCTIONS);
+
+ zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
+ ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+
+ zio->io_ready = ready;
+
+ zio->io_bookmark = *zb;
+
+ zio->io_logical = zio;
+
+ zio->io_checksum = checksum;
+ zio->io_compress = compress;
+ zio->io_ndvas = ncopies;
+
+ if (compress != ZIO_COMPRESS_OFF)
+ zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
+
+ if (bp->blk_birth != txg) {
+ /* XXX the bp usually (always?) gets re-zeroed later */
+ BP_ZERO(bp);
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+ } else {
+ /* Make sure someone doesn't change their mind on overwrites */
+ ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
+ spa_max_replication(spa)) == BP_GET_NDVAS(bp));
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags,
+ zbookmark_t *zb)
+{
+ zio_t *zio;
+
+ zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
+ ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+
+ zio->io_bookmark = *zb;
+ zio->io_checksum = checksum;
+ zio->io_compress = ZIO_COMPRESS_OFF;
+
+ if (pio != NULL)
+ ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
+
+ return (zio);
+}
+
+static zio_t *
+zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags)
+{
+ zio_t *zio;
+
+ BP_ZERO(bp);
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+
+ zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags,
+ ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
+
+ zio->io_checksum = checksum;
+ zio->io_compress = ZIO_COMPRESS_OFF;
+
+ return (zio);
+}
+
+zio_t *
+zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private)
+{
+ zio_t *zio;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ if (txg == spa->spa_syncing_txg &&
+ spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
+ bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
+ return (zio_null(pio, spa, NULL, NULL, 0));
+ }
+
+ zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
+ ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
+ ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
+
+ zio->io_bp = &zio->io_bp_copy;
+
+ return (zio);
+}
+
+zio_t *
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private)
+{
+ zio_t *zio;
+
+ /*
+ * A claim is an allocation of a specific block. Claims are needed
+ * to support immediate writes in the intent log. The issue is that
+ * immediate writes contain committed data, but in a txg that was
+ * *not* committed. Upon opening the pool after an unclean shutdown,
+ * the intent log claims all blocks that contain immediate write data
+ * so that the SPA knows they're in use.
+ *
+ * All claims *must* be resolved in the first txg -- before the SPA
+ * starts allocating blocks -- so that nothing is allocated twice.
+ */
+ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
+ ASSERT3U(spa_first_txg(spa), <=, txg);
+
+ zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
+ ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
+ ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+
+ zio->io_bp = &zio->io_bp_copy;
+
+ return (zio);
+}
+
+zio_t *
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+ zio_done_func_t *done, void *private, int priority, int flags)
+{
+ zio_t *zio;
+ int c;
+
+ if (vd->vdev_children == 0) {
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+ ZIO_TYPE_IOCTL, priority, flags,
+ ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
+
+ zio->io_vd = vd;
+ zio->io_cmd = cmd;
+ } else {
+ zio = zio_null(pio, spa, NULL, NULL, flags);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
+ done, private, priority, flags));
+ }
+
+ return (zio);
+}
+
+static void
+zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
+ int checksum)
+{
+ ASSERT(vd->vdev_children == 0);
+
+ ASSERT(size <= SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+ ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
+
+ ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+ ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+ BP_ZERO(bp);
+
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ if (checksum != ZIO_CHECKSUM_OFF)
+ ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
+}
+
+zio_t *
+zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ void *data, int checksum, zio_done_func_t *done, void *private,
+ int priority, int flags)
+{
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio_phys_bp_init(vd, &blk, offset, size, checksum);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
+ ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
+ ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+
+ zio->io_vd = vd;
+ zio->io_offset = offset;
+
+ /*
+ * Work off our copy of the bp so the caller can free it.
+ */
+ zio->io_bp = &zio->io_bp_copy;
+
+ return (zio);
+}
+
+zio_t *
+zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ void *data, int checksum, zio_done_func_t *done, void *private,
+ int priority, int flags)
+{
+ zio_block_tail_t *zbt;
+ void *wbuf;
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio_phys_bp_init(vd, &blk, offset, size, checksum);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
+ ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+
+ zio->io_vd = vd;
+ zio->io_offset = offset;
+
+ zio->io_bp = &zio->io_bp_copy;
+ zio->io_checksum = checksum;
+
+ if (zio_checksum_table[checksum].ci_zbt) {
+ /*
+ * zbt checksums are necessarily destructive -- they modify
+ * one word of the write buffer to hold the verifier/checksum.
+ * Therefore, we must make a local copy in case the data is
+ * being written to multiple places.
+ */
+ wbuf = zio_buf_alloc(size);
+ bcopy(data, wbuf, size);
+ zio_push_transform(zio, wbuf, size, size);
+
+ zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
+ zbt->zbt_cksum = blk.blk_cksum;
+ }
+
+ return (zio);
+}
+
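[Editor's note: for embedded ("zbt") checksums, zio_write_phys() above copies the data because the verifier lives in the last sizeof (zio_block_tail_t) bytes of the buffer itself. A minimal standalone sketch of the `(tail *)((char *)buf + size) - 1` trailer placement follows; the toy_tail_t layout and TOY_MAGIC value are hypothetical, the real zio_block_tail_t is defined in the zio headers.]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for zio_block_tail_t: a magic word plus a checksum. */
typedef struct toy_tail {
	uint64_t tt_magic;
	uint64_t tt_cksum;
} toy_tail_t;

#define	TOY_MAGIC	0x210da7ab10c7a11ULL	/* arbitrary illustrative value */

int
main(void)
{
	size_t size = 4096;
	char *buf = calloc(1, size);

	/* Point at the last sizeof (toy_tail_t) bytes of the buffer. */
	toy_tail_t *tail = (toy_tail_t *)(buf + size) - 1;

	memcpy(buf, "payload", 7);	/* ordinary data up front */
	tail->tt_magic = TOY_MAGIC;	/* the verifier overwrites the tail... */
	tail->tt_cksum = 0;		/* ...which is why the data is copied */

	printf("tail lives at offset %zu of %zu\n",
	    (size_t)((char *)tail - buf), size);
	free(buf);
	return (0);
}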
+/*
+ * Create a child I/O to do some work for us. It has no associated bp.
+ */
+zio_t *
+zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
+ void *data, uint64_t size, int type, int priority, int flags,
+ zio_done_func_t *done, void *private)
+{
+ uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
+ zio_t *cio;
+
+ if (type == ZIO_TYPE_READ && bp != NULL) {
+ /*
+ * If we have the bp, then the child should perform the
+ * checksum and the parent need not. This pushes error
+ * detection as close to the leaves as possible and
+ * eliminates redundant checksums in the interior nodes.
+ */
+ pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
+ zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+ }
+
+ cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
+ done, private, type, priority,
+ (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
+ ZIO_STAGE_VDEV_IO_START - 1, pipeline);
+
+ cio->io_vd = vd;
+ cio->io_offset = offset;
+
+ return (cio);
+}
+
+/*
+ * ==========================================================================
+ * Initiate I/O, either sync or async
+ * ==========================================================================
+ */
+int
+zio_wait(zio_t *zio)
+{
+ int error;
+
+ ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+
+ zio->io_waiter = curthread;
+
+ zio_next_stage_async(zio);
+
+ mutex_enter(&zio->io_lock);
+ while (zio->io_stalled != ZIO_STAGE_DONE)
+ cv_wait(&zio->io_cv, &zio->io_lock);
+ mutex_exit(&zio->io_lock);
+
+ error = zio->io_error;
+ cv_destroy(&zio->io_cv);
+ mutex_destroy(&zio->io_lock);
+ kmem_free(zio, sizeof (zio_t));
+
+ return (error);
+}
+
+void
+zio_nowait(zio_t *zio)
+{
+ zio_next_stage_async(zio);
+}
+
+/*
+ * ==========================================================================
+ * I/O pipeline interlocks: parent/child dependency scoreboarding
+ * ==========================================================================
+ */
+static void
+zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
+{
+ mutex_enter(&zio->io_lock);
+ if (*countp == 0) {
+ ASSERT(zio->io_stalled == 0);
+ mutex_exit(&zio->io_lock);
+ zio_next_stage(zio);
+ } else {
+ zio->io_stalled = stage;
+ mutex_exit(&zio->io_lock);
+ }
+}
+
+static void
+zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
+{
+ zio_t *pio = zio->io_parent;
+
+ mutex_enter(&pio->io_lock);
+ if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ pio->io_error = zio->io_error;
+ if (--*countp == 0 && pio->io_stalled == stage) {
+ pio->io_stalled = 0;
+ mutex_exit(&pio->io_lock);
+ zio_next_stage_async(pio);
+ } else {
+ mutex_exit(&pio->io_lock);
+ }
+}
+
+static void
+zio_wait_children_ready(zio_t *zio)
+{
+ zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+ &zio->io_children_notready);
+}
+
+void
+zio_wait_children_done(zio_t *zio)
+{
+ zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+ &zio->io_children_notdone);
+}
+
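[Editor's note: the interlock above is a scoreboard: a parent counts not-yet-ready and not-yet-done children, stalls at a wait stage while a counter is non-zero, and is restarted by whichever child drives the counter to zero. A single-threaded sketch of that bookkeeping follows, with no locks or taskqs; advance() stands in for zio_next_stage_async() and the stage number is made up.]

#include <stdio.h>

typedef struct toy_zio {
	int io_children_notdone;
	int io_stalled;			/* non-zero: parked at that stage */
} toy_zio_t;

#define	STAGE_WAIT_CHILDREN_DONE	1

static void
advance(toy_zio_t *pio)
{
	printf("parent resumes past stage %d\n", pio->io_stalled);
	pio->io_stalled = 0;
}

static void
wait_for_children(toy_zio_t *pio, int stage)
{
	if (pio->io_children_notdone == 0)
		advance(pio);			/* nothing outstanding */
	else
		pio->io_stalled = stage;	/* park until the last child */
}

static void
child_done(toy_zio_t *pio, int stage)
{
	if (--pio->io_children_notdone == 0 && pio->io_stalled == stage)
		advance(pio);			/* last child restarts parent */
}

int
main(void)
{
	toy_zio_t parent = { .io_children_notdone = 2 };

	wait_for_children(&parent, STAGE_WAIT_CHILDREN_DONE);	/* parks */
	child_done(&parent, STAGE_WAIT_CHILDREN_DONE);		/* one left */
	child_done(&parent, STAGE_WAIT_CHILDREN_DONE);		/* resumes */
	return (0);
}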
+static void
+zio_ready(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+
+ if (zio->io_ready)
+ zio->io_ready(zio);
+
+ if (pio != NULL)
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+ &pio->io_children_notready);
+
+ if (zio->io_bp)
+ zio->io_bp_copy = *zio->io_bp;
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ vdev_t *vd = zio->io_vd;
+
+ ASSERT(zio->io_children_notready == 0);
+ ASSERT(zio->io_children_notdone == 0);
+
+ if (bp != NULL) {
+ ASSERT(bp->blk_pad[0] == 0);
+ ASSERT(bp->blk_pad[1] == 0);
+ ASSERT(bp->blk_pad[2] == 0);
+ ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
+ if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+ !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
+ ASSERT(!BP_SHOULD_BYTESWAP(bp));
+ if (zio->io_ndvas != 0)
+ ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
+ ASSERT(BP_COUNT_GANG(bp) == 0 ||
+ (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
+ }
+ }
+
+ if (vd != NULL)
+ vdev_stat_update(zio);
+
+ if (zio->io_error) {
+ /*
+ * If this I/O is attached to a particular vdev,
+ * generate an error message describing the I/O failure
+ * at the block level. We ignore these errors if the
+ * device is currently unavailable.
+ */
+ if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
+ zfs_ereport_post(FM_EREPORT_ZFS_IO,
+ zio->io_spa, vd, zio, 0, 0);
+
+ if ((zio->io_error == EIO ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
+ zio->io_logical == zio) {
+ /*
+ * For root I/O requests, tell the SPA to log the error
+ * appropriately. Also, generate a logical data
+ * ereport.
+ */
+ spa_log_error(zio->io_spa, zio);
+
+ zfs_ereport_post(FM_EREPORT_ZFS_DATA,
+ zio->io_spa, NULL, zio, 0, 0);
+ }
+
+ /*
+ * For I/O requests that cannot fail, panic appropriately.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ char *blkbuf;
+
+ blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
+ if (blkbuf) {
+ sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
+ bp ? bp : &zio->io_bp_copy);
+ }
+ panic("ZFS: %s (%s on %s off %llx: zio %p %s): error "
+ "%d", zio->io_error == ECKSUM ?
+ "bad checksum" : "I/O failure",
+ zio_type_name[zio->io_type],
+ vdev_description(vd),
+ (u_longlong_t)zio->io_offset,
+ zio, blkbuf ? blkbuf : "", zio->io_error);
+ }
+ }
+ zio_clear_transform_stack(zio);
+
+ if (zio->io_done)
+ zio->io_done(zio);
+
+ ASSERT(zio->io_delegate_list == NULL);
+ ASSERT(zio->io_delegate_next == NULL);
+
+ if (pio != NULL) {
+ zio_t *next, *prev;
+
+ mutex_enter(&pio->io_lock);
+ next = zio->io_sibling_next;
+ prev = zio->io_sibling_prev;
+ if (next != NULL)
+ next->io_sibling_prev = prev;
+ if (prev != NULL)
+ prev->io_sibling_next = next;
+ if (pio->io_child == zio)
+ pio->io_child = next;
+ mutex_exit(&pio->io_lock);
+
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+ &pio->io_children_notdone);
+ }
+
+ /*
+ * Note: this I/O is now done, and will shortly be
+ * kmem_free()'d, so there is no need to clear this (or any
+ * other) flag.
+ */
+ if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
+ spa_config_exit(spa, zio);
+
+ if (zio->io_waiter != NULL) {
+ mutex_enter(&zio->io_lock);
+ ASSERT(zio->io_stage == ZIO_STAGE_DONE);
+ zio->io_stalled = zio->io_stage;
+ cv_broadcast(&zio->io_cv);
+ mutex_exit(&zio->io_lock);
+ } else {
+ kmem_free(zio, sizeof (zio_t));
+ }
+}
+
+/*
+ * ==========================================================================
+ * Compression support
+ * ==========================================================================
+ */
+static void
+zio_write_compress(zio_t *zio)
+{
+ int compress = zio->io_compress;
+ blkptr_t *bp = zio->io_bp;
+ void *cbuf;
+ uint64_t lsize = zio->io_size;
+ uint64_t csize = lsize;
+ uint64_t cbufsize = 0;
+ int pass;
+
+ if (bp->blk_birth == zio->io_txg) {
+ /*
+ * We're rewriting an existing block, which means we're
+ * working on behalf of spa_sync(). For spa_sync() to
+ * converge, it must eventually be the case that we don't
+ * have to allocate new blocks. But compression changes
+ * the blocksize, which forces a reallocate, and makes
+ * convergence take longer. Therefore, after the first
+ * few passes, stop compressing to ensure convergence.
+ */
+ pass = spa_sync_pass(zio->io_spa);
+ if (pass > zio_sync_pass.zp_dontcompress)
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT(BP_IS_HOLE(bp));
+ pass = 1;
+ }
+
+ if (compress != ZIO_COMPRESS_OFF)
+ if (!zio_compress_data(compress, zio->io_data, zio->io_size,
+ &cbuf, &csize, &cbufsize))
+ compress = ZIO_COMPRESS_OFF;
+
+ if (compress != ZIO_COMPRESS_OFF && csize != 0)
+ zio_push_transform(zio, cbuf, csize, cbufsize);
+
+ /*
+ * The final pass of spa_sync() must be all rewrites, but the first
+ * few passes offer a trade-off: allocating blocks defers convergence,
+ * but newly allocated blocks are sequential, so they can be written
+ * to disk faster. Therefore, we allow the first few passes of
+ * spa_sync() to reallocate new blocks, but force rewrites after that.
+ * There should only be a handful of blocks after pass 1 in any case.
+ */
+ if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+ pass > zio_sync_pass.zp_rewrite) {
+ ASSERT(csize != 0);
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_COMPRESS(bp, compress);
+ zio->io_pipeline = ZIO_REWRITE_PIPELINE;
+ } else {
+ if (bp->blk_birth == zio->io_txg)
+ BP_ZERO(bp);
+ if (csize == 0) {
+ BP_ZERO(bp);
+ zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
+ } else {
+ ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_PSIZE(bp, csize);
+ BP_SET_COMPRESS(bp, compress);
+ zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
+ }
+ }
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_read_decompress(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ void *data;
+ uint64_t size;
+ uint64_t bufsize;
+ int compress = BP_GET_COMPRESS(bp);
+
+ ASSERT(compress != ZIO_COMPRESS_OFF);
+
+ zio_pop_transform(zio, &data, &size, &bufsize);
+
+ if (zio_decompress_data(compress, data, size,
+ zio->io_data, zio->io_size))
+ zio->io_error = EIO;
+
+ zio_buf_free(data, bufsize);
+
+ zio_next_stage(zio);
+}
+
+/*
+ * ==========================================================================
+ * Gang block support
+ * ==========================================================================
+ */
+static void
+zio_gang_pipeline(zio_t *zio)
+{
+ /*
+ * By default, the pipeline assumes that we're dealing with a gang
+ * block. If we're not, strip out any gang-specific stages.
+ */
+ if (!BP_IS_GANG(zio->io_bp))
+ zio->io_pipeline &= ~ZIO_GANG_STAGES;
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_gang_byteswap(zio_t *zio)
+{
+ ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
+
+ if (BP_SHOULD_BYTESWAP(zio->io_bp))
+ byteswap_uint64_array(zio->io_data, zio->io_size);
+}
+
+static void
+zio_get_gang_header(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ uint64_t gsize = SPA_GANGBLOCKSIZE;
+ void *gbuf = zio_buf_alloc(gsize);
+
+ ASSERT(BP_IS_GANG(bp));
+
+ zio_push_transform(zio, gbuf, gsize, gsize);
+
+ zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
+ NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT,
+ ZIO_STAGE_OPEN, ZIO_READ_PIPELINE));
+
+ zio_wait_children_done(zio);
+}
+
+static void
+zio_read_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize, loff, lsize;
+ int i;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ lsize = BP_GET_PSIZE(gbp);
+
+ ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
+ ASSERT3U(loff + lsize, <=, zio->io_size);
+ ASSERT(i < SPA_GBH_NBLKPTRS);
+ ASSERT(!BP_IS_HOLE(gbp));
+
+ zio_nowait(zio_read(zio, zio->io_spa, gbp,
+ (char *)zio->io_data + loff, lsize, NULL, NULL,
+ zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
+ &zio->io_bookmark));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+ zio_wait_children_done(zio);
+}
+
+static void
+zio_rewrite_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize, loff, lsize;
+ int i;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+ ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ ASSERT(gsize == gbufsize);
+
+ for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ lsize = BP_GET_PSIZE(gbp);
+
+ ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
+ ASSERT3U(loff + lsize, <=, zio->io_size);
+ ASSERT(i < SPA_GBH_NBLKPTRS);
+ ASSERT(!BP_IS_HOLE(gbp));
+
+ zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
+ zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
+ NULL, NULL, zio->io_priority, zio->io_flags,
+ &zio->io_bookmark));
+ }
+
+ zio_push_transform(zio, gbh, gsize, gbufsize);
+ zio_wait_children_ready(zio);
+}
+
+static void
+zio_free_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize;
+ int i;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+
+ if (BP_IS_HOLE(gbp))
+ continue;
+ zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
+ gbp, NULL, NULL));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+ zio_next_stage(zio);
+}
+
+static void
+zio_claim_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize;
+ int i;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ if (BP_IS_HOLE(gbp))
+ continue;
+ zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
+ gbp, NULL, NULL));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+ zio_next_stage(zio);
+}
+
+static void
+zio_write_allocate_gang_member_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+ dva_t *cdva = zio->io_bp->blk_dva;
+ dva_t *pdva = pio->io_bp->blk_dva;
+ uint64_t asize;
+ int d;
+
+ ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
+ ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+ ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
+ ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
+
+ mutex_enter(&pio->io_lock);
+ for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
+ ASSERT(DVA_GET_GANG(&pdva[d]));
+ asize = DVA_GET_ASIZE(&pdva[d]);
+ asize += DVA_GET_ASIZE(&cdva[d]);
+ DVA_SET_ASIZE(&pdva[d], asize);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
+static void
+zio_write_allocate_gang_members(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dva_t *dva = bp->blk_dva;
+ spa_t *spa = zio->io_spa;
+ zio_gbh_phys_t *gbh;
+ uint64_t txg = zio->io_txg;
+ uint64_t resid = zio->io_size;
+ uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
+ uint64_t gsize, loff, lsize;
+ uint32_t gbps_left;
+ int ndvas = zio->io_ndvas;
+ int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
+ int error;
+ int i, d;
+
+ gsize = SPA_GANGBLOCKSIZE;
+ gbps_left = SPA_GBH_NBLKPTRS;
+
+ error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE);
+ if (error == ENOSPC)
+ panic("can't allocate gang block header");
+ ASSERT(error == 0);
+
+ for (d = 0; d < gbh_ndvas; d++)
+ DVA_SET_GANG(&dva[d], 1);
+
+ bp->blk_birth = txg;
+
+ gbh = zio_buf_alloc(gsize);
+ bzero(gbh, gsize);
+
+ /* We need to test multi-level gang blocks */
+ if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
+ maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
+
+ for (loff = 0, i = 0; loff != zio->io_size;
+ loff += lsize, resid -= lsize, gbps_left--, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ dva = gbp->blk_dva;
+
+ ASSERT(gbps_left != 0);
+ maxalloc = MIN(maxalloc, resid);
+
+ while (resid <= maxalloc * gbps_left) {
+ error = metaslab_alloc(spa, maxalloc, gbp, ndvas,
+ txg, bp, B_FALSE);
+ if (error == 0)
+ break;
+ ASSERT3U(error, ==, ENOSPC);
+ if (maxalloc == SPA_MINBLOCKSIZE)
+ panic("really out of space");
+ maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
+ }
+
+ if (resid <= maxalloc * gbps_left) {
+ lsize = maxalloc;
+ BP_SET_LSIZE(gbp, lsize);
+ BP_SET_PSIZE(gbp, lsize);
+ BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
+ gbp->blk_birth = txg;
+ zio_nowait(zio_rewrite(zio, spa,
+ zio->io_checksum, txg, gbp,
+ (char *)zio->io_data + loff, lsize,
+ zio_write_allocate_gang_member_done, NULL,
+ zio->io_priority, zio->io_flags,
+ &zio->io_bookmark));
+ } else {
+ lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
+ ASSERT(lsize != SPA_MINBLOCKSIZE);
+ zio_nowait(zio_write_allocate(zio, spa,
+ zio->io_checksum, txg, gbp,
+ (char *)zio->io_data + loff, lsize,
+ zio_write_allocate_gang_member_done, NULL,
+ zio->io_priority, zio->io_flags));
+ }
+ }
+
+ ASSERT(resid == 0 && loff == zio->io_size);
+
+ zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
+
+ zio_push_transform(zio, gbh, gsize, gsize);
+ /*
+ * As much as we'd like this to be zio_wait_children_ready(),
+ * updating our ASIZE doesn't happen until the io_done callback,
+ * so we have to wait for that to finish in order for our BP
+ * to be stable.
+ */
+ zio_wait_children_done(zio);
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+static void
+zio_dva_allocate(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ int error;
+
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
+ ASSERT3U(zio->io_ndvas, >, 0);
+ ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa));
+
+ /* For testing, make some blocks above a certain size be gang blocks */
+ if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
+ zio_write_allocate_gang_members(zio);
+ return;
+ }
+
+ ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+ error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas,
+ zio->io_txg, NULL, B_FALSE);
+
+ if (error == 0) {
+ bp->blk_birth = zio->io_txg;
+ } else if (error == ENOSPC) {
+ if (zio->io_size == SPA_MINBLOCKSIZE)
+ panic("really, truly out of space");
+ zio_write_allocate_gang_members(zio);
+ return;
+ } else {
+ zio->io_error = error;
+ }
+ zio_next_stage(zio);
+}
+
+static void
+zio_dva_free(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);
+
+ BP_ZERO(bp);
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_dva_claim(zio_t *zio)
+{
+ zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
+
+ zio_next_stage(zio);
+}
+
+/*
+ * ==========================================================================
+ * Read and write to physical devices
+ * ==========================================================================
+ */
+
+static void
+zio_vdev_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd ? vd->vdev_top : NULL;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t align;
+
+ if (vd == NULL) {
+ /* The mirror_ops handle multiple DVAs in a single BP */
+ vdev_mirror_ops.vdev_op_io_start(zio);
+ return;
+ }
+
+ align = 1ULL << tvd->vdev_ashift;
+
+ if (zio->io_retries == 0 && vd == tvd)
+ zio->io_flags |= ZIO_FLAG_FAILFAST;
+
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+ vd->vdev_children == 0) {
+ zio->io_flags |= ZIO_FLAG_PHYSICAL;
+ zio->io_offset += VDEV_LABEL_START_SIZE;
+ }
+
+ if (P2PHASE(zio->io_size, align) != 0) {
+ uint64_t asize = P2ROUNDUP(zio->io_size, align);
+ char *abuf = zio_buf_alloc(asize);
+ ASSERT(vd == tvd);
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ bcopy(zio->io_data, abuf, zio->io_size);
+ bzero(abuf + zio->io_size, asize - zio->io_size);
+ }
+ zio_push_transform(zio, abuf, asize, asize);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
+ zio->io_flags |= ZIO_FLAG_SUBBLOCK;
+ }
+
+ ASSERT(P2PHASE(zio->io_offset, align) == 0);
+ ASSERT(P2PHASE(zio->io_size, align) == 0);
+ ASSERT(bp == NULL ||
+ P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
+ ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
+
+ vdev_io_start(zio);
+
+ /* zio_next_stage_async() gets called from io completion interrupt */
+}
+
+static void
+zio_vdev_io_done(zio_t *zio)
+{
+ if (zio->io_vd == NULL)
+ /* The mirror_ops handle multiple DVAs in a single BP */
+ vdev_mirror_ops.vdev_op_io_done(zio);
+ else
+ vdev_io_done(zio);
+}
+
+/* XXPOLICY */
+boolean_t
+zio_should_retry(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ if (zio->io_error == 0)
+ return (B_FALSE);
+ if (zio->io_delegate_list != NULL)
+ return (B_FALSE);
+ if (vd && vd != vd->vdev_top)
+ return (B_FALSE);
+ if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
+ return (B_FALSE);
+ if (zio->io_retries > 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+static void
+zio_vdev_io_assess(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd ? vd->vdev_top : NULL;
+
+ ASSERT(zio->io_vsd == NULL);
+
+ if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
+ void *abuf;
+ uint64_t asize;
+ ASSERT(vd == tvd);
+ zio_pop_transform(zio, &abuf, &asize, &asize);
+ if (zio->io_type == ZIO_TYPE_READ)
+ bcopy(abuf, zio->io_data, zio->io_size);
+ zio_buf_free(abuf, asize);
+ zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
+ }
+
+ if (zio_injection_enabled && !zio->io_error)
+ zio->io_error = zio_handle_fault_injection(zio, EIO);
+
+ /*
+ * If the I/O failed, determine whether we should attempt to retry it.
+ */
+ /* XXPOLICY */
+ if (zio_should_retry(zio)) {
+ ASSERT(tvd == vd);
+
+ zio->io_retries++;
+ zio->io_error = 0;
+ zio->io_flags &= ZIO_FLAG_VDEV_INHERIT |
+ ZIO_FLAG_CONFIG_GRABBED;
+ /* XXPOLICY */
+ zio->io_flags &= ~ZIO_FLAG_FAILFAST;
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
+
+ dprintf("retry #%d for %s to %s offset %llx\n",
+ zio->io_retries, zio_type_name[zio->io_type],
+ vdev_description(vd), zio->io_offset);
+
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ if (zio->io_error != 0 && zio->io_error != ECKSUM &&
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) {
+ /*
+ * Poor man's hotplug support. Even if we're done retrying this
+ * I/O, try to reopen the vdev to see if it's still attached.
+ * To avoid excessive thrashing, we only try it once a minute.
+ * This also has the effect of detecting when missing devices
+ * have come back, by polling the device once a minute.
+ *
+ * We need to do this asynchronously because we can't grab
+ * all the necessary locks way down here.
+ */
+ if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) {
+ vd->vdev_last_try = gethrtime();
+ tvd->vdev_reopen_wanted = 1;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN);
+ }
+ }
+
+ zio_next_stage(zio);
+}
+
+void
+zio_vdev_io_reissue(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
+ zio->io_stage--;
+}
+
+void
+zio_vdev_io_redone(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
+
+ zio->io_stage--;
+}
+
+void
+zio_vdev_io_bypass(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
+ zio->io_flags |= ZIO_FLAG_IO_BYPASS;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
+}
+
+/*
+ * ==========================================================================
+ * Generate and verify checksums
+ * ==========================================================================
+ */
+static void
+zio_checksum_generate(zio_t *zio)
+{
+ int checksum = zio->io_checksum;
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_gang_checksum_generate(zio_t *zio)
+{
+ zio_cksum_t zc;
+ zio_gbh_phys_t *gbh = zio->io_data;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+ ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+
+ zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
+
+ zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_checksum_verify(zio_t *zio)
+{
+ if (zio->io_bp != NULL) {
+ zio->io_error = zio_checksum_error(zio);
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, zio->io_vd, zio, 0, 0);
+ }
+
+ zio_next_stage(zio);
+}
+
+/*
+ * Called by RAID-Z to ensure we don't compute the checksum twice.
+ */
+void
+zio_checksum_verified(zio_t *zio)
+{
+ zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+}
+
+/*
+ * Set the external verifier for a gang block based on stuff in the bp
+ */
+void
+zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
+ zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
+ zcp->zc_word[2] = bp->blk_birth;
+ zcp->zc_word[3] = 0;
+}
+
+/*
+ * ==========================================================================
+ * Define the pipeline
+ * ==========================================================================
+ */
+typedef void zio_pipe_stage_t(zio_t *zio);
+
+static void
+zio_badop(zio_t *zio)
+{
+ panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
+}
+
+zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
+ zio_badop,
+ zio_wait_children_ready,
+ zio_write_compress,
+ zio_checksum_generate,
+ zio_gang_pipeline,
+ zio_get_gang_header,
+ zio_rewrite_gang_members,
+ zio_free_gang_members,
+ zio_claim_gang_members,
+ zio_dva_allocate,
+ zio_dva_free,
+ zio_dva_claim,
+ zio_gang_checksum_generate,
+ zio_ready,
+ zio_vdev_io_start,
+ zio_vdev_io_done,
+ zio_vdev_io_assess,
+ zio_wait_children_done,
+ zio_checksum_verify,
+ zio_read_gang_members,
+ zio_read_decompress,
+ zio_done,
+ zio_badop
+};
+
+/*
+ * Move an I/O to the next stage of the pipeline and execute that stage.
+ * There's no locking on io_stage because there's no legitimate way for
+ * multiple threads to be attempting to process the same I/O.
+ */
+void
+zio_next_stage(zio_t *zio)
+{
+ uint32_t pipeline = zio->io_pipeline;
+
+ ASSERT(!MUTEX_HELD(&zio->io_lock));
+
+ if (zio->io_error) {
+ dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
+ zio, vdev_description(zio->io_vd),
+ zio->io_offset, zio->io_stage, zio->io_error);
+ if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+ pipeline &= ZIO_ERROR_PIPELINE_MASK;
+ }
+
+ while (((1U << ++zio->io_stage) & pipeline) == 0)
+ continue;
+
+ ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+ ASSERT(zio->io_stalled == 0);
+
+ /*
+ * See the comment in zio_next_stage_async() about per-CPU taskqs.
+ */
+ if (((1U << zio->io_stage) & zio->io_async_stages) &&
+ (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
+ !(zio->io_flags & ZIO_FLAG_METADATA)) {
+ taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
+ (void) taskq_dispatch(tq,
+ (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
+ } else {
+ zio_pipeline[zio->io_stage](zio);
+ }
+}
+
+void
+zio_next_stage_async(zio_t *zio)
+{
+ taskq_t *tq;
+ uint32_t pipeline = zio->io_pipeline;
+
+ ASSERT(!MUTEX_HELD(&zio->io_lock));
+
+ if (zio->io_error) {
+ dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
+ zio, vdev_description(zio->io_vd),
+ zio->io_offset, zio->io_stage, zio->io_error);
+ if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+ pipeline &= ZIO_ERROR_PIPELINE_MASK;
+ }
+
+ while (((1U << ++zio->io_stage) & pipeline) == 0)
+ continue;
+
+ ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+ ASSERT(zio->io_stalled == 0);
+
+ /*
+ * For performance, we'll probably want two sets of task queues:
+ * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU
+ * part is for read performance: since we have to make a pass over
+ * the data to checksum it anyway, we want to do this on the same CPU
+ * that issued the read, because (assuming CPU scheduling affinity)
+ * that thread is probably still there. Getting this optimization
+ * right avoids performance-hostile cache-to-cache transfers.
+ *
+ * Note that having two sets of task queues is also necessary for
+ * correctness: if all of the issue threads get bogged down waiting
+ * for dependent reads (e.g. metaslab freelist) to complete, then
+ * there won't be any threads available to service I/O completion
+ * interrupts.
+ */
+ if ((1U << zio->io_stage) & zio->io_async_stages) {
+ if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
+ tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
+ else
+ tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
+ (void) taskq_dispatch(tq,
+ (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
+ } else {
+ zio_pipeline[zio->io_stage](zio);
+ }
+}
+
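[Editor's note: both dispatch routines above advance a zio by scanning a 32-bit stage bitmask: io_stage is incremented until its bit is set in io_pipeline, so disabled stages are skipped without per-stage branching. A standalone sketch of that scan follows; the four-stage enum is invented for the example, the real stage numbering lives in zio_impl.h.]

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stage numbers, just to exercise the mask walk. */
enum { ST_OPEN = 0, ST_COMPRESS, ST_CHECKSUM, ST_ISSUE, ST_DONE };

static const char *names[] = {
	"open", "compress", "checksum", "issue", "done"
};

int
main(void)
{
	/* Pipeline with the compress stage deliberately left out. */
	uint32_t pipeline =
	    (1U << ST_CHECKSUM) | (1U << ST_ISSUE) | (1U << ST_DONE);
	uint32_t stage = ST_OPEN;

	while (stage != ST_DONE) {
		/* Same idiom as zio_next_stage(): advance to the next set bit. */
		while (((1U << ++stage) & pipeline) == 0)
			continue;
		printf("run stage %s\n", names[stage]);
	}
	return (0);
}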
+static boolean_t
+zio_alloc_should_fail(void)
+{
+ static uint16_t allocs = 0;
+
+ return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0);
+}
+
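[Editor's note: zio_alloc_should_fail() above fails every (1 << zio_zil_fail_shift)-th intent-log allocation by checking the low bits of a running counter; P2PHASE(x, align) is the Solaris macro for x modulo a power of two. A small sketch of the same counter trick follows, with a fail shift of 3 (every 8th call) assumed purely for illustration.]

#include <stdint.h>
#include <stdio.h>

/* P2PHASE(x, align) for power-of-two align, as in the Solaris sysmacros. */
#define	P2PHASE(x, align)	((x) & ((align) - 1))

static unsigned int toy_fail_shift = 3;	/* assumed: fail every 8th call */

static int
alloc_should_fail(void)
{
	static uint16_t allocs = 0;

	return (P2PHASE(allocs++, 1U << toy_fail_shift) == 0);
}

int
main(void)
{
	for (int i = 0; i < 10; i++)
		printf("alloc %d: %s\n", i,
		    alloc_should_fail() ? "inject ENOSPC" : "ok");
	return (0);
}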
+/*
+ * Try to allocate an intent log block. Return 0 on success, errno on failure.
+ */
+int
+zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
+ uint64_t txg)
+{
+ int error;
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ if (zio_zil_fail_shift && zio_alloc_should_fail()) {
+ spa_config_exit(spa, FTAG);
+ return (ENOSPC);
+ }
+
+ /*
+	 * We were passed the previous log block's dva_t in bp->blk_dva[0].
+ */
+ error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE);
+
+ if (error == 0) {
+ BP_SET_LSIZE(new_bp, size);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_LEVEL(new_bp, 0);
+ BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
+ new_bp->blk_birth = txg;
+ }
+
+ spa_config_exit(spa, FTAG);
+
+ return (error);
+}
+
+/*
+ * Free an intent log block. We know it can't be a gang block, so there's
+ * nothing to do except metaslab_free() it.
+ */
+void
+zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
+{
+ ASSERT(!BP_IS_GANG(bp));
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ metaslab_free(spa, bp, txg, B_FALSE);
+
+ spa_config_exit(spa, FTAG);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
new file mode 100644
index 000000000000..f0d9a1463580
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
@@ -0,0 +1,172 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * Checksum vectors.
+ *
+ * In the SPA, everything is checksummed. We support checksum vectors
+ * for three distinct reasons:
+ *
+ * 1. Different kinds of data need different levels of protection.
+ * For SPA metadata, we always want a very strong checksum.
+ * For user data, we let users make the trade-off between speed
+ * and checksum strength.
+ *
+ * 2. Cryptographic hash and MAC algorithms are an area of active research.
+ *    It is likely that future hash functions will be at least as strong
+ * as current best-of-breed, and may be substantially faster as well.
+ * We want the ability to take advantage of these new hashes as soon as
+ * they become available.
+ *
+ * 3. If someone develops hardware that can compute a strong hash quickly,
+ * we want the ability to take advantage of that hardware.
+ *
+ * Of course, we don't want a checksum upgrade to invalidate existing
+ * data, so we store the checksum *function* in five bits of the DVA.
+ * This gives us room for up to 32 different checksum functions.
+ *
+ * When writing a block, we always checksum it with the latest-and-greatest
+ * checksum function of the appropriate strength. When reading a block,
+ * we compare the expected checksum against the actual checksum, which we
+ * compute via the checksum function specified in the DVA encoding.
+ */
+
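[Editor's note: the table below realizes the scheme described above: the checksum value recorded with each block indexes a (native, byteswap) function pair, so old blocks keep verifying with the function they were written with while new writes can move to a stronger one. The following self-contained sketch shows the same dispatch pattern with two deliberately toy "checksums"; they are not the real fletcher/SHA-256 vectors.]

#include <stdint.h>
#include <stdio.h>

typedef uint64_t toy_cksum_t;
typedef void toy_cksum_func_t(const void *, size_t, toy_cksum_t *);

static void
cksum_sum64(const void *buf, size_t len, toy_cksum_t *out)
{
	const uint8_t *p = buf;
	toy_cksum_t c = 0;

	while (len--)
		c += *p++;
	*out = c;
}

static void
cksum_xor64(const void *buf, size_t len, toy_cksum_t *out)
{
	const uint8_t *p = buf;
	toy_cksum_t c = 0;

	while (len--)
		c ^= *p++;
	*out = c;
}

/* The id stored with each block selects its verification function. */
static toy_cksum_func_t *toy_table[] = { cksum_sum64, cksum_xor64 };
#define	TOY_LATEST	1	/* new writes always use the newest entry */

int
main(void)
{
	const char data[] = "zfs block payload";
	toy_cksum_t stored, verify;
	uint8_t block_cksum_id = TOY_LATEST;	/* recorded at write time */

	toy_table[block_cksum_id](data, sizeof (data), &stored);

	/* On read, dispatch via whatever id the block carries. */
	toy_table[block_cksum_id](data, sizeof (data), &verify);
	printf("checksum %s\n", stored == verify ? "ok" : "BAD");
	return (0);
}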
+/*ARGSUSED*/
+static void
+zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
+ {{NULL, NULL}, 0, 0, "inherit"},
+ {{NULL, NULL}, 0, 0, "on"},
+ {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"},
+ {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"},
+};
+
+uint8_t
+zio_checksum_select(uint8_t child, uint8_t parent)
+{
+ ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+ if (child == ZIO_CHECKSUM_INHERIT)
+ return (parent);
+
+ if (child == ZIO_CHECKSUM_ON)
+ return (ZIO_CHECKSUM_ON_VALUE);
+
+ return (child);
+}
+
+/*
+ * Generate the checksum.
+ */
+void
+zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size)
+{
+ zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+ zio_cksum_t zbt_cksum;
+
+ ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(ci->ci_func[0] != NULL);
+
+ if (ci->ci_zbt) {
+ *zcp = zbt->zbt_cksum;
+ zbt->zbt_magic = ZBT_MAGIC;
+ ci->ci_func[0](data, size, &zbt_cksum);
+ zbt->zbt_cksum = zbt_cksum;
+ } else {
+ ci->ci_func[0](data, size, zcp);
+ }
+}
+
+int
+zio_checksum_error(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ zio_cksum_t zc = bp->blk_cksum;
+ uint_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER :
+ BP_GET_CHECKSUM(bp);
+ int byteswap = BP_SHOULD_BYTESWAP(bp);
+ void *data = zio->io_data;
+ uint64_t size = ZIO_GET_IOSIZE(zio);
+ zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+ zio_cksum_t actual_cksum, expected_cksum;
+
+ if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
+ return (EINVAL);
+
+ if (ci->ci_zbt) {
+ if (checksum == ZIO_CHECKSUM_GANG_HEADER)
+ zio_set_gang_verifier(zio, &zc);
+
+ if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) {
+ expected_cksum = zbt->zbt_cksum;
+ byteswap_uint64_array(&expected_cksum,
+ sizeof (zio_cksum_t));
+ zbt->zbt_cksum = zc;
+ byteswap_uint64_array(&zbt->zbt_cksum,
+ sizeof (zio_cksum_t));
+ ci->ci_func[1](data, size, &actual_cksum);
+ zbt->zbt_cksum = expected_cksum;
+ byteswap_uint64_array(&zbt->zbt_cksum,
+ sizeof (zio_cksum_t));
+ } else {
+ expected_cksum = zbt->zbt_cksum;
+ zbt->zbt_cksum = zc;
+ ci->ci_func[0](data, size, &actual_cksum);
+ zbt->zbt_cksum = expected_cksum;
+ }
+ zc = expected_cksum;
+ } else {
+ ASSERT(!BP_IS_GANG(bp));
+ ci->ci_func[byteswap](data, size, &actual_cksum);
+ }
+
+ if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc))
+ return (ECKSUM);
+
+ if (zio_injection_enabled && !zio->io_error)
+ return (zio_handle_fault_injection(zio, ECKSUM));
+
+ return (0);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
new file mode 100644
index 000000000000..c563be4eb955
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
@@ -0,0 +1,148 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/compress.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Compression vectors.
+ */
+
+zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
+ {NULL, NULL, 0, "inherit"},
+ {NULL, NULL, 0, "on"},
+ {NULL, NULL, 0, "uncompressed"},
+ {lzjb_compress, lzjb_decompress, 0, "lzjb"},
+ {NULL, NULL, 0, "empty"},
+ {gzip_compress, gzip_decompress, 1, "gzip-1"},
+ {gzip_compress, gzip_decompress, 2, "gzip-2"},
+ {gzip_compress, gzip_decompress, 3, "gzip-3"},
+ {gzip_compress, gzip_decompress, 4, "gzip-4"},
+ {gzip_compress, gzip_decompress, 5, "gzip-5"},
+ {gzip_compress, gzip_decompress, 6, "gzip-6"},
+ {gzip_compress, gzip_decompress, 7, "gzip-7"},
+ {gzip_compress, gzip_decompress, 8, "gzip-8"},
+ {gzip_compress, gzip_decompress, 9, "gzip-9"},
+};
+
+uint8_t
+zio_compress_select(uint8_t child, uint8_t parent)
+{
+ ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON);
+
+ if (child == ZIO_COMPRESS_INHERIT)
+ return (parent);
+
+ if (child == ZIO_COMPRESS_ON)
+ return (ZIO_COMPRESS_ON_VALUE);
+
+ return (child);
+}
+
+int
+zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
+ uint64_t *destsizep, uint64_t *destbufsizep)
+{
+ uint64_t *word, *word_end;
+ uint64_t ciosize, gapsize, destbufsize;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ char *dest;
+ uint_t allzero;
+
+ ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
+
+ /*
+ * If the data is all zeroes, we don't even need to allocate
+ * a block for it. We indicate this by setting *destsizep = 0.
+ */
+ allzero = 1;
+ word = src;
+ word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
+ while (word < word_end) {
+ if (*word++ != 0) {
+ allzero = 0;
+ break;
+ }
+ }
+ if (allzero) {
+ *destp = NULL;
+ *destsizep = 0;
+ *destbufsizep = 0;
+ return (1);
+ }
+
+ if (cpfunc == ZIO_COMPRESS_EMPTY)
+ return (0);
+
+ /* Compress at least 12.5% */
+ destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
+ if (destbufsize == 0)
+ return (0);
+ dest = zio_buf_alloc(destbufsize);
+
+ ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
+ (size_t)destbufsize, ci->ci_level);
+ if (ciosize > destbufsize) {
+ zio_buf_free(dest, destbufsize);
+ return (0);
+ }
+
+ /* Cool. We compressed at least as much as we were hoping to. */
+
+ /* For security, make sure we don't write random heap crap to disk */
+ gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
+ if (gapsize != 0) {
+ bzero(dest + ciosize, gapsize);
+ ciosize += gapsize;
+ }
+
+ ASSERT3U(ciosize, <=, destbufsize);
+ ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
+ *destp = dest;
+ *destsizep = ciosize;
+ *destbufsizep = destbufsize;
+
+ return (1);
+}
+
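[Editor's note: the "compress at least 12.5%" rule above sizes the destination buffer at P2ALIGN(srcsize - srcsize/8, SPA_MINBLOCKSIZE); if the compressor cannot fit the data in that buffer the block is stored uncompressed, and any slack up to the next 512-byte boundary is zeroed so stale heap bytes never reach disk. A worked sketch of that arithmetic follows, assuming the conventional 512-byte SPA_MINBLOCKSIZE and an invented compressed size of 100000 bytes.]

#include <stdint.h>
#include <stdio.h>

/* Power-of-two round-down/round-up, as in the Solaris sysmacros. */
#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

#define	SPA_MINBLOCKSIZE	512ULL	/* assumed 512-byte minimum block */

int
main(void)
{
	uint64_t srcsize = 131072;	/* a 128 KiB logical block */

	/* Ceiling on acceptable compressed output: at least 1/8 saved. */
	uint64_t destbufsize = P2ALIGN(srcsize - (srcsize >> 3),
	    SPA_MINBLOCKSIZE);

	/* Suppose the compressor produced 100000 bytes. */
	uint64_t ciosize = 100000;
	uint64_t gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;

	printf("destbufsize = %llu (output must not exceed this)\n",
	    (unsigned long long)destbufsize);		/* prints 114688 */
	printf("pad %llu zero bytes to reach %llu\n",
	    (unsigned long long)gapsize,
	    (unsigned long long)(ciosize + gapsize));	/* 352 -> 100352 */
	return (0);
}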
+int
+zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
+ void *dest, uint64_t destsize)
+{
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+ ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+
+ return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
new file mode 100644
index 000000000000..4cada09d835c
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
@@ -0,0 +1,315 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS fault injection
+ *
+ * To handle fault injection, we keep track of a series of zinject_record_t
+ * structures which describe which logical block(s) should be injected with a
+ * fault. These are kept in a global list. Each record corresponds to a given
+ * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
+ * or exported while the injection record exists.
+ *
+ * Device level injection is done using the 'zi_guid' field. If this is set, it
+ * means that the error is destined for a particular device, not a piece of
+ * data.
+ *
+ * This is a rather poor data structure and algorithm, but we don't expect more
+ * than a few faults at any one time, so it should be sufficient for our needs.
+ */
+
+#include <sys/arc.h>
+#include <sys/zio_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+
+uint32_t zio_injection_enabled;
+
+typedef struct inject_handler {
+ int zi_id;
+ spa_t *zi_spa;
+ zinject_record_t zi_record;
+ list_node_t zi_link;
+} inject_handler_t;
+
+static list_t inject_handlers;
+static krwlock_t inject_lock;
+static int inject_next_id = 1;
+
+/*
+ * Returns true if the given record matches the I/O in progress.
+ */
+static boolean_t
+zio_match_handler(zbookmark_t *zb, uint64_t type,
+ zinject_record_t *record, int error)
+{
+ /*
+ * Check for a match against the MOS, which is based on type
+ */
+ if (zb->zb_objset == 0 && record->zi_objset == 0 &&
+ record->zi_object == 0) {
+ if (record->zi_type == DMU_OT_NONE ||
+ type == record->zi_type)
+ return (record->zi_freq == 0 ||
+ spa_get_random(100) < record->zi_freq);
+ else
+ return (B_FALSE);
+ }
+
+ /*
+ * Check for an exact match.
+ */
+ if (zb->zb_objset == record->zi_objset &&
+ zb->zb_object == record->zi_object &&
+ zb->zb_level == record->zi_level &&
+ zb->zb_blkid >= record->zi_start &&
+ zb->zb_blkid <= record->zi_end &&
+ error == record->zi_error)
+ return (record->zi_freq == 0 ||
+ spa_get_random(100) < record->zi_freq);
+
+ return (B_FALSE);
+}
+
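[Editor's note: zio_match_handler() above makes each matching record probabilistic: a zi_freq of 0 always injects, otherwise the fault fires only when a uniform draw in [0, 100) lands below zi_freq. The following sketch shows that gate in isolation, using rand() as a stand-in for spa_get_random(); the trial count and 25% frequency are made up for the demo.]

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Stand-in for spa_get_random(100): uniform value in [0, 100). */
static int
toy_get_random(int range)
{
	return (rand() % range);
}

static int
should_inject(int zi_freq)
{
	/* zi_freq == 0 means "always"; otherwise it is a percentage. */
	return (zi_freq == 0 || toy_get_random(100) < zi_freq);
}

int
main(void)
{
	int hits = 0, trials = 10000, freq = 25;

	srand((unsigned)time(NULL));
	for (int i = 0; i < trials; i++)
		hits += should_inject(freq);
	printf("injected %d of %d I/Os (~%d%% requested)\n",
	    hits, trials, freq);
	return (0);
}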
+/*
+ * Determine if the I/O in question should return failure. Returns the errno
+ * to be returned to the caller.
+ */
+int
+zio_handle_fault_injection(zio_t *zio, int error)
+{
+ int ret = 0;
+ inject_handler_t *handler;
+
+ /*
+ * Ignore I/O not associated with any logical data.
+ */
+ if (zio->io_logical == NULL)
+ return (0);
+
+ /*
+ * Currently, we only support fault injection on reads.
+ */
+ if (zio->io_type != ZIO_TYPE_READ)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ /* Ignore errors not destined for this pool */
+ if (zio->io_spa != handler->zi_spa)
+ continue;
+
+ /* Ignore device errors */
+ if (handler->zi_record.zi_guid != 0)
+ continue;
+
+ /* If this handler matches, return EIO */
+ if (zio_match_handler(&zio->io_logical->io_bookmark,
+ zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+ &handler->zi_record, error)) {
+ ret = error;
+ break;
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+int
+zio_handle_device_injection(vdev_t *vd, int error)
+{
+ inject_handler_t *handler;
+ int ret = 0;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (vd->vdev_guid == handler->zi_record.zi_guid) {
+ if (handler->zi_record.zi_error == error) {
+ /*
+ * For a failed open, pretend like the device
+ * has gone away.
+ */
+ if (error == ENXIO)
+ vd->vdev_stat.vs_aux =
+ VDEV_AUX_OPEN_FAILED;
+ ret = error;
+ break;
+ }
+ if (handler->zi_record.zi_error == ENXIO) {
+ ret = EIO;
+ break;
+ }
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+/*
+ * Create a new handler for the given record. We add it to the list, adding
+ * a reference to the spa_t in the process. We increment zio_injection_enabled,
+ * which is the switch to trigger all fault injection.
+ */
+int
+zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int error;
+ spa_t *spa;
+
+ /*
+ * If this is pool-wide metadata, make sure we unload the corresponding
+ * spa_t, so that the next attempt to load it will trigger the fault.
+ * We call spa_reset() to unload the pool appropriately.
+ */
+ if (flags & ZINJECT_UNLOAD_SPA)
+ if ((error = spa_reset(name)) != 0)
+ return (error);
+
+ if (!(flags & ZINJECT_NULL)) {
+ /*
+		 * spa_inject_addref() will add an injection reference, which will
+ * prevent the pool from being removed from the namespace while
+ * still allowing it to be unloaded.
+ */
+ if ((spa = spa_inject_addref(name)) == NULL)
+ return (ENOENT);
+
+ handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ *id = handler->zi_id = inject_next_id++;
+ handler->zi_spa = spa;
+ handler->zi_record = *record;
+ list_insert_tail(&inject_handlers, handler);
+ atomic_add_32(&zio_injection_enabled, 1);
+
+ rw_exit(&inject_lock);
+ }
+
+ /*
+ * Flush the ARC, so that any attempts to read this data will end up
+ * going to the ZIO layer. Note that this is a little overkill, but
+ * we don't have the necessary ARC interfaces to do anything else, and
+ * fault injection isn't a performance critical path.
+ */
+ if (flags & ZINJECT_FLUSH_ARC)
+ arc_flush();
+
+ return (0);
+}
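
As an illustration of the interface above, a fault-injection consumer (for example the zinject ioctl path) might register a handler that fails reads of one object with EIO and later remove it. This is only an editor's sketch, not part of the commit: the objset/object numbers are hypothetical and the surrounding kernel context is assumed.

static int
example_inject_read_eio(char *poolname)
{
	zinject_record_t record;
	int id, error;

	bzero(&record, sizeof (record));
	record.zi_objset = 21;		/* hypothetical objset number */
	record.zi_object = 7;		/* hypothetical object number */
	record.zi_level = 0;
	record.zi_start = 0;
	record.zi_end = -1ULL;		/* match every block of the object */
	record.zi_error = EIO;
	record.zi_freq = 0;		/* 0 means "inject on every match" */

	error = zio_inject_fault(poolname, ZINJECT_FLUSH_ARC, &id, &record);
	if (error != 0)
		return (error);

	/* ... reads of the object should now fail with EIO ... */

	return (zio_clear_fault(id));
}
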
+
+/*
+ * Returns the next record with an ID greater than that supplied to the
+ * function. Used to iterate over all handlers in the system.
+ */
+int
+zio_inject_list_next(int *id, char *name, size_t buflen,
+ zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int ret;
+
+ mutex_enter(&spa_namespace_lock);
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id > *id)
+ break;
+
+ if (handler) {
+ *record = handler->zi_record;
+ *id = handler->zi_id;
+ (void) strncpy(name, spa_name(handler->zi_spa), buflen);
+ ret = 0;
+ } else {
+ ret = ENOENT;
+ }
+
+ rw_exit(&inject_lock);
+ mutex_exit(&spa_namespace_lock);
+
+ return (ret);
+}
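
For completeness, a caller can walk every registered handler with the iterator above. The sketch below is editorial: it starts from id 0 and stops once ENOENT is returned.

static void
example_list_handlers(void)
{
	zinject_record_t record;
	char pool[MAXPATHLEN];
	int id = 0;

	while (zio_inject_list_next(&id, pool, sizeof (pool), &record) == 0) {
		/* inspect 'pool', 'id' and 'record' here */
	}
}
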
+
+/*
+ * Clear the fault handler with the given identifier, or return ENOENT if none
+ * exists.
+ */
+int
+zio_clear_fault(int id)
+{
+ inject_handler_t *handler;
+ int ret;
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id == id)
+ break;
+
+ if (handler == NULL) {
+ ret = ENOENT;
+ } else {
+ list_remove(&inject_handlers, handler);
+ spa_inject_delref(handler->zi_spa);
+ kmem_free(handler, sizeof (inject_handler_t));
+ atomic_add_32(&zio_injection_enabled, -1);
+ ret = 0;
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+void
+zio_inject_init(void)
+{
+ list_create(&inject_handlers, sizeof (inject_handler_t),
+ offsetof(inject_handler_t, zi_link));
+}
+
+void
+zio_inject_fini(void)
+{
+ list_destroy(&inject_handlers);
+}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c
new file mode 100644
index 000000000000..9aa8b7f31b36
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -0,0 +1,796 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/zvol/dsk/<pool_name>/<dataset_name>
+ * /dev/zvol/rdsk/<pool_name>/<dataset_name>
+ *
+ * These links are created by the ZFS-specific devfsadm link generator.
+ * Volumes are persistent through reboot. No user command needs to be
+ * run before opening and using a device.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/bio.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dsl_prop.h>
+#include <sys/byteorder.h>
+#include <sys/dirent.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zil.h>
+#include <sys/refcount.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_rlock.h>
+#include <geom/geom.h>
+
+#include "zfs_namecheck.h"
+
+struct g_class zfs_zvol_class = {
+ .name = "ZFS::ZVOL",
+ .version = G_VERSION,
+};
+
+DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+
+#define ZVOL_OBJ 1ULL
+#define ZVOL_ZAP_OBJ 2ULL
+
+static uint32_t zvol_minors;
+
+/*
+ * The in-core state of each volume.
+ */
+typedef struct zvol_state {
+ char zv_name[MAXPATHLEN]; /* pool/dd name */
+ uint64_t zv_volsize; /* amount of space we advertise */
+ uint64_t zv_volblocksize; /* volume block size */
+ struct g_provider *zv_provider; /* GEOM provider */
+ uint8_t zv_min_bs; /* minimum addressable block shift */
+ uint8_t zv_readonly; /* hard readonly; like write-protect */
+ objset_t *zv_objset; /* objset handle */
+ uint32_t zv_mode; /* DS_MODE_* flags at open time */
+ uint32_t zv_total_opens; /* total open count */
+ zilog_t *zv_zilog; /* ZIL handle */
+ uint64_t zv_txg_assign; /* txg to assign during ZIL replay */
+ znode_t zv_znode; /* for range locking */
+ int zv_state;
+ struct bio_queue_head zv_queue;
+ struct mtx zv_queue_mtx; /* zv_queue mutex */
+} zvol_state_t;
+
+/*
+ * zvol maximum transfer in one DMU tx.
+ */
+int zvol_maxphys = DMU_MAX_ACCESS/2;
+
+static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
+
+int
+zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
+{
+ if (volsize == 0)
+ return (EINVAL);
+
+ if (volsize % blocksize != 0)
+ return (EINVAL);
+
+#ifdef _ILP32
+ if (volsize - 1 > SPEC_MAXOFFSET_T)
+ return (EOVERFLOW);
+#endif
+ return (0);
+}
+
+int
+zvol_check_volblocksize(uint64_t volblocksize)
+{
+ if (volblocksize < SPA_MINBLOCKSIZE ||
+ volblocksize > SPA_MAXBLOCKSIZE ||
+ !ISP2(volblocksize))
+ return (EDOM);
+
+ return (0);
+}
+
+static void
+zvol_readonly_changed_cb(void *arg, uint64_t newval)
+{
+ zvol_state_t *zv = arg;
+
+ zv->zv_readonly = (uint8_t)newval;
+}
+
+int
+zvol_get_stats(objset_t *os, nvlist_t *nv)
+{
+ int error;
+ dmu_object_info_t doi;
+ uint64_t val;
+
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
+ if (error)
+ return (error);
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
+
+ error = dmu_object_info(os, ZVOL_OBJ, &doi);
+
+ if (error == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
+ doi.doi_data_block_size);
+ }
+
+ return (error);
+}
+
+static zvol_state_t *
+zvol_minor_lookup(const char *name)
+{
+ struct g_provider *pp;
+ struct g_geom *gp;
+
+ g_topology_assert();
+
+ LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
+ LIST_FOREACH(pp, &gp->provider, provider) {
+ if (strcmp(pp->name + sizeof(ZVOL_DEV_DIR), name) == 0)
+ return (pp->private);
+ }
+ }
+
+ return (NULL);
+}
+
+static int
+zvol_access(struct g_provider *pp, int acr, int acw, int ace)
+{
+ zvol_state_t *zv;
+
+ g_topology_assert();
+
+ zv = pp->private;
+ if (zv == NULL) {
+ if (acr <= 0 && acw <= 0 && ace <= 0)
+ return (0);
+ return (pp->error);
+ }
+
+ ASSERT(zv->zv_objset != NULL);
+
+ if (acw > 0 && (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)))
+ return (EROFS);
+
+ zv->zv_total_opens += acr + acw + ace;
+
+ return (0);
+}
+
+/*
+ * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
+ *
+ * We store data in the log buffers if it's small enough.
+ * Otherwise we will later flush the data out via dmu_sync().
+ */
+ssize_t zvol_immediate_write_sz = 32768;
+
+static void
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
+{
+ uint32_t blocksize = zv->zv_volblocksize;
+ lr_write_t *lr;
+
+ while (len) {
+ ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
+ itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+
+ itx->itx_wr_state =
+ len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY;
+ itx->itx_private = zv;
+ lr = (lr_write_t *)&itx->itx_lr;
+ lr->lr_foid = ZVOL_OBJ;
+ lr->lr_offset = off;
+ lr->lr_length = nbytes;
+ lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
+ BP_ZERO(&lr->lr_blkptr);
+
+ (void) zil_itx_assign(zv->zv_zilog, itx, tx);
+ len -= nbytes;
+ off += nbytes;
+ }
+}
+
+static void
+zvol_start(struct bio *bp)
+{
+ zvol_state_t *zv;
+
+ switch (bp->bio_cmd) {
+ case BIO_READ:
+ case BIO_WRITE:
+ case BIO_FLUSH:
+ zv = bp->bio_to->private;
+ ASSERT(zv != NULL);
+ mtx_lock(&zv->zv_queue_mtx);
+ bioq_insert_tail(&zv->zv_queue, bp);
+ wakeup_one(&zv->zv_queue);
+ mtx_unlock(&zv->zv_queue_mtx);
+ break;
+ case BIO_DELETE:
+ case BIO_GETATTR:
+ default:
+ g_io_deliver(bp, EOPNOTSUPP);
+ break;
+ }
+}
+
+static void
+zvol_serve_one(zvol_state_t *zv, struct bio *bp)
+{
+ uint64_t off, volsize;
+ size_t size, resid;
+ char *addr;
+ objset_t *os;
+ rl_t *rl;
+ int error = 0;
+ boolean_t reading;
+
+ off = bp->bio_offset;
+ volsize = zv->zv_volsize;
+
+ os = zv->zv_objset;
+ ASSERT(os != NULL);
+
+ addr = bp->bio_data;
+ resid = bp->bio_length;
+
+ error = 0;
+
+ /*
+ * There must be no buffer changes when doing a dmu_sync() because
+ * we can't change the data whilst calculating the checksum.
+ * A better approach than a per zvol rwlock would be to lock ranges.
+ */
+ reading = (bp->bio_cmd == BIO_READ);
+ rl = zfs_range_lock(&zv->zv_znode, off, resid,
+ reading ? RL_READER : RL_WRITER);
+
+ while (resid != 0 && off < volsize) {
+
+ size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
+
+ if (size > volsize - off) /* don't write past the end */
+ size = volsize - off;
+
+ if (reading) {
+ error = dmu_read(os, ZVOL_OBJ, off, size, addr);
+ } else {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
+ zvol_log_write(zv, tx, off, size);
+ dmu_tx_commit(tx);
+ }
+ }
+ if (error)
+ break;
+ off += size;
+ addr += size;
+ resid -= size;
+ }
+ zfs_range_unlock(rl);
+
+ bp->bio_completed = bp->bio_length - resid;
+ if (bp->bio_completed < bp->bio_length)
+ bp->bio_error = (off > volsize ? EINVAL : error);
+
+ /*
+	 * XXX: We are delivering here?
+ * Looks like I don't understand something here, but I was sure it was
+ * an async request.
+ */
+ g_io_deliver(bp, bp->bio_error);
+}
+
+static void
+zvol_worker(void *arg)
+{
+ zvol_state_t *zv;
+ struct bio *bp;
+
+ zv = arg;
+ for (;;) {
+ mtx_lock(&zv->zv_queue_mtx);
+ bp = bioq_takefirst(&zv->zv_queue);
+ if (bp == NULL) {
+ if (zv->zv_state == 1) {
+ zv->zv_state = 2;
+ wakeup(&zv->zv_state);
+ mtx_unlock(&zv->zv_queue_mtx);
+ kthread_exit(0);
+ }
+ msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
+ "zvol:io", 0);
+ continue;
+ }
+ mtx_unlock(&zv->zv_queue_mtx);
+ if (bp->bio_cmd == BIO_FLUSH) {
+ zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
+ g_io_deliver(bp, 0);
+ } else {
+ zvol_serve_one(zv, bp);
+ }
+ }
+}
+
+void
+zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
+{
+ zfs_create_data_t *zc = arg;
+ int error;
+ uint64_t volblocksize, volsize;
+
+ VERIFY(nvlist_lookup_uint64(zc->zc_props,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
+ if (nvlist_lookup_uint64(zc->zc_props,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
+ volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
+
+ /*
+	 * These properties must be removed from the list so the generic
+ * property setting step won't apply to them.
+ */
+ VERIFY(nvlist_remove_all(zc->zc_props,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
+ (void) nvlist_remove_all(zc->zc_props,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
+
+ error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
+ ASSERT(error == 0);
+}
+
+/*
+ * Replay a TX_WRITE ZIL transaction that didn't get committed
+ * after a system failure
+ */
+static int
+zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
+{
+ objset_t *os = zv->zv_objset;
+ char *data = (char *)(lr + 1); /* data follows lr_write_t */
+ uint64_t off = lr->lr_offset;
+ uint64_t len = lr->lr_length;
+ dmu_tx_t *tx;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
+ error = dmu_tx_assign(tx, zv->zv_txg_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, off, len, data, tx);
+ dmu_tx_commit(tx);
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
+{
+ return (ENOTSUP);
+}
+
+/*
+ * Callback vectors for replaying records.
+ * Only TX_WRITE is needed for zvol.
+ */
+zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
+ zvol_replay_err, /* 0 no such transaction type */
+ zvol_replay_err, /* TX_CREATE */
+ zvol_replay_err, /* TX_MKDIR */
+ zvol_replay_err, /* TX_MKXATTR */
+ zvol_replay_err, /* TX_SYMLINK */
+ zvol_replay_err, /* TX_REMOVE */
+ zvol_replay_err, /* TX_RMDIR */
+ zvol_replay_err, /* TX_LINK */
+ zvol_replay_err, /* TX_RENAME */
+ zvol_replay_write, /* TX_WRITE */
+ zvol_replay_err, /* TX_TRUNCATE */
+ zvol_replay_err, /* TX_SETATTR */
+ zvol_replay_err, /* TX_ACL */
+};
+
+/*
+ * Create a minor node for the specified volume.
+ */
+int
+zvol_create_minor(const char *name, dev_t dev)
+{
+ struct g_provider *pp;
+ struct g_geom *gp;
+ zvol_state_t *zv;
+ objset_t *os;
+ dmu_object_info_t doi;
+ uint64_t volsize;
+ int ds_mode = DS_MODE_PRIMARY;
+ int error;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ if ((zv = zvol_minor_lookup(name)) != NULL) {
+ error = EEXIST;
+ goto end;
+ }
+
+ if (strchr(name, '@') != 0)
+ ds_mode |= DS_MODE_READONLY;
+
+ error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
+ if (error)
+ goto end;
+
+ g_topology_unlock();
+ PICKUP_GIANT();
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ DROP_GIANT();
+ g_topology_lock();
+ if (error) {
+ dmu_objset_close(os);
+ goto end;
+ }
+
+ gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+ gp->start = zvol_start;
+ gp->access = zvol_access;
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DEV_DIR, name);
+ pp->mediasize = volsize;
+ pp->sectorsize = DEV_BSIZE;
+
+ zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
+ (void) strcpy(zv->zv_name, name);
+ zv->zv_min_bs = DEV_BSHIFT;
+ zv->zv_provider = pp;
+ zv->zv_volsize = pp->mediasize;
+ zv->zv_objset = os;
+ zv->zv_mode = ds_mode;
+ zv->zv_zilog = zil_open(os, zvol_get_data);
+ mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
+ sizeof (rl_t), offsetof(rl_t, r_node));
+
+
+ /* get and cache the blocksize */
+ error = dmu_object_info(os, ZVOL_OBJ, &doi);
+ ASSERT(error == 0);
+ zv->zv_volblocksize = doi.doi_data_block_size;
+
+ zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
+
+ /* XXX this should handle the possible i/o error */
+ VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
+ "readonly", zvol_readonly_changed_cb, zv) == 0);
+
+ pp->private = zv;
+ g_error_provider(pp, 0);
+
+ bioq_init(&zv->zv_queue);
+ mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
+ zv->zv_state = 0;
+ kthread_create(zvol_worker, zv, NULL, 0, 0, "zvol:worker %s", pp->name);
+
+ zvol_minors++;
+end:
+ g_topology_unlock();
+ PICKUP_GIANT();
+
+ return (error);
+}
+
+/*
+ * Remove minor node for the specified volume.
+ */
+int
+zvol_remove_minor(const char *name)
+{
+ struct g_provider *pp;
+ zvol_state_t *zv;
+ int error = 0;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ if ((zv = zvol_minor_lookup(name)) == NULL) {
+ error = ENXIO;
+ goto end;
+ }
+
+ if (zv->zv_total_opens != 0) {
+ error = EBUSY;
+ goto end;
+ }
+
+ VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
+ "readonly", zvol_readonly_changed_cb, zv) == 0);
+
+ mtx_lock(&zv->zv_queue_mtx);
+ zv->zv_state = 1;
+ wakeup_one(&zv->zv_queue);
+ while (zv->zv_state != 2)
+ msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
+ mtx_unlock(&zv->zv_queue_mtx);
+ mtx_destroy(&zv->zv_queue_mtx);
+
+ pp = zv->zv_provider;
+ pp->private = NULL;
+ g_wither_geom(pp->geom, ENXIO);
+
+ zil_close(zv->zv_zilog);
+ zv->zv_zilog = NULL;
+ dmu_objset_close(zv->zv_objset);
+ zv->zv_objset = NULL;
+ avl_destroy(&zv->zv_znode.z_range_avl);
+ mutex_destroy(&zv->zv_znode.z_range_lock);
+
+ kmem_free(zv, sizeof(*zv));
+
+ zvol_minors--;
+end:
+ g_topology_unlock();
+ PICKUP_GIANT();
+
+ return (error);
+}
+
+int
+zvol_set_volsize(const char *name, dev_t dev, uint64_t volsize)
+{
+ zvol_state_t *zv;
+ dmu_tx_t *tx;
+ int error;
+ dmu_object_info_t doi;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ if ((zv = zvol_minor_lookup(name)) == NULL) {
+ error = ENXIO;
+ goto end;
+ }
+
+ if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
+ (error = zvol_check_volsize(volsize,
+ doi.doi_data_block_size)) != 0) {
+ goto end;
+ }
+
+ if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
+ error = EROFS;
+ goto end;
+ }
+
+ tx = dmu_tx_create(zv->zv_objset);
+ dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+ dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto end;
+ }
+
+ error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
+ &volsize, tx);
+ if (error == 0) {
+ error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize,
+ DMU_OBJECT_END, tx);
+ }
+
+ dmu_tx_commit(tx);
+
+ if (error == 0) {
+ zv->zv_volsize = volsize;
+ zv->zv_provider->mediasize = volsize; /* XXX: Not supported. */
+ }
+end:
+ g_topology_unlock();
+ PICKUP_GIANT();
+
+ return (error);
+}
+
+int
+zvol_set_volblocksize(const char *name, uint64_t volblocksize)
+{
+ zvol_state_t *zv;
+ dmu_tx_t *tx;
+ int error;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ if ((zv = zvol_minor_lookup(name)) == NULL) {
+ error = ENXIO;
+ goto end;
+ }
+
+ if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
+ error = EROFS;
+ goto end;
+ }
+
+ tx = dmu_tx_create(zv->zv_objset);
+ dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
+ volblocksize, 0, tx);
+ if (error == ENOTSUP)
+ error = EBUSY;
+ dmu_tx_commit(tx);
+ /* XXX: Not supported. */
+#if 0
+ if (error == 0)
+ zv->zv_provider->sectorsize = zc->zc_volblocksize;
+#endif
+ }
+end:
+ g_topology_unlock();
+ PICKUP_GIANT();
+
+ return (error);
+}
+
+void
+zvol_get_done(dmu_buf_t *db, void *vzgd)
+{
+ zgd_t *zgd = (zgd_t *)vzgd;
+ rl_t *rl = zgd->zgd_rl;
+
+ dmu_buf_rele(db, vzgd);
+ zfs_range_unlock(rl);
+ zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+static int
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+ zvol_state_t *zv = arg;
+ objset_t *os = zv->zv_objset;
+ dmu_buf_t *db;
+ rl_t *rl;
+ zgd_t *zgd;
+ uint64_t boff; /* block starting offset */
+ int dlen = lr->lr_length; /* length of user data */
+ int error;
+
+ ASSERT(zio);
+ ASSERT(dlen != 0);
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) /* immediate write */
+ return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf));
+
+ zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_zilog = zv->zv_zilog;
+ zgd->zgd_bp = &lr->lr_blkptr;
+
+ /*
+	 * Lock the range of the block to ensure that no other thread can
+	 * change the block while the data is being written out and its
+	 * checksum is being calculated.
+ */
+ boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
+ rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
+ RL_READER);
+ zgd->zgd_rl = rl;
+
+ VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
+ error = dmu_sync(zio, db, &lr->lr_blkptr,
+ lr->lr_common.lrc_txg, zvol_get_done, zgd);
+ if (error == 0)
+ zil_add_vdev(zv->zv_zilog,
+ DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
+ /*
+ * If we get EINPROGRESS, then we need to wait for a
+ * write IO initiated by dmu_sync() to complete before
+ * we can release this dbuf. We will finish everything
+ * up in the zvol_get_done() callback.
+ */
+ if (error == EINPROGRESS)
+ return (0);
+ dmu_buf_rele(db, zgd);
+ zfs_range_unlock(rl);
+ kmem_free(zgd, sizeof (zgd_t));
+ return (error);
+}
+
+int
+zvol_busy(void)
+{
+ return (zvol_minors != 0);
+}
+
+void
+zvol_init(void)
+{
+ ZFS_LOG(1, "ZVOL Initialized.");
+}
+
+void
+zvol_fini(void)
+{
+ ZFS_LOG(1, "ZVOL Deinitialized.");
+}
diff --git a/sys/contrib/opensolaris/uts/common/os/callb.c b/sys/contrib/opensolaris/uts/common/os/callb.c
new file mode 100644
index 000000000000..9bfae1307468
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/os/callb.c
@@ -0,0 +1,363 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/mutex.h>
+#include <sys/condvar.h>
+#include <sys/callb.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/kobj.h>
+#include <sys/systm.h> /* for delay() */
+#include <sys/taskq.h> /* For TASKQ_NAMELEN */
+#include <sys/kernel.h>
+
+#define CB_MAXNAME TASKQ_NAMELEN
+
+/*
+ * The callb mechanism provides generic event scheduling/echoing.
+ * A callb function is registered and called on behalf of the event.
+ */
+typedef struct callb {
+ struct callb *c_next; /* next in class or on freelist */
+ kthread_id_t c_thread; /* ptr to caller's thread struct */
+ char c_flag; /* info about the callb state */
+ uchar_t c_class; /* this callb's class */
+ kcondvar_t c_done_cv; /* signal callb completion */
+ boolean_t (*c_func)(); /* cb function: returns true if ok */
+ void *c_arg; /* arg to c_func */
+ char c_name[CB_MAXNAME+1]; /* debug:max func name length */
+} callb_t;
+
+/*
+ * callb c_flag bitmap definitions
+ */
+#define CALLB_FREE 0x0
+#define CALLB_TAKEN 0x1
+#define CALLB_EXECUTING 0x2
+
+/*
+ * Basic structure for a callb table.
+ * All callbs are organized into different class groups described
+ * by ct_class array.
+ * The callbs within a class are single-linked and normally run by a
+ * serial execution.
+ */
+typedef struct callb_table {
+ kmutex_t ct_lock; /* protect all callb states */
+ callb_t *ct_freelist; /* free callb structures */
+ int ct_busy; /* != 0 prevents additions */
+ kcondvar_t ct_busy_cv; /* to wait for not busy */
+ int ct_ncallb; /* num of callbs allocated */
+ callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */
+} callb_table_t;
+
+int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC;
+
+static callb_id_t callb_add_common(boolean_t (*)(void *, int),
+ void *, int, char *, kthread_id_t);
+
+static callb_table_t callb_table; /* system level callback table */
+static callb_table_t *ct = &callb_table;
+static kmutex_t callb_safe_mutex;
+callb_cpr_t callb_cprinfo_safe = {
+ &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, 0, 0 };
+
+/*
+ * Init all callb tables in the system.
+ */
+void
+callb_init(void *dummy __unused)
+{
+ callb_table.ct_busy = 0; /* mark table open for additions */
+ mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+callb_fini(void *dummy __unused)
+{
+ callb_t *cp;
+
+ mutex_enter(&ct->ct_lock);
+ while ((cp = ct->ct_freelist) != NULL) {
+ ct->ct_freelist = cp->c_next;
+ ct->ct_ncallb--;
+ kmem_free(cp, sizeof (callb_t));
+ }
+ ASSERT(ct->ct_ncallb == 0);
+ mutex_exit(&ct->ct_lock);
+ mutex_destroy(&callb_safe_mutex);
+ mutex_destroy(&callb_table.ct_lock);
+}
+
+/*
+ * callb_add_common() is called to register func() to be called later.
+ */
+static callb_id_t
+callb_add_common(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name, kthread_id_t t)
+{
+ callb_t *cp;
+
+ ASSERT(class < NCBCLASS);
+
+ mutex_enter(&ct->ct_lock);
+ while (ct->ct_busy)
+ cv_wait(&ct->ct_busy_cv, &ct->ct_lock);
+ if ((cp = ct->ct_freelist) == NULL) {
+ ct->ct_ncallb++;
+ cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP);
+ }
+ ct->ct_freelist = cp->c_next;
+ cp->c_thread = t;
+ cp->c_func = func;
+ cp->c_arg = arg;
+ cp->c_class = (uchar_t)class;
+ cp->c_flag |= CALLB_TAKEN;
+#ifdef DEBUG
+ if (strlen(name) > CB_MAXNAME)
+ cmn_err(CE_WARN, "callb_add: name of callback function '%s' "
+ "too long -- truncated to %d chars",
+ name, CB_MAXNAME);
+#endif
+ (void) strncpy(cp->c_name, name, CB_MAXNAME);
+ cp->c_name[CB_MAXNAME] = '\0';
+
+ /*
+ * Insert the new callb at the head of its class list.
+ */
+ cp->c_next = ct->ct_first_cb[class];
+ ct->ct_first_cb[class] = cp;
+
+ mutex_exit(&ct->ct_lock);
+ return ((callb_id_t)cp);
+}
+
+/*
+ * The default function to add an entry to the callback table. Since
+ * it uses curthread as the thread identifier to store in the table,
+ * it should be used for the normal case of a thread which is calling
+ * to add ITSELF to the table.
+ */
+callb_id_t
+callb_add(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name)
+{
+ return (callb_add_common(func, arg, class, name, curthread));
+}
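
A rough usage sketch (editorial, not part of this commit): a kernel thread can register a callback for a class and remove it again before it exits. The class constant CB_CL_CPR_DAEMON is assumed to be provided by <sys/callb.h>.

static boolean_t
example_callb(void *arg, int code)
{
	/* react to the event; return B_TRUE to report success */
	return (B_TRUE);
}

static void
example_callb_usage(void)
{
	callb_id_t id;

	id = callb_add(example_callb, NULL, CB_CL_CPR_DAEMON, "example_callb");

	/* ... the thread does its work ... */

	(void) callb_delete(id);
}
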
+
+/*
+ * A special version of callb_add() above for use by threads which
+ * might be adding an entry to the table on behalf of some other
+ * thread (for example, one which is constructed but not yet running).
+ * In this version the thread id is an argument.
+ */
+callb_id_t
+callb_add_thread(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name, kthread_id_t t)
+{
+ return (callb_add_common(func, arg, class, name, t));
+}
+
+/*
+ * callb_delete() is called to remove an entry identified by id
+ * that was originally placed there by a call to callb_add().
+ * Returns -1 if it fails to delete the callb entry, otherwise returns 0.
+ */
+int
+callb_delete(callb_id_t id)
+{
+ callb_t **pp;
+ callb_t *me = (callb_t *)id;
+
+ mutex_enter(&ct->ct_lock);
+
+ for (;;) {
+ pp = &ct->ct_first_cb[me->c_class];
+ while (*pp != NULL && *pp != me)
+ pp = &(*pp)->c_next;
+
+#ifdef DEBUG
+ if (*pp != me) {
+ cmn_err(CE_WARN, "callb delete bogus entry 0x%p",
+ (void *)me);
+ mutex_exit(&ct->ct_lock);
+ return (-1);
+ }
+#endif /* DEBUG */
+
+ /*
+		 * It is not allowed to delete a callb while it is executing;
+		 * otherwise, callb_execute_class() will be confused.
+ */
+ if (!(me->c_flag & CALLB_EXECUTING))
+ break;
+
+ cv_wait(&me->c_done_cv, &ct->ct_lock);
+ }
+ /* relink the class list */
+ *pp = me->c_next;
+
+ /* clean up myself and return the free callb to the head of freelist */
+ me->c_flag = CALLB_FREE;
+ me->c_next = ct->ct_freelist;
+ ct->ct_freelist = me;
+
+ mutex_exit(&ct->ct_lock);
+ return (0);
+}
+
+/*
+ * class: execute all callbs registered in the given class;
+ * code: optional argument for the callb functions.
+ * return: = 0: success
+ * != 0: ptr to string supplied when callback was registered
+ */
+void *
+callb_execute_class(int class, int code)
+{
+ callb_t *cp;
+ void *ret = NULL;
+
+ ASSERT(class < NCBCLASS);
+
+ mutex_enter(&ct->ct_lock);
+
+ for (cp = ct->ct_first_cb[class];
+ cp != NULL && ret == 0; cp = cp->c_next) {
+ while (cp->c_flag & CALLB_EXECUTING)
+ cv_wait(&cp->c_done_cv, &ct->ct_lock);
+ /*
+		 * continue if the callb was deleted while we were sleeping
+ */
+ if (cp->c_flag == CALLB_FREE)
+ continue;
+ cp->c_flag |= CALLB_EXECUTING;
+
+#ifdef CALLB_DEBUG
+ printf("callb_execute: name=%s func=%p arg=%p\n",
+ cp->c_name, (void *)cp->c_func, (void *)cp->c_arg);
+#endif /* CALLB_DEBUG */
+
+ mutex_exit(&ct->ct_lock);
+ /* If callback function fails, pass back client's name */
+ if (!(*cp->c_func)(cp->c_arg, code))
+ ret = cp->c_name;
+ mutex_enter(&ct->ct_lock);
+
+ cp->c_flag &= ~CALLB_EXECUTING;
+ cv_broadcast(&cp->c_done_cv);
+ }
+ mutex_exit(&ct->ct_lock);
+ return (ret);
+}
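
Again as an editorial sketch only: the suspend/CPR code would drive every callback of a class as shown below, where a non-NULL return identifies the callback that refused the event. CB_CL_CPR_DAEMON is assumed to come from <sys/callb.h>; CB_CODE_CPR_CHKPT appears elsewhere in this file.

static void
example_execute_class(void)
{
	void *who;

	who = callb_execute_class(CB_CL_CPR_DAEMON, CB_CODE_CPR_CHKPT);
	if (who != NULL)
		cmn_err(CE_WARN, "callback '%s' did not stop", (char *)who);
}
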
+
+/*
+ * Callers must make sure there are no recursive entries to this function.
+ * dp->cc_lockp is registered by callb_add to protect callb_cpr_t structure.
+ *
+ * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we
+ * use a cv_timedwait() in case the kernel thread is blocked.
+ *
+ * Note that this is a generic callback handler for daemon CPR and
+ * should NOT be changed to accommodate any specific requirement in a daemon.
+ * Individual daemons that require changes to the handler shall write
+ * callback routines in their own daemon modules.
+ */
+boolean_t
+callb_generic_cpr(void *arg, int code)
+{
+ callb_cpr_t *cp = (callb_cpr_t *)arg;
+ clock_t ret = 0; /* assume success */
+
+ mutex_enter(cp->cc_lockp);
+
+ switch (code) {
+ case CB_CODE_CPR_CHKPT:
+ cp->cc_events |= CALLB_CPR_START;
+ while (!(cp->cc_events & CALLB_CPR_SAFE))
+ /* cv_timedwait() returns -1 if it times out. */
+ if ((ret = cv_timedwait(&cp->cc_callb_cv,
+ cp->cc_lockp,
+ callb_timeout_sec * hz)) == -1)
+ break;
+ break;
+
+ case CB_CODE_CPR_RESUME:
+ cp->cc_events &= ~CALLB_CPR_START;
+ cv_signal(&cp->cc_stop_cv);
+ break;
+ }
+ mutex_exit(cp->cc_lockp);
+ return (ret != -1);
+}
+
+/*
+ * The generic callback function associated with kernel threads which
+ * are always considered safe.
+ */
+/* ARGSUSED */
+boolean_t
+callb_generic_cpr_safe(void *arg, int code)
+{
+ return (B_TRUE);
+}
+/*
+ * Prevent additions to callback table.
+ */
+void
+callb_lock_table(void)
+{
+ mutex_enter(&ct->ct_lock);
+ ASSERT(ct->ct_busy == 0);
+ ct->ct_busy = 1;
+ mutex_exit(&ct->ct_lock);
+}
+
+/*
+ * Allow additions to callback table.
+ */
+void
+callb_unlock_table(void)
+{
+ mutex_enter(&ct->ct_lock);
+ ASSERT(ct->ct_busy != 0);
+ ct->ct_busy = 0;
+ cv_broadcast(&ct->ct_busy_cv);
+ mutex_exit(&ct->ct_lock);
+}
+
+SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
+SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
diff --git a/sys/contrib/opensolaris/uts/common/os/list.c b/sys/contrib/opensolaris/uts/common/os/list.c
new file mode 100644
index 000000000000..f9b6fcb230b8
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/os/list.c
@@ -0,0 +1,193 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Generic doubly-linked list implementation
+ */
+
+#include <sys/list.h>
+#include <sys/list_impl.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+
+#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset))
+#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset))
+#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head)
+
+#define list_insert_after_node(list, node, object) { \
+ list_node_t *lnew = list_d2l(list, object); \
+ lnew->list_prev = node; \
+ lnew->list_next = node->list_next; \
+ node->list_next->list_prev = lnew; \
+ node->list_next = lnew; \
+}
+
+#define list_insert_before_node(list, node, object) { \
+ list_node_t *lnew = list_d2l(list, object); \
+ lnew->list_next = node; \
+ lnew->list_prev = node->list_prev; \
+ node->list_prev->list_next = lnew; \
+ node->list_prev = lnew; \
+}
+
+void
+list_create(list_t *list, size_t size, size_t offset)
+{
+ ASSERT(list);
+ ASSERT(size > 0);
+ ASSERT(size >= offset + sizeof (list_node_t));
+
+ list->list_size = size;
+ list->list_offset = offset;
+ list->list_head.list_next = list->list_head.list_prev =
+ &list->list_head;
+}
+
+void
+list_destroy(list_t *list)
+{
+ list_node_t *node = &list->list_head;
+
+ ASSERT(list);
+ ASSERT(list->list_head.list_next == node);
+ ASSERT(list->list_head.list_prev == node);
+
+ node->list_next = node->list_prev = NULL;
+}
+
+void
+list_insert_after(list_t *list, void *object, void *nobject)
+{
+ list_node_t *lold = list_d2l(list, object);
+ list_insert_after_node(list, lold, nobject);
+}
+
+void
+list_insert_before(list_t *list, void *object, void *nobject)
+{
+ list_node_t *lold = list_d2l(list, object);
+	list_insert_before_node(list, lold, nobject);
+}
+
+void
+list_insert_head(list_t *list, void *object)
+{
+ list_node_t *lold = &list->list_head;
+ list_insert_after_node(list, lold, object);
+}
+
+void
+list_insert_tail(list_t *list, void *object)
+{
+ list_node_t *lold = &list->list_head;
+ list_insert_before_node(list, lold, object);
+}
+
+void
+list_remove(list_t *list, void *object)
+{
+ list_node_t *lold = list_d2l(list, object);
+ ASSERT(!list_empty(list));
+ lold->list_prev->list_next = lold->list_next;
+ lold->list_next->list_prev = lold->list_prev;
+ lold->list_next = lold->list_prev = NULL;
+}
+
+void *
+list_head(list_t *list)
+{
+ if (list_empty(list))
+ return (NULL);
+ return (list_object(list, list->list_head.list_next));
+}
+
+void *
+list_tail(list_t *list)
+{
+ if (list_empty(list))
+ return (NULL);
+ return (list_object(list, list->list_head.list_prev));
+}
+
+void *
+list_next(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->list_next != &list->list_head)
+ return (list_object(list, node->list_next));
+
+ return (NULL);
+}
+
+void *
+list_prev(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->list_prev != &list->list_head)
+ return (list_object(list, node->list_prev));
+
+ return (NULL);
+}
+
+/*
+ * Insert src list after dst list. Empty src list thereafter.
+ */
+void
+list_move_tail(list_t *dst, list_t *src)
+{
+ list_node_t *dstnode = &dst->list_head;
+ list_node_t *srcnode = &src->list_head;
+
+ ASSERT(dst->list_size == src->list_size);
+ ASSERT(dst->list_offset == src->list_offset);
+
+ if (list_empty(src))
+ return;
+
+ dstnode->list_prev->list_next = srcnode->list_next;
+ srcnode->list_next->list_prev = dstnode->list_prev;
+ dstnode->list_prev = srcnode->list_prev;
+ srcnode->list_prev->list_next = dstnode;
+
+ /* empty src list */
+ srcnode->list_next = srcnode->list_prev = srcnode;
+}
+
+int
+list_link_active(list_node_t *link)
+{
+ return (link->list_next != NULL);
+}
+
+int
+list_is_empty(list_t *list)
+{
+ return (list_empty(list));
+}
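
To show how this list API is meant to be consumed (an editor's sketch, not part of the commit): the owning structure embeds a list_node_t and passes its offset to list_create(), after which objects can be appended and iterated. The example_item type below is hypothetical.

typedef struct example_item {
	int		ei_value;
	list_node_t	ei_link;	/* embedded linkage node */
} example_item_t;

static void
example_list_usage(void)
{
	list_t list;
	example_item_t a, b, *ip;

	list_create(&list, sizeof (example_item_t),
	    offsetof(example_item_t, ei_link));

	a.ei_value = 1;
	b.ei_value = 2;
	list_insert_tail(&list, &a);
	list_insert_tail(&list, &b);

	for (ip = list_head(&list); ip != NULL; ip = list_next(&list, ip)) {
		/* visit ip->ei_value in insertion order */
	}

	/* list_destroy() expects an empty list */
	list_remove(&list, &a);
	list_remove(&list, &b);
	list_destroy(&list);
}
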
diff --git a/sys/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c b/sys/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
new file mode 100644
index 000000000000..3682853de902
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/nvpair.h>
+
+static void *
+nv_alloc_sys(nv_alloc_t *nva, size_t size)
+{
+ return (kmem_alloc(size, (int)(uintptr_t)nva->nva_arg));
+}
+
+/*ARGSUSED*/
+static void
+nv_free_sys(nv_alloc_t *nva, void *buf, size_t size)
+{
+ kmem_free(buf, size);
+}
+
+static const nv_alloc_ops_t system_ops = {
+ NULL, /* nv_ao_init() */
+ NULL, /* nv_ao_fini() */
+ nv_alloc_sys, /* nv_ao_alloc() */
+ nv_free_sys, /* nv_ao_free() */
+ NULL /* nv_ao_reset() */
+};
+
+nv_alloc_t nv_alloc_sleep_def = {
+ &system_ops,
+ (void *)KM_SLEEP
+};
+
+nv_alloc_t nv_alloc_nosleep_def = {
+ &system_ops,
+ (void *)KM_NOSLEEP
+};
+
+nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def;
+nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def;
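
These two pre-defined allocators are what in-kernel nvlist users would normally hand to the extended nvlist constructors. The sketch below is editorial and assumes the nvlist_xalloc() interface and NV_UNIQUE_NAME flag declared in <sys/nvpair.h>.

static int
example_alloc_nvlist(nvlist_t **nvlp)
{
	/* Build an nvlist backed by the sleeping kmem allocator. */
	return (nvlist_xalloc(nvlp, NV_UNIQUE_NAME, nv_alloc_sleep));
}
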
diff --git a/sys/contrib/opensolaris/uts/common/os/taskq.c b/sys/contrib/opensolaris/uts/common/os/taskq.c
new file mode 100644
index 000000000000..efaea2ec450c
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/os/taskq.c
@@ -0,0 +1,1020 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Kernel task queues: general-purpose asynchronous task scheduling.
+ *
+ * A common problem in kernel programming is the need to schedule tasks
+ * to be performed later, by another thread. There are several reasons
+ * you may want or need to do this:
+ *
+ * (1) The task isn't time-critical, but your current code path is.
+ *
+ * (2) The task may require grabbing locks that you already hold.
+ *
+ * (3) The task may need to block (e.g. to wait for memory), but you
+ * cannot block in your current context.
+ *
+ * (4) Your code path can't complete because of some condition, but you can't
+ * sleep or fail, so you queue the task for later execution when condition
+ * disappears.
+ *
+ * (5) You just want a simple way to launch multiple tasks in parallel.
+ *
+ * Task queues provide such a facility. In its simplest form (used when
+ * performance is not a critical consideration) a task queue consists of a
+ * single list of tasks, together with one or more threads to service the
+ * list. There are some cases when this simple queue is not sufficient:
+ *
+ * (1) The task queues are very hot and there is a need to avoid data and lock
+ * contention over global resources.
+ *
+ * (2) Some tasks may depend on other tasks to complete, so they can't be put in
+ * the same list managed by the same thread.
+ *
+ * (3) Some tasks may block for a long time, and this should not block other
+ * tasks in the queue.
+ *
+ * To provide useful service in such cases we define a "dynamic task queue"
+ * which has an individual thread for each of the tasks. These threads are
+ * dynamically created as they are needed and destroyed when they are not in
+ * use. The API for managing task pools is the same as for managing task queues
+ * with the exception of a taskq creation flag TASKQ_DYNAMIC which tells that
+ * dynamic task pool behavior is desired.
+ *
+ * Dynamic task queues may also place tasks in the normal queue (called "backing
+ * queue") when task pool runs out of resources. Users of task queues may
+ * disallow such queued scheduling by specifying TQ_NOQUEUE in the dispatch
+ * flags.
+ *
+ * The backing task queue is also used for scheduling internal tasks needed for
+ * dynamic task queue maintenance.
+ *
+ * INTERFACES:
+ *
+ *	taskq_t *taskq_create(name, nthreads, pri_t pri, minalloc, maxalloc, flags);
+ *
+ * Create a taskq with specified properties.
+ * Possible 'flags':
+ *
+ * TASKQ_DYNAMIC: Create task pool for task management. If this flag is
+ * specified, 'nthreads' specifies the maximum number of threads in
+ * the task queue. Task execution order for dynamic task queues is
+ * not predictable.
+ *
+ * If this flag is not specified (default case) a
+ * single-list task queue is created with 'nthreads' threads
+ * servicing it. Entries in this queue are managed by
+ * taskq_ent_alloc() and taskq_ent_free() which try to keep the
+ * task population between 'minalloc' and 'maxalloc', but the
+ * latter limit is only advisory for TQ_SLEEP dispatches and the
+ * former limit is only advisory for TQ_NOALLOC dispatches. If
+ * TASKQ_PREPOPULATE is set in 'flags', the taskq will be
+ * prepopulated with 'minalloc' task structures.
+ *
+ * Since non-DYNAMIC taskqs are queues, tasks are guaranteed to be
+ * executed in the order they are scheduled if nthreads == 1.
+ * If nthreads > 1, task execution order is not predictable.
+ *
+ * TASKQ_PREPOPULATE: Prepopulate task queue with threads.
+ * Also prepopulate the task queue with 'minalloc' task structures.
+ *
+ * TASKQ_CPR_SAFE: This flag specifies that users of the task queue will
+ * use their own protocol for handling CPR issues. This flag is not
+ * supported for DYNAMIC task queues.
+ *
+ * The 'pri' field specifies the default priority for the threads that
+ * service all scheduled tasks.
+ *
+ * void taskq_destroy(tap):
+ *
+ * Waits for any scheduled tasks to complete, then destroys the taskq.
+ * Caller should guarantee that no new tasks are scheduled in the closing
+ * taskq.
+ *
+ * taskqid_t taskq_dispatch(tq, func, arg, flags):
+ *
+ * Dispatches the task "func(arg)" to taskq. The 'flags' indicates whether
+ * the caller is willing to block for memory. The function returns an
+ * opaque value which is zero iff dispatch fails. If flags is TQ_NOSLEEP
+ * or TQ_NOALLOC and the task can't be dispatched, taskq_dispatch() fails
+ * and returns (taskqid_t)0.
+ *
+ * ASSUMES: func != NULL.
+ *
+ * Possible flags:
+ * TQ_NOSLEEP: Do not wait for resources; may fail.
+ *
+ * TQ_NOALLOC: Do not allocate memory; may fail. May only be used with
+ * non-dynamic task queues.
+ *
+ *	TQ_NOQUEUE: Do not enqueue the task if it can't be dispatched due to
+ *		lack of available resources; fail instead. If this flag is not
+ * set, and the task pool is exhausted, the task may be scheduled
+ * in the backing queue. This flag may ONLY be used with dynamic
+ * task queues.
+ *
+ * NOTE: This flag should always be used when a task queue is used
+ * for tasks that may depend on each other for completion.
+ * Enqueueing dependent tasks may create deadlocks.
+ *
+ * TQ_SLEEP: May block waiting for resources. May still fail for
+ * dynamic task queues if TQ_NOQUEUE is also specified, otherwise
+ * always succeed.
+ *
+ * NOTE: Dynamic task queues are much more likely to fail in
+ * taskq_dispatch() (especially if TQ_NOQUEUE was specified), so it
+ * is important to have backup strategies handling such failures.
+ *
+ * void taskq_wait(tq):
+ *
+ * Waits for all previously scheduled tasks to complete.
+ *
+ * NOTE: It does not stop any new task dispatches.
+ * Do NOT call taskq_wait() from a task: it will cause deadlock.
+ *
+ * void taskq_suspend(tq)
+ *
+ * Suspend all task execution. Tasks already scheduled for a dynamic task
+ * queue will still be executed, but all new scheduled tasks will be
+ * suspended until taskq_resume() is called.
+ *
+ * int taskq_suspended(tq)
+ *
+ * Returns 1 if taskq is suspended and 0 otherwise. It is intended to
+ * ASSERT that the task queue is suspended.
+ *
+ * void taskq_resume(tq)
+ *
+ * Resume task queue execution.
+ *
+ * int taskq_member(tq, thread)
+ *
+ * Returns 1 if 'thread' belongs to taskq 'tq' and 0 otherwise. The
+ * intended use is to ASSERT that a given function is called in taskq
+ * context only.
+ *
+ * system_taskq
+ *
+ * Global system-wide dynamic task queue for common uses. It may be used by
+ * any subsystem that needs to schedule tasks and does not need to manage
+ * its own task queues. It is initialized quite early during system boot.
+ *
+ * IMPLEMENTATION.
+ *
+ * This is schematic representation of the task queue structures.
+ *
+ * taskq:
+ * +-------------+
+ * |tq_lock | +---< taskq_ent_free()
+ * +-------------+ |
+ * |... | | tqent: tqent:
+ * +-------------+ | +------------+ +------------+
+ * | tq_freelist |-->| tqent_next |--> ... ->| tqent_next |
+ * +-------------+ +------------+ +------------+
+ * |... | | ... | | ... |
+ * +-------------+ +------------+ +------------+
+ * | tq_task | |
+ * | | +-------------->taskq_ent_alloc()
+ * +--------------------------------------------------------------------------+
+ * | | | tqent tqent |
+ * | +---------------------+ +--> +------------+ +--> +------------+ |
+ * | | ... | | | func, arg | | | func, arg | |
+ * +>+---------------------+ <---|-+ +------------+ <---|-+ +------------+ |
+ * | tq_taskq.tqent_next | ----+ | | tqent_next | --->+ | | tqent_next |--+
+ * +---------------------+ | +------------+ ^ | +------------+
+ * +-| tq_task.tqent_prev | +--| tqent_prev | | +--| tqent_prev | ^
+ * | +---------------------+ +------------+ | +------------+ |
+ * | |... | | ... | | | ... | |
+ * | +---------------------+ +------------+ | +------------+ |
+ * | ^ | |
+ * | | | |
+ * +--------------------------------------+--------------+ TQ_APPEND() -+
+ * | | |
+ * |... | taskq_thread()-----+
+ * +-------------+
+ * | tq_buckets |--+-------> [ NULL ] (for regular task queues)
+ * +-------------+ |
+ * | DYNAMIC TASK QUEUES:
+ * |
+ * +-> taskq_bucket[nCPU] taskq_bucket_dispatch()
+ * +-------------------+ ^
+ * +--->| tqbucket_lock | |
+ * | +-------------------+ +--------+ +--------+
+ * | | tqbucket_freelist |-->| tqent |-->...| tqent | ^
+ * | +-------------------+<--+--------+<--...+--------+ |
+ * | | ... | | thread | | thread | |
+ * | +-------------------+ +--------+ +--------+ |
+ * | +-------------------+ |
+ * taskq_dispatch()--+--->| tqbucket_lock | TQ_APPEND()------+
+ * TQ_HASH() | +-------------------+ +--------+ +--------+
+ * | | tqbucket_freelist |-->| tqent |-->...| tqent |
+ * | +-------------------+<--+--------+<--...+--------+
+ * | | ... | | thread | | thread |
+ * | +-------------------+ +--------+ +--------+
+ * +---> ...
+ *
+ *
+ * Task queues use tq_task field to link new entry in the queue. The queue is a
+ * circular doubly-linked list. Entries are put in the end of the list with
+ * TQ_APPEND() and processed from the front of the list by taskq_thread() in
+ * FIFO order. Task queue entries are cached in the free list managed by
+ * taskq_ent_alloc() and taskq_ent_free() functions.
+ *
+ * All threads used by task queues mark t_taskq field of the thread to
+ * point to the task queue.
+ *
+ * Dynamic Task Queues Implementation.
+ *
+ * For dynamic task queues there is a 1-to-1 mapping between a thread and a
+ * taskq_ent structure. Each entry is serviced by its own thread and each thread
+ * is controlled by a single entry.
+ *
+ * Entries are distributed over a set of buckets. To avoid modulo
+ * arithmetic the number of buckets is 2^n, determined as the number of CPUs
+ * in the system rounded down to the nearest power of two. The tunable
+ * variable 'taskq_maxbuckets' limits the maximum number of buckets. Each entry
+ * is attached to a bucket for its lifetime and can't migrate to other buckets.
+ *
+ * Entries that have scheduled tasks are not placed in any list. The dispatch
+ * function sets their "func" and "arg" fields and signals the corresponding
+ * thread to execute the task. Once the thread executes the task it clears the
+ * "func" field and places an entry on the bucket cache of free entries pointed
+ * by "tqbucket_freelist" field. ALL entries on the free list should have "func"
+ * field equal to NULL. The free list is a circular doubly-linked list identical
+ * in structure to the tq_task list above, but entries are taken from it in LIFO
+ * order - the last freed entry is the first to be allocated. The
+ * taskq_bucket_dispatch() function gets the most recently used entry from the
+ * free list, sets its "func" and "arg" fields and signals a worker thread.
+ *
+ * After executing each task a per-entry thread taskq_d_thread() places its
+ * entry on the bucket free list and goes to a timed sleep. If it wakes up
+ * without getting new task it removes the entry from the free list and destroys
+ * itself. The thread sleep time is controlled by a tunable variable
+ * `taskq_thread_timeout'.
+ *
+ * Various statistics are kept in the bucket, which allows for later
+ * analysis of taskq usage patterns. Also, a global copy of taskq creation and
+ * death statistics is kept in the global taskq data structure. Since thread
+ * creation and death happen rarely, updating such global data does not present
+ * a performance problem.
+ *
+ * NOTE: Threads are not bound to any CPU and there is absolutely no association
+ * between the bucket and actual thread CPU, so buckets are used only to
+ * split resources and reduce resource contention. Having threads attached
+ * to the CPU denoted by a bucket may reduce number of times the job
+ * switches between CPUs.
+ *
+ * Current algorithm creates a thread whenever a bucket has no free
+ * entries. It would be nice to know how many threads are in the running
+ * state and don't create threads if all CPUs are busy with existing
+ * tasks, but it is unclear how such strategy can be implemented.
+ *
+ * Currently buckets are created statically as an array attached to task
+ * queue. On some system with nCPUs < max_ncpus it may waste system
+ * memory. One solution may be allocation of buckets when they are first
+ * touched, but it is not clear how useful it is.
+ *
+ * SUSPEND/RESUME implementation.
+ *
+ * Before executing a task taskq_thread() (executing non-dynamic task
+ * queues) obtains taskq's thread lock as a reader. The taskq_suspend()
+ * function gets the same lock as a writer blocking all non-dynamic task
+ * execution. The taskq_resume() function releases the lock allowing
+ * taskq_thread to continue execution.
+ *
+ * For dynamic task queues, each bucket is marked as TQBUCKET_SUSPEND by
+ * taskq_suspend() function. After that taskq_bucket_dispatch() always
+ * fails, so that taskq_dispatch() will either enqueue tasks for a
+ * suspended backing queue or fail if TQ_NOQUEUE is specified in dispatch
+ * flags.
+ *
+ * NOTE: taskq_suspend() does not immediately block any tasks already
+ * scheduled for dynamic task queues. It only suspends new tasks
+ * scheduled after taskq_suspend() was called.
+ *
+ * taskq_member() function works by comparing a thread t_taskq pointer with
+ * the passed thread pointer.
+ *
+ * LOCKS and LOCK Hierarchy:
+ *
+ * There are two locks used in task queues.
+ *
+ * 1) Task queue structure has a lock, protecting global task queue state.
+ *
+ * 2) Each per-CPU bucket has a lock for bucket management.
+ *
+ * If both locks are needed, task queue lock should be taken only after bucket
+ * lock.
+ *
+ * DEBUG FACILITIES.
+ *
+ * For DEBUG kernels it is possible to induce random failures to
+ * taskq_dispatch() function when it is given TQ_NOSLEEP argument. The value of
+ * taskq_dmtbf and taskq_smtbf tunables control the mean time between induced
+ * failures for dynamic and static task queues respectively.
+ *
+ * Setting TASKQ_STATISTIC to 0 will disable per-bucket statistics.
+ *
+ * TUNABLES
+ *
+ * system_taskq_size - Size of the global system_taskq.
+ * This value is multiplied by nCPUs to determine
+ * actual size.
+ * Default value: 64
+ *
+ * taskq_thread_timeout - Maximum idle time for taskq_d_thread()
+ * Default value: 5 minutes
+ *
+ * taskq_maxbuckets - Maximum number of buckets in any task queue
+ * Default value: 128
+ *
+ * taskq_search_depth - Maximum # of buckets searched for a free entry
+ * Default value: 4
+ *
+ * taskq_dmtbf - Mean time between induced dispatch failures
+ * for dynamic task queues.
+ * Default value: UINT_MAX (no induced failures)
+ *
+ * taskq_smtbf - Mean time between induced dispatch failures
+ * for static task queues.
+ * Default value: UINT_MAX (no induced failures)
+ *
+ * CONDITIONAL compilation.
+ *
+ * TASKQ_STATISTIC - If set will enable bucket statistic (default).
+ *
+ */
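
To make the interface summary above concrete, here is a minimal editorial sketch of the common non-dynamic pattern: create a queue, dispatch work with TQ_SLEEP, drain it, and destroy it. The priority symbol minclsyspri is an assumption (expected from the surrounding compatibility headers), and the queue name and sizes are arbitrary.

static void
example_work(void *arg)
{
	/* the deferred work runs here, in taskq thread context */
}

static void
example_taskq_usage(void)
{
	taskq_t *tq;

	/* 4 threads, minalloc 4, maxalloc 16, prepopulated entry cache */
	tq = taskq_create("example_tq", 4, minclsyspri, 4, 16,
	    TASKQ_PREPOPULATE);

	/* TQ_SLEEP dispatches on a non-dynamic queue do not fail */
	(void) taskq_dispatch(tq, example_work, NULL, TQ_SLEEP);

	taskq_wait(tq);		/* wait for all dispatched tasks to finish */
	taskq_destroy(tq);
}
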
+
+#include <sys/taskq_impl.h>
+#include <sys/proc.h>
+#include <sys/kmem.h>
+#include <sys/callb.h>
+#include <sys/systm.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/sdt.h>
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+
+static kmem_cache_t *taskq_ent_cache, *taskq_cache;
+
+/* Global system task queue for common use */
+taskq_t *system_taskq;
+
+/*
+ * Maximum number of entries in the global system taskq is
+ * system_taskq_size * max_ncpus
+ */
+#define SYSTEM_TASKQ_SIZE 64
+int system_taskq_size = SYSTEM_TASKQ_SIZE;
+
+/*
+ * Dynamic task queue threads that don't get any work within
+ * taskq_thread_timeout destroy themselves
+ */
+#define TASKQ_THREAD_TIMEOUT (60 * 5)
+int taskq_thread_timeout = TASKQ_THREAD_TIMEOUT;
+
+#define TASKQ_MAXBUCKETS 128
+int taskq_maxbuckets = TASKQ_MAXBUCKETS;
+
+/*
+ * When a bucket has no available entries, other buckets are tried.
+ * The taskq_search_depth parameter limits the number of buckets that we search
+ * before failing. This is mostly useful in systems with many CPUs where we may
+ * spend too much time scanning busy buckets.
+ */
+#define TASKQ_SEARCH_DEPTH 4
+int taskq_search_depth = TASKQ_SEARCH_DEPTH;
+
+/*
+ * Hashing function: mix various bits of x. May be pretty much anything.
+ */
+#define TQ_HASH(x) ((x) ^ ((x) >> 11) ^ ((x) >> 17) ^ ((x) ^ 27))
+
+/*
+ * We do not create any new threads when the system is low on memory and start
+ * throttling memory allocations. The following macro tries to estimate such
+ * condition.
+ */
+#define ENOUGH_MEMORY() (freemem > throttlefree)
+
+/*
+ * Static functions.
+ */
+static taskq_t *taskq_create_common(const char *, int, int, pri_t, int,
+ int, uint_t);
+static void taskq_thread(void *);
+static int taskq_constructor(void *, void *, int);
+static void taskq_destructor(void *, void *);
+static int taskq_ent_constructor(void *, void *, int);
+static void taskq_ent_destructor(void *, void *);
+static taskq_ent_t *taskq_ent_alloc(taskq_t *, int);
+static void taskq_ent_free(taskq_t *, taskq_ent_t *);
+
+/*
+ * Collect per-bucket statistic when TASKQ_STATISTIC is defined.
+ */
+#define TASKQ_STATISTIC 1
+
+#if TASKQ_STATISTIC
+#define TQ_STAT(b, x) b->tqbucket_stat.x++
+#else
+#define TQ_STAT(b, x)
+#endif
+
+/*
+ * Random fault injection.
+ */
+uint_t taskq_random;
+uint_t taskq_dmtbf = UINT_MAX; /* mean time between injected failures */
+uint_t taskq_smtbf = UINT_MAX; /* mean time between injected failures */
+
+/*
+ * TQ_NOSLEEP dispatches on dynamic task queues are always allowed to fail.
+ *
+ * TQ_NOSLEEP dispatches on static task queues can't arbitrarily fail because
+ * they can prepopulate the cache and make sure that they do not use more
+ * than minalloc entries. So, fault injection in this case ensures that
+ * either TASKQ_PREPOPULATE is not set or there are more entries allocated
+ * than is specified by minalloc. TQ_NOALLOC dispatches are always allowed
+ * to fail, but for simplicity we treat them identically to TQ_NOSLEEP
+ * dispatches.
+ */
+#ifdef DEBUG
+#define TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flag) \
+ taskq_random = (taskq_random * 2416 + 374441) % 1771875;\
+ if ((flag & TQ_NOSLEEP) && \
+ taskq_random < 1771875 / taskq_dmtbf) { \
+ return (NULL); \
+ }
+
+#define TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flag) \
+ taskq_random = (taskq_random * 2416 + 374441) % 1771875;\
+ if ((flag & (TQ_NOSLEEP | TQ_NOALLOC)) && \
+ (!(tq->tq_flags & TASKQ_PREPOPULATE) || \
+ (tq->tq_nalloc > tq->tq_minalloc)) && \
+ (taskq_random < (1771875 / taskq_smtbf))) { \
+ mutex_exit(&tq->tq_lock); \
+ return ((taskqid_t)0); \
+ }
+#else
+#define TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flag)
+#define TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flag)
+#endif
+
+#define IS_EMPTY(l) (((l).tqent_prev == (l).tqent_next) && \
+ ((l).tqent_prev == &(l)))
+
+/*
+ * Append `tqe' to the end of the doubly-linked list denoted by l.
+ */
+#define TQ_APPEND(l, tqe) { \
+ tqe->tqent_next = &l; \
+ tqe->tqent_prev = l.tqent_prev; \
+ tqe->tqent_next->tqent_prev = tqe; \
+ tqe->tqent_prev->tqent_next = tqe; \
+}
+
+/*
+ * Schedule a task specified by func and arg into the task queue entry tqe.
+ */
+#define TQ_ENQUEUE(tq, tqe, func, arg) { \
+ ASSERT(MUTEX_HELD(&tq->tq_lock)); \
+ TQ_APPEND(tq->tq_task, tqe); \
+ tqe->tqent_func = (func); \
+ tqe->tqent_arg = (arg); \
+ tq->tq_tasks++; \
+ if (tq->tq_tasks - tq->tq_executed > tq->tq_maxtasks) \
+ tq->tq_maxtasks = tq->tq_tasks - tq->tq_executed; \
+ cv_signal(&tq->tq_dispatch_cv); \
+ DTRACE_PROBE2(taskq__enqueue, taskq_t *, tq, taskq_ent_t *, tqe); \
+}
+
+/*
+ * Do-nothing task which may be used to prepopulate thread caches.
+ */
+/*ARGSUSED*/
+void
+nulltask(void *unused)
+{
+}
+
+
+/*ARGSUSED*/
+static int
+taskq_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ taskq_t *tq = buf;
+
+ bzero(tq, sizeof (taskq_t));
+
+ mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL);
+ cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
+
+ tq->tq_task.tqent_next = &tq->tq_task;
+ tq->tq_task.tqent_prev = &tq->tq_task;
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+taskq_destructor(void *buf, void *cdrarg)
+{
+ taskq_t *tq = buf;
+
+ mutex_destroy(&tq->tq_lock);
+ rw_destroy(&tq->tq_threadlock);
+ cv_destroy(&tq->tq_dispatch_cv);
+ cv_destroy(&tq->tq_wait_cv);
+}
+
+/*ARGSUSED*/
+static int
+taskq_ent_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ taskq_ent_t *tqe = buf;
+
+ tqe->tqent_thread = NULL;
+ cv_init(&tqe->tqent_cv, NULL, CV_DEFAULT, NULL);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+taskq_ent_destructor(void *buf, void *cdrarg)
+{
+ taskq_ent_t *tqe = buf;
+
+ ASSERT(tqe->tqent_thread == NULL);
+ cv_destroy(&tqe->tqent_cv);
+}
+
+/*
+ * Create global system dynamic task queue.
+ */
+void
+system_taskq_init(void)
+{
+ system_taskq = taskq_create_common("system_taskq", 0,
+ system_taskq_size * max_ncpus, minclsyspri, 4, 512,
+ TASKQ_PREPOPULATE);
+}
+
+void
+system_taskq_fini(void)
+{
+ taskq_destroy(system_taskq);
+}
+
+static void
+taskq_init(void *dummy __unused)
+{
+ taskq_ent_cache = kmem_cache_create("taskq_ent_cache",
+ sizeof (taskq_ent_t), 0, taskq_ent_constructor,
+ taskq_ent_destructor, NULL, NULL, NULL, 0);
+ taskq_cache = kmem_cache_create("taskq_cache", sizeof (taskq_t),
+ 0, taskq_constructor, taskq_destructor, NULL, NULL, NULL, 0);
+ system_taskq_init();
+}
+
+static void
+taskq_fini(void *dummy __unused)
+{
+ system_taskq_fini();
+ kmem_cache_destroy(taskq_cache);
+ kmem_cache_destroy(taskq_ent_cache);
+}
+
+/*
+ * taskq_ent_alloc()
+ *
+ * Allocates a new taskq_ent_t structure either from the free list or from the
+ * cache. Returns NULL if it can't be allocated.
+ *
+ * Assumes: tq->tq_lock is held.
+ */
+static taskq_ent_t *
+taskq_ent_alloc(taskq_t *tq, int flags)
+{
+ int kmflags = (flags & TQ_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
+
+ taskq_ent_t *tqe;
+
+ ASSERT(MUTEX_HELD(&tq->tq_lock));
+
+ /*
+ * TQ_NOALLOC allocations are allowed to use the freelist, even if
+ * we are below tq_minalloc.
+ */
+ if ((tqe = tq->tq_freelist) != NULL &&
+ ((flags & TQ_NOALLOC) || tq->tq_nalloc >= tq->tq_minalloc)) {
+ tq->tq_freelist = tqe->tqent_next;
+ } else {
+ if (flags & TQ_NOALLOC)
+ return (NULL);
+
+ mutex_exit(&tq->tq_lock);
+ if (tq->tq_nalloc >= tq->tq_maxalloc) {
+ if (kmflags & KM_NOSLEEP) {
+ mutex_enter(&tq->tq_lock);
+ return (NULL);
+ }
+ /*
+ * We don't want to exceed tq_maxalloc, but we can't
+ * wait for other tasks to complete (and thus free up
+ * task structures) without risking deadlock with
+ * the caller. So, we just delay for one second
+ * to throttle the allocation rate.
+ */
+ delay(hz);
+ }
+ tqe = kmem_cache_alloc(taskq_ent_cache, kmflags);
+ mutex_enter(&tq->tq_lock);
+ if (tqe != NULL)
+ tq->tq_nalloc++;
+ }
+ return (tqe);
+}
+
+/*
+ * taskq_ent_free()
+ *
+ * Free taskq_ent_t structure by either putting it on the free list or freeing
+ * it to the cache.
+ *
+ * Assumes: tq->tq_lock is held.
+ */
+static void
+taskq_ent_free(taskq_t *tq, taskq_ent_t *tqe)
+{
+ ASSERT(MUTEX_HELD(&tq->tq_lock));
+
+ if (tq->tq_nalloc <= tq->tq_minalloc) {
+ tqe->tqent_next = tq->tq_freelist;
+ tq->tq_freelist = tqe;
+ } else {
+ tq->tq_nalloc--;
+ mutex_exit(&tq->tq_lock);
+ kmem_cache_free(taskq_ent_cache, tqe);
+ mutex_enter(&tq->tq_lock);
+ }
+}
+
+/*
+ * Dispatch a task.
+ *
+ * Assumes: func != NULL
+ *
+ * Returns: NULL if dispatch failed.
+ * non-NULL if task dispatched successfully.
+ * Actual return value is the pointer to taskq entry that was used to
+ * dispatch a task. This is useful for debugging.
+ */
+/* ARGSUSED */
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+ taskq_ent_t *tqe = NULL;
+
+ ASSERT(tq != NULL);
+ ASSERT(func != NULL);
+ ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));
+
+ /*
+ * TQ_NOQUEUE flag can't be used with non-dynamic task queues.
+ */
+ ASSERT(! (flags & TQ_NOQUEUE));
+
+ /*
+ * Enqueue the task to the underlying queue.
+ */
+ mutex_enter(&tq->tq_lock);
+
+ TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flags);
+
+ if ((tqe = taskq_ent_alloc(tq, flags)) == NULL) {
+ mutex_exit(&tq->tq_lock);
+ return ((taskqid_t)NULL);
+ }
+ TQ_ENQUEUE(tq, tqe, func, arg);
+ mutex_exit(&tq->tq_lock);
+ return ((taskqid_t)tqe);
+}
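
/*
 * Illustrative sketch (editorial, not part of the imported source): a
 * TQ_NOSLEEP dispatch may legitimately return 0 under memory pressure
 * (or under the DEBUG fault injection described above), so callers must
 * check the return value.  The helper name is hypothetical.
 */
static int
example_try_dispatch(task_func_t func, void *arg)
{
	if (taskq_dispatch(system_taskq, func, arg, TQ_NOSLEEP) == 0)
		return (ENOMEM);	/* caller may fall back to doing the work inline */
	return (0);
}
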
+
+/*
+ * Wait for all pending tasks to complete.
+ * Calling taskq_wait from a task will cause deadlock.
+ */
+void
+taskq_wait(taskq_t *tq)
+{
+
+ mutex_enter(&tq->tq_lock);
+ while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0)
+ cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
+ mutex_exit(&tq->tq_lock);
+}
+
+/*
+ * Suspend execution of tasks.
+ *
+ * Tasks in the queue part will be suspended immediately upon return from this
+ * function. Pending tasks in the dynamic part will continue to execute, but all
+ * new tasks will be suspended.
+ */
+void
+taskq_suspend(taskq_t *tq)
+{
+ rw_enter(&tq->tq_threadlock, RW_WRITER);
+
+ /*
+ * Mark task queue as being suspended. Needed for taskq_suspended().
+ */
+ mutex_enter(&tq->tq_lock);
+ ASSERT(!(tq->tq_flags & TASKQ_SUSPENDED));
+ tq->tq_flags |= TASKQ_SUSPENDED;
+ mutex_exit(&tq->tq_lock);
+}
+
+/*
+ * returns: 1 if tq is suspended, 0 otherwise.
+ */
+int
+taskq_suspended(taskq_t *tq)
+{
+ return ((tq->tq_flags & TASKQ_SUSPENDED) != 0);
+}
+
+/*
+ * Resume taskq execution.
+ */
+void
+taskq_resume(taskq_t *tq)
+{
+ ASSERT(RW_WRITE_HELD(&tq->tq_threadlock));
+
+ mutex_enter(&tq->tq_lock);
+ ASSERT(tq->tq_flags & TASKQ_SUSPENDED);
+ tq->tq_flags &= ~TASKQ_SUSPENDED;
+ mutex_exit(&tq->tq_lock);
+
+ rw_exit(&tq->tq_threadlock);
+}
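
/*
 * Illustrative sketch (editorial, not part of the imported source):
 * taskq_suspend() and taskq_resume() must be paired, since the queue's
 * tq_threadlock is held as writer across the suspension.  The function
 * name is hypothetical; tq is assumed to come from taskq_create().
 */
static void
example_quiesce(taskq_t *tq)
{
	taskq_suspend(tq);
	ASSERT(taskq_suspended(tq));

	/* ... work that must not run concurrently with queued tasks ... */

	taskq_resume(tq);
}
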
+
+/*
+ * Worker thread for processing task queue.
+ */
+static void
+taskq_thread(void *arg)
+{
+ taskq_t *tq = arg;
+ taskq_ent_t *tqe;
+ callb_cpr_t cprinfo;
+ hrtime_t start, end;
+
+ CALLB_CPR_INIT(&cprinfo, &tq->tq_lock, callb_generic_cpr, tq->tq_name);
+
+ mutex_enter(&tq->tq_lock);
+ while (tq->tq_flags & TASKQ_ACTIVE) {
+ if ((tqe = tq->tq_task.tqent_next) == &tq->tq_task) {
+ if (--tq->tq_active == 0)
+ cv_broadcast(&tq->tq_wait_cv);
+ if (tq->tq_flags & TASKQ_CPR_SAFE) {
+ cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
+ } else {
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
+ CALLB_CPR_SAFE_END(&cprinfo, &tq->tq_lock);
+ }
+ tq->tq_active++;
+ continue;
+ }
+ tqe->tqent_prev->tqent_next = tqe->tqent_next;
+ tqe->tqent_next->tqent_prev = tqe->tqent_prev;
+ mutex_exit(&tq->tq_lock);
+
+ rw_enter(&tq->tq_threadlock, RW_READER);
+ start = gethrtime();
+ DTRACE_PROBE2(taskq__exec__start, taskq_t *, tq,
+ taskq_ent_t *, tqe);
+ tqe->tqent_func(tqe->tqent_arg);
+ DTRACE_PROBE2(taskq__exec__end, taskq_t *, tq,
+ taskq_ent_t *, tqe);
+ end = gethrtime();
+ rw_exit(&tq->tq_threadlock);
+
+ mutex_enter(&tq->tq_lock);
+ tq->tq_totaltime += end - start;
+ tq->tq_executed++;
+
+ taskq_ent_free(tq, tqe);
+ }
+ tq->tq_nthreads--;
+ cv_broadcast(&tq->tq_wait_cv);
+ ASSERT(!(tq->tq_flags & TASKQ_CPR_SAFE));
+ CALLB_CPR_EXIT(&cprinfo);
+ thread_exit();
+}
+
+/*
+ * Taskq creation. May sleep for memory.
+ * Always use automatically generated instances to avoid kstat name space
+ * collisions.
+ */
+
+taskq_t *
+taskq_create(const char *name, int nthreads, pri_t pri, int minalloc,
+ int maxalloc, uint_t flags)
+{
+ return taskq_create_common(name, 0, nthreads, pri, minalloc,
+ maxalloc, flags | TASKQ_NOINSTANCE);
+}
+
+static taskq_t *
+taskq_create_common(const char *name, int instance, int nthreads, pri_t pri,
+ int minalloc, int maxalloc, uint_t flags)
+{
+ taskq_t *tq = kmem_cache_alloc(taskq_cache, KM_SLEEP);
+ uint_t ncpus = ((boot_max_ncpus == -1) ? max_ncpus : boot_max_ncpus);
+ uint_t bsize; /* # of buckets - always power of 2 */
+
+ ASSERT(instance == 0);
+ ASSERT(flags == TASKQ_PREPOPULATE | TASKQ_NOINSTANCE);
+
+ /*
+ * TASKQ_CPR_SAFE and TASKQ_DYNAMIC flags are mutually exclusive.
+ */
+ ASSERT((flags & (TASKQ_DYNAMIC | TASKQ_CPR_SAFE)) !=
+ ((TASKQ_DYNAMIC | TASKQ_CPR_SAFE)));
+
+ ASSERT(tq->tq_buckets == NULL);
+
+ bsize = 1 << (highbit(ncpus) - 1);
+ ASSERT(bsize >= 1);
+ bsize = MIN(bsize, taskq_maxbuckets);
+
+ tq->tq_maxsize = nthreads;
+
+ (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1);
+ tq->tq_name[TASKQ_NAMELEN] = '\0';
+	/* Make sure the name conforms to the rules for C identifiers */
+ strident_canon(tq->tq_name, TASKQ_NAMELEN);
+
+ tq->tq_flags = flags | TASKQ_ACTIVE;
+ tq->tq_active = nthreads;
+ tq->tq_nthreads = nthreads;
+ tq->tq_minalloc = minalloc;
+ tq->tq_maxalloc = maxalloc;
+ tq->tq_nbuckets = bsize;
+ tq->tq_pri = pri;
+
+ if (flags & TASKQ_PREPOPULATE) {
+ mutex_enter(&tq->tq_lock);
+ while (minalloc-- > 0)
+ taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP));
+ mutex_exit(&tq->tq_lock);
+ }
+
+ if (nthreads == 1) {
+ tq->tq_thread = thread_create(NULL, 0, taskq_thread, tq,
+ 0, NULL, TS_RUN, pri);
+ } else {
+ kthread_t **tpp = kmem_alloc(sizeof (kthread_t *) * nthreads,
+ KM_SLEEP);
+
+ tq->tq_threadlist = tpp;
+
+ mutex_enter(&tq->tq_lock);
+ while (nthreads-- > 0) {
+ *tpp = thread_create(NULL, 0, taskq_thread, tq,
+ 0, NULL, TS_RUN, pri);
+ tpp++;
+ }
+ mutex_exit(&tq->tq_lock);
+ }
+
+ return (tq);
+}
+
+/*
+ * taskq_destroy().
+ *
+ * Assumes: by the time taskq_destroy is called no one will use this task queue
+ * in any way and no one will try to dispatch entries to it.
+ */
+void
+taskq_destroy(taskq_t *tq)
+{
+ taskq_bucket_t *b = tq->tq_buckets;
+ int bid = 0;
+
+ ASSERT(! (tq->tq_flags & TASKQ_CPR_SAFE));
+
+ /*
+ * Wait for any pending entries to complete.
+ */
+ taskq_wait(tq);
+
+ mutex_enter(&tq->tq_lock);
+ ASSERT((tq->tq_task.tqent_next == &tq->tq_task) &&
+ (tq->tq_active == 0));
+
+ if ((tq->tq_nthreads > 1) && (tq->tq_threadlist != NULL))
+ kmem_free(tq->tq_threadlist, sizeof (kthread_t *) *
+ tq->tq_nthreads);
+
+ tq->tq_flags &= ~TASKQ_ACTIVE;
+ cv_broadcast(&tq->tq_dispatch_cv);
+ while (tq->tq_nthreads != 0)
+ cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
+
+ tq->tq_minalloc = 0;
+ while (tq->tq_nalloc != 0)
+ taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP));
+
+ mutex_exit(&tq->tq_lock);
+
+ /*
+ * Mark each bucket as closing and wake up all sleeping threads.
+ */
+ for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) {
+ taskq_ent_t *tqe;
+
+ mutex_enter(&b->tqbucket_lock);
+
+ b->tqbucket_flags |= TQBUCKET_CLOSE;
+ /* Wakeup all sleeping threads */
+
+ for (tqe = b->tqbucket_freelist.tqent_next;
+ tqe != &b->tqbucket_freelist; tqe = tqe->tqent_next)
+ cv_signal(&tqe->tqent_cv);
+
+ ASSERT(b->tqbucket_nalloc == 0);
+
+ /*
+		 * At this point we have waited for all pending jobs to complete (in
+		 * both the task queue and the bucket) and no new jobs should
+		 * arrive. Wait for all threads to die.
+ */
+ while (b->tqbucket_nfree > 0)
+ cv_wait(&b->tqbucket_cv, &b->tqbucket_lock);
+ mutex_exit(&b->tqbucket_lock);
+ mutex_destroy(&b->tqbucket_lock);
+ cv_destroy(&b->tqbucket_cv);
+ }
+
+ if (tq->tq_buckets != NULL) {
+ ASSERT(tq->tq_flags & TASKQ_DYNAMIC);
+ kmem_free(tq->tq_buckets,
+ sizeof (taskq_bucket_t) * tq->tq_nbuckets);
+
+ /* Cleanup fields before returning tq to the cache */
+ tq->tq_buckets = NULL;
+ tq->tq_tcreates = 0;
+ tq->tq_tdeaths = 0;
+ } else {
+ ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));
+ }
+
+ tq->tq_totaltime = 0;
+ tq->tq_tasks = 0;
+ tq->tq_maxtasks = 0;
+ tq->tq_executed = 0;
+ kmem_cache_free(taskq_cache, tq);
+}
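
/*
 * Illustrative sketch (editorial, not part of the imported source):
 * typical lifecycle of a private task queue using the routines above.
 * The task function, queue name, and sizing are hypothetical.
 */
static void
example_task(void *arg)
{
	int *donep = arg;

	*donep = 1;		/* the deferred work */
}

static void
example_lifecycle(void)
{
	taskq_t *tq;
	int done = 0;

	/* One worker thread, 4..16 cached entries, prepopulated. */
	tq = taskq_create("example_tq", 1, minclsyspri, 4, 16,
	    TASKQ_PREPOPULATE);
	(void) taskq_dispatch(tq, example_task, &done, TQ_SLEEP);
	taskq_wait(tq);		/* block until example_task has run */
	ASSERT(done == 1);
	taskq_destroy(tq);
}
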
+
+SYSINIT(sol_taskq, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, taskq_init, NULL);
+SYSUNINIT(sol_taskq, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, taskq_fini, NULL);
diff --git a/sys/contrib/opensolaris/uts/common/rpc/xdr.c b/sys/contrib/opensolaris/uts/common/rpc/xdr.c
new file mode 100644
index 000000000000..65b73b6181a9
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/rpc/xdr.c
@@ -0,0 +1,671 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * xdr.c, generic XDR routines implementation.
+ * These are the "generic" xdr routines used to serialize and de-serialize
+ * most common data items. See xdr.h for more info on the interface to
+ * xdr.
+ */
+
+#include <sys/param.h>
+#include <sys/cmn_err.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+
+#include <rpc/types.h>
+#include <rpc/xdr.h>
+
+#pragma weak xdr_int32_t = xdr_int
+#pragma weak xdr_uint32_t = xdr_u_int
+#pragma weak xdr_int64_t = xdr_longlong_t
+#pragma weak xdr_uint64_t = xdr_u_longlong_t
+
+#if !defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)
+#error "Exactly one of _BIG_ENDIAN or _LITTLE_ENDIAN must be defined"
+#elif defined(_BIG_ENDIAN) && defined(_LITTLE_ENDIAN)
+#error "Only one of _BIG_ENDIAN or _LITTLE_ENDIAN may be defined"
+#endif
+
+/*
+ * constants specific to the xdr "protocol"
+ */
+#define XDR_FALSE ((int32_t)0)
+#define XDR_TRUE ((int32_t)1)
+#define LASTUNSIGNED ((uint_t)0-1)
+
+/*
+ * for unit alignment
+ */
+static char xdr_zero[BYTES_PER_XDR_UNIT] = { 0, 0, 0, 0 };
+
+/*
+ * Free a data structure using XDR
+ * Not a filter, but a convenient utility nonetheless
+ */
+void
+xdr_free(xdrproc_t proc, char *objp)
+{
+ XDR x;
+
+ x.x_op = XDR_FREE;
+ (*proc)(&x, objp);
+}
+
+/*
+ * XDR nothing
+ */
+bool_t
+xdr_void(void)
+{
+ return (TRUE);
+}
+
+/*
+ * XDR integers
+ *
+ * PSARC 2003/523 Contract Private Interface
+ * xdr_int
+ * Changes must be reviewed by Solaris File Sharing
+ * Changes must be communicated to contract-2003-523@sun.com
+ */
+bool_t
+xdr_int(XDR *xdrs, int *ip)
+{
+ if (xdrs->x_op == XDR_ENCODE)
+ return (XDR_PUTINT32(xdrs, ip));
+
+ if (xdrs->x_op == XDR_DECODE)
+ return (XDR_GETINT32(xdrs, ip));
+
+ if (xdrs->x_op == XDR_FREE)
+ return (TRUE);
+
+#ifdef DEBUG
+ printf("xdr_int: FAILED\n");
+#endif
+ return (FALSE);
+}
+
+/*
+ * XDR unsigned integers
+ *
+ * PSARC 2003/523 Contract Private Interface
+ * xdr_u_int
+ * Changes must be reviewed by Solaris File Sharing
+ * Changes must be communicated to contract-2003-523@sun.com
+ */
+bool_t
+xdr_u_int(XDR *xdrs, uint_t *up)
+{
+ if (xdrs->x_op == XDR_ENCODE)
+ return (XDR_PUTINT32(xdrs, (int32_t *)up));
+
+ if (xdrs->x_op == XDR_DECODE)
+ return (XDR_GETINT32(xdrs, (int32_t *)up));
+
+ if (xdrs->x_op == XDR_FREE)
+ return (TRUE);
+
+#ifdef DEBUG
+	printf("xdr_u_int: FAILED\n");
+#endif
+ return (FALSE);
+}
+
+
+#if defined(_ILP32)
+/*
+ * xdr_long and xdr_u_long for binary compatibility on ILP32 kernels.
+ *
+ * No prototypes since new code should not be using these interfaces.
+ */
+bool_t
+xdr_long(XDR *xdrs, long *ip)
+{
+ return (xdr_int(xdrs, (int *)ip));
+}
+
+bool_t
+xdr_u_long(XDR *xdrs, unsigned long *up)
+{
+ return (xdr_u_int(xdrs, (uint_t *)up));
+}
+#endif /* _ILP32 */
+
+
+/*
+ * XDR long long integers
+ */
+bool_t
+xdr_longlong_t(XDR *xdrs, longlong_t *hp)
+{
+ if (xdrs->x_op == XDR_ENCODE) {
+#if defined(_LITTLE_ENDIAN)
+ if (XDR_PUTINT32(xdrs, (int32_t *)((char *)hp +
+ BYTES_PER_XDR_UNIT)) == TRUE) {
+ return (XDR_PUTINT32(xdrs, (int32_t *)hp));
+ }
+#elif defined(_BIG_ENDIAN)
+ if (XDR_PUTINT32(xdrs, (int32_t *)hp) == TRUE) {
+ return (XDR_PUTINT32(xdrs, (int32_t *)((char *)hp +
+ BYTES_PER_XDR_UNIT)));
+ }
+#endif
+ return (FALSE);
+
+ }
+ if (xdrs->x_op == XDR_DECODE) {
+#if defined(_LITTLE_ENDIAN)
+ if (XDR_GETINT32(xdrs, (int32_t *)((char *)hp +
+ BYTES_PER_XDR_UNIT)) == TRUE) {
+ return (XDR_GETINT32(xdrs, (int32_t *)hp));
+ }
+#elif defined(_BIG_ENDIAN)
+ if (XDR_GETINT32(xdrs, (int32_t *)hp) == TRUE) {
+ return (XDR_GETINT32(xdrs, (int32_t *)((char *)hp +
+ BYTES_PER_XDR_UNIT)));
+ }
+#endif
+ return (FALSE);
+ }
+ return (TRUE);
+}
+
+/*
+ * XDR unsigned long long integers
+ */
+bool_t
+xdr_u_longlong_t(XDR *xdrs, u_longlong_t *hp)
+{
+
+ if (xdrs->x_op == XDR_ENCODE) {
+#if defined(_LITTLE_ENDIAN)
+ if (XDR_PUTINT32(xdrs, (int32_t *)((char *)hp +
+ BYTES_PER_XDR_UNIT)) == TRUE) {
+ return (XDR_PUTINT32(xdrs, (int32_t *)hp));
+ }
+#elif defined(_BIG_ENDIAN)
+ if (XDR_PUTINT32(xdrs, (int32_t *)hp) == TRUE) {
+ return (XDR_PUTINT32(xdrs, (int32_t *)((char *)hp +
+ BYTES_PER_XDR_UNIT)));
+ }
+#endif
+ return (FALSE);
+
+ }
+ if (xdrs->x_op == XDR_DECODE) {
+#if defined(_LITTLE_ENDIAN)
+ if (XDR_GETINT32(xdrs, (int32_t *)((char *)hp +
+ BYTES_PER_XDR_UNIT)) == TRUE) {
+ return (XDR_GETINT32(xdrs, (int32_t *)hp));
+ }
+#elif defined(_BIG_ENDIAN)
+ if (XDR_GETINT32(xdrs, (int32_t *)hp) == TRUE) {
+ return (XDR_GETINT32(xdrs, (int32_t *)((char *)hp +
+ BYTES_PER_XDR_UNIT)));
+ }
+#endif
+ return (FALSE);
+ }
+ return (TRUE);
+}
+
+/*
+ * XDR short integers
+ */
+bool_t
+xdr_short(XDR *xdrs, short *sp)
+{
+ int32_t l;
+
+ switch (xdrs->x_op) {
+
+ case XDR_ENCODE:
+ l = (int32_t)*sp;
+ return (XDR_PUTINT32(xdrs, &l));
+
+ case XDR_DECODE:
+ if (!XDR_GETINT32(xdrs, &l))
+ return (FALSE);
+ *sp = (short)l;
+ return (TRUE);
+
+ case XDR_FREE:
+ return (TRUE);
+ }
+ return (FALSE);
+}
+
+/*
+ * XDR unsigned short integers
+ */
+bool_t
+xdr_u_short(XDR *xdrs, ushort_t *usp)
+{
+ uint32_t l;
+
+ switch (xdrs->x_op) {
+
+ case XDR_ENCODE:
+ l = (uint32_t)*usp;
+ return (XDR_PUTINT32(xdrs, (int32_t *)&l));
+
+ case XDR_DECODE:
+ if (!XDR_GETINT32(xdrs, (int32_t *)&l)) {
+#ifdef DEBUG
+ printf("xdr_u_short: decode FAILED\n");
+#endif
+ return (FALSE);
+ }
+ *usp = (ushort_t)l;
+ return (TRUE);
+
+ case XDR_FREE:
+ return (TRUE);
+ }
+#ifdef DEBUG
+ printf("xdr_u_short: bad op FAILED\n");
+#endif
+ return (FALSE);
+}
+
+
+/*
+ * XDR a char
+ */
+bool_t
+xdr_char(XDR *xdrs, char *cp)
+{
+ int i;
+
+ i = (*cp);
+ if (!xdr_int(xdrs, &i)) {
+ return (FALSE);
+ }
+ *cp = (char)i;
+ return (TRUE);
+}
+
+/*
+ * XDR booleans
+ *
+ * PSARC 2003/523 Contract Private Interface
+ * xdr_bool
+ * Changes must be reviewed by Solaris File Sharing
+ * Changes must be communicated to contract-2003-523@sun.com
+ */
+bool_t
+xdr_bool(XDR *xdrs, bool_t *bp)
+{
+ int32_t i32b;
+
+ switch (xdrs->x_op) {
+
+ case XDR_ENCODE:
+ i32b = *bp ? XDR_TRUE : XDR_FALSE;
+ return (XDR_PUTINT32(xdrs, &i32b));
+
+ case XDR_DECODE:
+ if (!XDR_GETINT32(xdrs, &i32b)) {
+#ifdef DEBUG
+ printf("xdr_bool: decode FAILED\n");
+#endif
+ return (FALSE);
+ }
+ *bp = (i32b == XDR_FALSE) ? FALSE : TRUE;
+ return (TRUE);
+
+ case XDR_FREE:
+ return (TRUE);
+ }
+#ifdef DEBUG
+ printf("xdr_bool: bad op FAILED\n");
+#endif
+ return (FALSE);
+}
+
+/*
+ * XDR enumerations
+ *
+ * PSARC 2003/523 Contract Private Interface
+ * xdr_enum
+ * Changes must be reviewed by Solaris File Sharing
+ * Changes must be communicated to contract-2003-523@sun.com
+ */
+#ifndef lint
+enum sizecheck { SIZEVAL } sizecheckvar; /* used to find the size of */
+ /* an enum */
+#endif
+bool_t
+xdr_enum(XDR *xdrs, enum_t *ep)
+{
+#ifndef lint
+ /*
+ * enums are treated as ints
+ */
+ if (sizeof (sizecheckvar) == sizeof (int32_t)) {
+ return (xdr_int(xdrs, (int32_t *)ep));
+ } else if (sizeof (sizecheckvar) == sizeof (short)) {
+ return (xdr_short(xdrs, (short *)ep));
+ } else {
+ return (FALSE);
+ }
+#else
+ (void) (xdr_short(xdrs, (short *)ep));
+ return (xdr_int(xdrs, (int32_t *)ep));
+#endif
+}
+
+/*
+ * XDR opaque data
+ * Allows the specification of a fixed size sequence of opaque bytes.
+ * cp points to the opaque object and cnt gives the byte length.
+ *
+ * PSARC 2003/523 Contract Private Interface
+ * xdr_opaque
+ * Changes must be reviewed by Solaris File Sharing
+ * Changes must be communicated to contract-2003-523@sun.com
+ */
+bool_t
+xdr_opaque(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ uint_t rndup;
+ static char crud[BYTES_PER_XDR_UNIT];
+
+ /*
+ * if no data we are done
+ */
+ if (cnt == 0)
+ return (TRUE);
+
+ /*
+ * round byte count to full xdr units
+ */
+ rndup = cnt % BYTES_PER_XDR_UNIT;
+ if (rndup != 0)
+ rndup = BYTES_PER_XDR_UNIT - rndup;
+
+ if (xdrs->x_op == XDR_DECODE) {
+ if (!XDR_GETBYTES(xdrs, cp, cnt)) {
+#ifdef DEBUG
+ printf("xdr_opaque: decode FAILED\n");
+#endif
+ return (FALSE);
+ }
+ if (rndup == 0)
+ return (TRUE);
+ return (XDR_GETBYTES(xdrs, (caddr_t)crud, rndup));
+ }
+
+ if (xdrs->x_op == XDR_ENCODE) {
+ if (!XDR_PUTBYTES(xdrs, cp, cnt)) {
+#ifdef DEBUG
+ printf("xdr_opaque: encode FAILED\n");
+#endif
+ return (FALSE);
+ }
+ if (rndup == 0)
+ return (TRUE);
+ return (XDR_PUTBYTES(xdrs, xdr_zero, rndup));
+ }
+
+ if (xdrs->x_op == XDR_FREE)
+ return (TRUE);
+
+#ifdef DEBUG
+ printf("xdr_opaque: bad op FAILED\n");
+#endif
+ return (FALSE);
+}
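
/*
 * Illustrative sketch (editorial, not part of the imported source):
 * because of the rounding above, a 6-byte opaque field consumes two
 * full XDR units (8 bytes) on the wire.  xdrmem_create() is the
 * memory-stream constructor imported in xdr_mem.c; the buffer and
 * function names are hypothetical.
 */
static bool_t
example_opaque_padding(void)
{
	char buf[16];
	char cookie[6] = { 1, 2, 3, 4, 5, 6 };
	XDR xdrs;

	xdrmem_create(&xdrs, buf, sizeof (buf), XDR_ENCODE);
	if (!xdr_opaque(&xdrs, cookie, sizeof (cookie)))
		return (FALSE);
	return (XDR_GETPOS(&xdrs) == 8);	/* 6 data bytes + 2 pad bytes */
}
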
+
+/*
+ * XDR counted bytes
+ * *cpp is a pointer to the bytes, *sizep is the count.
+ * If *cpp is NULL maxsize bytes are allocated
+ *
+ * PSARC 2003/523 Contract Private Interface
+ * xdr_bytes
+ * Changes must be reviewed by Solaris File Sharing
+ * Changes must be communicated to contract-2003-523@sun.com
+ */
+bool_t
+xdr_bytes(XDR *xdrs, char **cpp, uint_t *sizep, const uint_t maxsize)
+{
+ char *sp = *cpp; /* sp is the actual string pointer */
+ uint_t nodesize;
+
+ /*
+ * first deal with the length since xdr bytes are counted
+ */
+ if (!xdr_u_int(xdrs, sizep)) {
+#ifdef DEBUG
+ printf("xdr_bytes: size FAILED\n");
+#endif
+ return (FALSE);
+ }
+ nodesize = *sizep;
+ if ((nodesize > maxsize) && (xdrs->x_op != XDR_FREE)) {
+#ifdef DEBUG
+ printf("xdr_bytes: bad size (%d) FAILED (%d max)\n",
+ nodesize, maxsize);
+#endif
+ return (FALSE);
+ }
+
+ /*
+ * now deal with the actual bytes
+ */
+ switch (xdrs->x_op) {
+ case XDR_DECODE:
+ if (nodesize == 0)
+ return (TRUE);
+ if (sp == NULL)
+ *cpp = sp = (char *)mem_alloc(nodesize);
+ /* FALLTHROUGH */
+
+ case XDR_ENCODE:
+ return (xdr_opaque(xdrs, sp, nodesize));
+
+ case XDR_FREE:
+ if (sp != NULL) {
+ mem_free(sp, nodesize);
+ *cpp = NULL;
+ }
+ return (TRUE);
+ }
+#ifdef DEBUG
+ printf("xdr_bytes: bad op FAILED\n");
+#endif
+ return (FALSE);
+}
+
+/*
+ * Implemented here due to commonality of the object.
+ */
+bool_t
+xdr_netobj(XDR *xdrs, struct netobj *np)
+{
+ return (xdr_bytes(xdrs, &np->n_bytes, &np->n_len, MAX_NETOBJ_SZ));
+}
+
+/*
+ * XDR a discriminated union
+ * Support routine for discriminated unions.
+ * You create an array of xdrdiscrim structures, terminated with
+ * an entry with a null procedure pointer. The routine gets
+ * the discriminant value and then searches the array of xdrdiscrims
+ * looking for that value. It calls the procedure given in the xdrdiscrim
+ * to handle the discriminant. If there is no specific routine a default
+ * routine may be called.
+ * If there is no specific or default routine an error is returned.
+ */
+bool_t
+xdr_union(XDR *xdrs, enum_t *dscmp, char *unp,
+ const struct xdr_discrim *choices, const xdrproc_t dfault)
+{
+ enum_t dscm;
+
+ /*
+ * we deal with the discriminator; it's an enum
+ */
+ if (!xdr_enum(xdrs, dscmp)) {
+#ifdef DEBUG
+		printf("xdr_union: dscmp FAILED\n");
+#endif
+ return (FALSE);
+ }
+ dscm = *dscmp;
+
+ /*
+ * search choices for a value that matches the discriminator.
+ * if we find one, execute the xdr routine for that value.
+ */
+ for (; choices->proc != NULL_xdrproc_t; choices++) {
+ if (choices->value == dscm)
+ return ((*(choices->proc))(xdrs, unp, LASTUNSIGNED));
+ }
+
+ /*
+ * no match - execute the default xdr routine if there is one
+ */
+ return ((dfault == NULL_xdrproc_t) ? FALSE :
+ (*dfault)(xdrs, unp, LASTUNSIGNED));
+}
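
/*
 * Illustrative sketch (editorial, not part of the imported source): a
 * discriminated union with two arms, encoded and decoded through
 * xdr_union() above.  The structure, table, and names are hypothetical;
 * the terminating table entry carries NULL_xdrproc_t as documented.
 */
struct example_value {
	enum_t	ev_type;		/* 0 = integer arm, 1 = string arm */
	union {
		int	ev_int;
		char	*ev_str;
	} ev_u;
};

static const struct xdr_discrim example_choices[] = {
	{ 0, (xdrproc_t)xdr_int },
	{ 1, (xdrproc_t)xdr_wrapstring },
	{ 0, NULL_xdrproc_t }
};

static bool_t
xdr_example_value(XDR *xdrs, struct example_value *vp)
{
	return (xdr_union(xdrs, &vp->ev_type, (char *)&vp->ev_u,
	    example_choices, NULL_xdrproc_t));
}
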
+
+
+/*
+ * Non-portable xdr primitives.
+ * Care should be taken when moving these routines to new architectures.
+ */
+
+
+/*
+ * XDR null terminated ASCII strings
+ * xdr_string deals with "C strings" - arrays of bytes that are
+ * terminated by a NULL character. The parameter cpp references a
+ * pointer to storage; If the pointer is null, then the necessary
+ * storage is allocated. The last parameter is the max allowed length
+ * of the string as specified by a protocol.
+ */
+bool_t
+xdr_string(XDR *xdrs, char **cpp, const uint_t maxsize)
+{
+ char *sp = *cpp; /* sp is the actual string pointer */
+ uint_t size;
+ uint_t nodesize;
+
+ /*
+ * first deal with the length since xdr strings are counted-strings
+ */
+ switch (xdrs->x_op) {
+ case XDR_FREE:
+ if (sp == NULL)
+ return (TRUE); /* already free */
+ /* FALLTHROUGH */
+ case XDR_ENCODE:
+ size = (sp != NULL) ? (uint_t)strlen(sp) : 0;
+ break;
+ case XDR_DECODE:
+ break;
+ }
+ if (!xdr_u_int(xdrs, &size)) {
+#ifdef DEBUG
+ printf("xdr_string: size FAILED\n");
+#endif
+ return (FALSE);
+ }
+ if (size > maxsize) {
+#ifdef DEBUG
+ printf("xdr_string: bad size FAILED\n");
+#endif
+ return (FALSE);
+ }
+ nodesize = size + 1;
+
+ /*
+ * now deal with the actual bytes
+ */
+ switch (xdrs->x_op) {
+ case XDR_DECODE:
+ if (nodesize == 0)
+ return (TRUE);
+ if (sp == NULL)
+ sp = (char *)mem_alloc(nodesize);
+ sp[size] = 0;
+ if (!xdr_opaque(xdrs, sp, size)) {
+ /*
+ * free up memory if allocated here
+ */
+ if (*cpp == NULL) {
+ mem_free(sp, nodesize);
+ }
+ return (FALSE);
+ }
+ if (strlen(sp) != size) {
+ if (*cpp == NULL) {
+ mem_free(sp, nodesize);
+ }
+ return (FALSE);
+ }
+ *cpp = sp;
+ return (TRUE);
+
+ case XDR_ENCODE:
+ return (xdr_opaque(xdrs, sp, size));
+
+ case XDR_FREE:
+ mem_free(sp, nodesize);
+ *cpp = NULL;
+ return (TRUE);
+ }
+#ifdef DEBUG
+ printf("xdr_string: bad op FAILED\n");
+#endif
+ return (FALSE);
+}
+
+/*
+ * Wrapper for xdr_string that can be called directly from
+ * routines like clnt_call
+ */
+bool_t
+xdr_wrapstring(XDR *xdrs, char **cpp)
+{
+ if (xdr_string(xdrs, cpp, LASTUNSIGNED))
+ return (TRUE);
+ return (FALSE);
+}
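
/*
 * Illustrative sketch (editorial, not part of the imported source): a
 * complete encode/decode round trip through a memory stream (see
 * xdr_mem.c in this import).  The decode side passes a NULL string
 * pointer so xdr_string() allocates storage, which is then released
 * with xdr_free().  All names are hypothetical.
 */
static bool_t
example_roundtrip(void)
{
	char buf[64];
	char *msg = "hello";
	char *copy = NULL;	/* NULL => xdr_string() allocates */
	int value = 42, decoded = 0;
	XDR enc, dec;
	bool_t ok;

	xdrmem_create(&enc, buf, sizeof (buf), XDR_ENCODE);
	if (!xdr_int(&enc, &value) || !xdr_string(&enc, &msg, 32))
		return (FALSE);

	xdrmem_create(&dec, buf, sizeof (buf), XDR_DECODE);
	ok = xdr_int(&dec, &decoded) && xdr_string(&dec, &copy, 32);

	if (copy != NULL)
		xdr_free((xdrproc_t)xdr_wrapstring, (char *)&copy);

	return (ok && decoded == 42);
}
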
diff --git a/sys/contrib/opensolaris/uts/common/rpc/xdr.h b/sys/contrib/opensolaris/uts/common/rpc/xdr.h
new file mode 100644
index 000000000000..c8eb419fdf1f
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/rpc/xdr.h
@@ -0,0 +1,605 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+/*
+ * Portions of this source code were derived from Berkeley
+ * 4.3 BSD under license from the Regents of the University of
+ * California.
+ */
+
+/*
+ * xdr.h, External Data Representation Serialization Routines.
+ *
+ */
+
+#ifndef _RPC_XDR_H
+#define _RPC_XDR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/byteorder.h> /* For all ntoh* and hton*() kind of macros */
+#include <rpc/types.h> /* For all ntoh* and hton*() kind of macros */
+#ifndef _KERNEL
+#include <stdio.h> /* defines FILE *, used in ANSI C function prototypes */
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * XDR provides a conventional way for converting between C data
+ * types and an external bit-string representation. Library supplied
+ * routines provide for the conversion on built-in C data types. These
+ * routines and utility routines defined here are used to help implement
+ * a type encode/decode routine for each user-defined type.
+ *
+ * Each data type provides a single procedure which takes two arguments:
+ *
+ * bool_t
+ * xdrproc(xdrs, argresp)
+ * XDR *xdrs;
+ * <type> *argresp;
+ *
+ * xdrs is an instance of a XDR handle, to which or from which the data
+ * type is to be converted. argresp is a pointer to the structure to be
+ * converted. The XDR handle contains an operation field which indicates
+ * which of the operations (ENCODE, DECODE, or FREE) is to be performed.
+ *
+ * XDR_DECODE may allocate space if the pointer argresp is null. This
+ * data can be freed with the XDR_FREE operation.
+ *
+ * We write only one procedure per data type to make it easy
+ * to keep the encode and decode procedures for a data type consistent.
+ * In many cases the same code performs all operations on a user-defined type,
+ * because all the hard work is done in the component type routines; a complex
+ * type is encoded or decoded as a series of calls on the nested data types.
+ */
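
/*
 * Illustrative sketch (editorial, not part of this header): a single
 * procedure following the convention above, composing the primitive
 * filters declared later in this header.  The structure and names are
 * hypothetical; the one routine serves XDR_ENCODE, XDR_DECODE and
 * XDR_FREE alike because the component routines handle all three.
 */
struct example_rec {
	int	er_id;
	char	*er_name;
};

static bool_t
xdr_example_rec(XDR *xdrs, struct example_rec *recp)
{
	if (!xdr_int(xdrs, &recp->er_id))
		return (FALSE);
	return (xdr_string(xdrs, &recp->er_name, 255));
}
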
+
+/*
+ * Xdr operations. XDR_ENCODE causes the type to be encoded into the
+ * stream. XDR_DECODE causes the type to be extracted from the stream.
+ * XDR_FREE can be used to release the space allocated by an XDR_DECODE
+ * request.
+ */
+enum xdr_op {
+ XDR_ENCODE = 0,
+ XDR_DECODE = 1,
+ XDR_FREE = 2
+};
+
+/*
+ * This is the number of bytes per unit of external data.
+ */
+#define BYTES_PER_XDR_UNIT (4)
+#define RNDUP(x) ((((x) + BYTES_PER_XDR_UNIT - 1) / BYTES_PER_XDR_UNIT) \
+ * BYTES_PER_XDR_UNIT)
+
+/*
+ * The XDR handle.
+ * Contains operation which is being applied to the stream,
+ * an operations vector for the particular implementation (e.g. see xdr_mem.c),
+ * and two private fields for the use of the particular implementation.
+ *
+ * PSARC 2003/523 Contract Private Interface
+ * XDR
+ * Changes must be reviewed by Solaris File Sharing
+ * Changes must be communicated to contract-2003-523@sun.com
+ */
+typedef struct XDR {
+ enum xdr_op x_op; /* operation; fast additional param */
+ struct xdr_ops *x_ops;
+ caddr_t x_public; /* users' data */
+ caddr_t x_private; /* pointer to private data */
+ caddr_t x_base; /* private used for position info */
+ int x_handy; /* extra private word */
+} XDR;
+
+/*
+ * PSARC 2003/523 Contract Private Interface
+ * xdr_ops
+ * Changes must be reviewed by Solaris File Sharing
+ * Changes must be communicated to contract-2003-523@sun.com
+ */
+struct xdr_ops {
+#ifdef __STDC__
+#if !defined(_KERNEL)
+ bool_t (*x_getlong)(struct XDR *, long *);
+ /* get a long from underlying stream */
+ bool_t (*x_putlong)(struct XDR *, long *);
+ /* put a long to " */
+#endif /* KERNEL */
+ bool_t (*x_getbytes)(struct XDR *, caddr_t, int);
+ /* get some bytes from " */
+ bool_t (*x_putbytes)(struct XDR *, caddr_t, int);
+ /* put some bytes to " */
+ uint_t (*x_getpostn)(struct XDR *);
+ /* returns bytes off from beginning */
+ bool_t (*x_setpostn)(struct XDR *, uint_t);
+ /* lets you reposition the stream */
+ rpc_inline_t *(*x_inline)(struct XDR *, int);
+ /* buf quick ptr to buffered data */
+ void (*x_destroy)(struct XDR *);
+ /* free privates of this xdr_stream */
+ bool_t (*x_control)(struct XDR *, int, void *);
+#if defined(_LP64) || defined(_KERNEL)
+ bool_t (*x_getint32)(struct XDR *, int32_t *);
+ /* get a int from underlying stream */
+ bool_t (*x_putint32)(struct XDR *, int32_t *);
+ /* put an int to " */
+#endif /* _LP64 || _KERNEL */
+#else
+#if !defined(_KERNEL)
+ bool_t (*x_getlong)(); /* get a long from underlying stream */
+ bool_t (*x_putlong)(); /* put a long to " */
+#endif /* KERNEL */
+ bool_t (*x_getbytes)(); /* get some bytes from " */
+ bool_t (*x_putbytes)(); /* put some bytes to " */
+ uint_t (*x_getpostn)(); /* returns bytes off from beginning */
+ bool_t (*x_setpostn)(); /* lets you reposition the stream */
+ rpc_inline_t *(*x_inline)();
+ /* buf quick ptr to buffered data */
+ void (*x_destroy)(); /* free privates of this xdr_stream */
+ bool_t (*x_control)();
+#if defined(_LP64) || defined(_KERNEL)
+ bool_t (*x_getint32)();
+ bool_t (*x_putint32)();
+#endif /* _LP64 || defined(_KERNEL) */
+#endif
+};
+
+/*
+ * Operations defined on a XDR handle
+ *
+ * XDR *xdrs;
+ * long *longp;
+ * caddr_t addr;
+ * uint_t len;
+ * uint_t pos;
+ */
+#if !defined(_KERNEL)
+#define XDR_GETLONG(xdrs, longp) \
+ (*(xdrs)->x_ops->x_getlong)(xdrs, longp)
+#define xdr_getlong(xdrs, longp) \
+ (*(xdrs)->x_ops->x_getlong)(xdrs, longp)
+
+#define XDR_PUTLONG(xdrs, longp) \
+ (*(xdrs)->x_ops->x_putlong)(xdrs, longp)
+#define xdr_putlong(xdrs, longp) \
+ (*(xdrs)->x_ops->x_putlong)(xdrs, longp)
+#endif /* KERNEL */
+
+
+#if !defined(_LP64) && !defined(_KERNEL)
+
+/*
+ * For binary compatibility on ILP32 we do not change the shape
+ * of the XDR structure and the GET/PUTINT32 functions just use
+ * the get/putlong vectors which operate on identically-sized
+ * units of data.
+ */
+
+#define XDR_GETINT32(xdrs, int32p) \
+ (*(xdrs)->x_ops->x_getlong)(xdrs, (long *)int32p)
+#define xdr_getint32(xdrs, int32p) \
+ (*(xdrs)->x_ops->x_getlong)(xdrs, (long *)int32p)
+
+#define XDR_PUTINT32(xdrs, int32p) \
+ (*(xdrs)->x_ops->x_putlong)(xdrs, (long *)int32p)
+#define xdr_putint32(xdrs, int32p) \
+ (*(xdrs)->x_ops->x_putlong)(xdrs, (long *)int32p)
+
+#else /* !_LP64 && !_KERNEL */
+
+#define XDR_GETINT32(xdrs, int32p) \
+ (*(xdrs)->x_ops->x_getint32)(xdrs, int32p)
+#define xdr_getint32(xdrs, int32p) \
+ (*(xdrs)->x_ops->x_getint32)(xdrs, int32p)
+
+#define XDR_PUTINT32(xdrs, int32p) \
+ (*(xdrs)->x_ops->x_putint32)(xdrs, int32p)
+#define xdr_putint32(xdrs, int32p) \
+ (*(xdrs)->x_ops->x_putint32)(xdrs, int32p)
+
+#endif /* !_LP64 && !_KERNEL */
+
+#define XDR_GETBYTES(xdrs, addr, len) \
+ (*(xdrs)->x_ops->x_getbytes)(xdrs, addr, len)
+#define xdr_getbytes(xdrs, addr, len) \
+ (*(xdrs)->x_ops->x_getbytes)(xdrs, addr, len)
+
+#define XDR_PUTBYTES(xdrs, addr, len) \
+ (*(xdrs)->x_ops->x_putbytes)(xdrs, addr, len)
+#define xdr_putbytes(xdrs, addr, len) \
+ (*(xdrs)->x_ops->x_putbytes)(xdrs, addr, len)
+
+#define XDR_GETPOS(xdrs) \
+ (*(xdrs)->x_ops->x_getpostn)(xdrs)
+#define xdr_getpos(xdrs) \
+ (*(xdrs)->x_ops->x_getpostn)(xdrs)
+
+#define XDR_SETPOS(xdrs, pos) \
+ (*(xdrs)->x_ops->x_setpostn)(xdrs, pos)
+#define xdr_setpos(xdrs, pos) \
+ (*(xdrs)->x_ops->x_setpostn)(xdrs, pos)
+
+#define XDR_INLINE(xdrs, len) \
+ (*(xdrs)->x_ops->x_inline)(xdrs, len)
+#define xdr_inline(xdrs, len) \
+ (*(xdrs)->x_ops->x_inline)(xdrs, len)
+
+#define XDR_DESTROY(xdrs) \
+ (*(xdrs)->x_ops->x_destroy)(xdrs)
+#define xdr_destroy(xdrs) \
+ (*(xdrs)->x_ops->x_destroy)(xdrs)
+
+#define XDR_CONTROL(xdrs, req, op) \
+ (*(xdrs)->x_ops->x_control)(xdrs, req, op)
+#define xdr_control(xdrs, req, op) \
+ (*(xdrs)->x_ops->x_control)(xdrs, req, op)
+
+/*
+ * Support struct for discriminated unions.
+ * You create an array of xdrdiscrim structures, terminated with
+ * an entry with a null procedure pointer. The xdr_union routine gets
+ * the discriminant value and then searches the array of structures
+ * for a matching value. If a match is found the associated xdr routine
+ * is called to handle that part of the union. If there is
+ * no match, then a default routine may be called.
+ * If there is no match and no default routine it is an error.
+ */
+
+
+/*
+ * An xdrproc_t exists for each data type which is to be encoded or decoded.
+ *
+ * The second argument to the xdrproc_t is a pointer to an opaque pointer.
+ * The opaque pointer generally points to a structure of the data type
+ * to be decoded. If this pointer is 0, then the type routines should
+ * allocate dynamic storage of the appropriate size and return it.
+ * bool_t (*xdrproc_t)(XDR *, void *);
+ */
+#ifdef __cplusplus
+typedef bool_t (*xdrproc_t)(XDR *, void *);
+#else
+#ifdef __STDC__
+typedef bool_t (*xdrproc_t)(); /* For Backward compatibility */
+#else
+typedef bool_t (*xdrproc_t)();
+#endif
+#endif
+
+#define NULL_xdrproc_t ((xdrproc_t)0)
+
+#if defined(_LP64) || defined(_I32LPx)
+#define xdr_rpcvers(xdrs, versp) xdr_u_int(xdrs, versp)
+#define xdr_rpcprog(xdrs, progp) xdr_u_int(xdrs, progp)
+#define xdr_rpcproc(xdrs, procp) xdr_u_int(xdrs, procp)
+#define xdr_rpcprot(xdrs, protp) xdr_u_int(xdrs, protp)
+#define xdr_rpcport(xdrs, portp) xdr_u_int(xdrs, portp)
+#else
+#define xdr_rpcvers(xdrs, versp) xdr_u_long(xdrs, versp)
+#define xdr_rpcprog(xdrs, progp) xdr_u_long(xdrs, progp)
+#define xdr_rpcproc(xdrs, procp) xdr_u_long(xdrs, procp)
+#define xdr_rpcprot(xdrs, protp) xdr_u_long(xdrs, protp)
+#define xdr_rpcport(xdrs, portp) xdr_u_long(xdrs, portp)
+#endif
+
+struct xdr_discrim {
+ int value;
+ xdrproc_t proc;
+};
+
+/*
+ * In-line routines for fast encode/decode of primitive data types.
+ * Caveat emptor: these use single memory cycles to get the
+ * data from the underlying buffer, and will fail to operate
+ * properly if the data is not aligned. The standard way to use these
+ * is to say:
+ * if ((buf = XDR_INLINE(xdrs, count)) == NULL)
+ * return (FALSE);
+ * <<< macro calls >>>
+ * where ``count'' is the number of bytes of data occupied
+ * by the primitive data types.
+ *
+ * N.B. and frozen for all time: each data type here uses 4 bytes
+ * of external representation.
+ */
+
+#define IXDR_GET_INT32(buf) ((int32_t)ntohl((uint32_t)*(buf)++))
+#define IXDR_PUT_INT32(buf, v) (*(buf)++ = (int32_t)htonl((uint32_t)v))
+#define IXDR_GET_U_INT32(buf) ((uint32_t)IXDR_GET_INT32(buf))
+#define IXDR_PUT_U_INT32(buf, v) IXDR_PUT_INT32((buf), ((int32_t)(v)))
+
+#if !defined(_KERNEL) && !defined(_LP64)
+
+#define IXDR_GET_LONG(buf) ((long)ntohl((ulong_t)*(buf)++))
+#define IXDR_PUT_LONG(buf, v) (*(buf)++ = (long)htonl((ulong_t)v))
+#define IXDR_GET_U_LONG(buf) ((ulong_t)IXDR_GET_LONG(buf))
+#define IXDR_PUT_U_LONG(buf, v) IXDR_PUT_LONG((buf), ((long)(v)))
+
+#define IXDR_GET_BOOL(buf) ((bool_t)IXDR_GET_LONG(buf))
+#define IXDR_GET_ENUM(buf, t) ((t)IXDR_GET_LONG(buf))
+#define IXDR_GET_SHORT(buf) ((short)IXDR_GET_LONG(buf))
+#define IXDR_GET_U_SHORT(buf) ((ushort_t)IXDR_GET_LONG(buf))
+
+#define IXDR_PUT_BOOL(buf, v) IXDR_PUT_LONG((buf), ((long)(v)))
+#define IXDR_PUT_ENUM(buf, v) IXDR_PUT_LONG((buf), ((long)(v)))
+#define IXDR_PUT_SHORT(buf, v) IXDR_PUT_LONG((buf), ((long)(v)))
+#define IXDR_PUT_U_SHORT(buf, v) IXDR_PUT_LONG((buf), ((long)(v)))
+
+#else
+
+#define IXDR_GET_BOOL(buf) ((bool_t)IXDR_GET_INT32(buf))
+#define IXDR_GET_ENUM(buf, t) ((t)IXDR_GET_INT32(buf))
+#define IXDR_GET_SHORT(buf) ((short)IXDR_GET_INT32(buf))
+#define IXDR_GET_U_SHORT(buf) ((ushort_t)IXDR_GET_INT32(buf))
+
+#define IXDR_PUT_BOOL(buf, v) IXDR_PUT_INT32((buf), ((int)(v)))
+#define IXDR_PUT_ENUM(buf, v) IXDR_PUT_INT32((buf), ((int)(v)))
+#define IXDR_PUT_SHORT(buf, v) IXDR_PUT_INT32((buf), ((int)(v)))
+#define IXDR_PUT_U_SHORT(buf, v) IXDR_PUT_INT32((buf), ((int)(v)))
+
+#endif
+
+#ifndef _LITTLE_ENDIAN
+#define IXDR_GET_HYPER(buf, v) { \
+ *((int32_t *)(&v)) = ntohl(*(uint32_t *)buf++); \
+ *((int32_t *)(((char *)&v) + BYTES_PER_XDR_UNIT)) \
+ = ntohl(*(uint32_t *)buf++); \
+ }
+#define IXDR_PUT_HYPER(buf, v) { \
+ *(buf)++ = (int32_t)htonl(*(uint32_t *) \
+ ((char *)&v)); \
+ *(buf)++ = \
+ (int32_t)htonl(*(uint32_t *)(((char *)&v) \
+ + BYTES_PER_XDR_UNIT)); \
+ }
+#else
+
+#define IXDR_GET_HYPER(buf, v) { \
+ *((int32_t *)(((char *)&v) + \
+ BYTES_PER_XDR_UNIT)) \
+ = ntohl(*(uint32_t *)buf++); \
+ *((int32_t *)(&v)) = \
+ ntohl(*(uint32_t *)buf++); \
+ }
+
+#define IXDR_PUT_HYPER(buf, v) { \
+ *(buf)++ = \
+ (int32_t)htonl(*(uint32_t *)(((char *)&v) + \
+ BYTES_PER_XDR_UNIT)); \
+ *(buf)++ = \
+ (int32_t)htonl(*(uint32_t *)((char *)&v)); \
+ }
+#endif
+#define IXDR_GET_U_HYPER(buf, v) IXDR_GET_HYPER(buf, v)
+#define IXDR_PUT_U_HYPER(buf, v) IXDR_PUT_HYPER(buf, v)
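
/*
 * Illustrative sketch (editorial, not part of this header): the inline
 * pattern described above, reserving three XDR units at once and
 * falling back to the ordinary filters when the stream cannot hand out
 * an aligned buffer.  The function name is hypothetical.
 */
static bool_t
example_put3(XDR *xdrs, int a, int b, int c)
{
	rpc_inline_t *buf;

	if ((buf = XDR_INLINE(xdrs, 3 * BYTES_PER_XDR_UNIT)) != NULL) {
		IXDR_PUT_INT32(buf, a);
		IXDR_PUT_INT32(buf, b);
		IXDR_PUT_INT32(buf, c);
		return (TRUE);
	}
	return (xdr_int(xdrs, &a) && xdr_int(xdrs, &b) && xdr_int(xdrs, &c));
}
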
+
+
+/*
+ * These are the "generic" xdr routines.
+ */
+#ifdef __STDC__
+extern bool_t xdr_void(void);
+extern bool_t xdr_int(XDR *, int *);
+extern bool_t xdr_u_int(XDR *, uint_t *);
+extern bool_t xdr_long(XDR *, long *);
+extern bool_t xdr_u_long(XDR *, ulong_t *);
+extern bool_t xdr_short(XDR *, short *);
+extern bool_t xdr_u_short(XDR *, ushort_t *);
+extern bool_t xdr_bool(XDR *, bool_t *);
+extern bool_t xdr_enum(XDR *, enum_t *);
+extern bool_t xdr_array(XDR *, caddr_t *, uint_t *, const uint_t,
+ const uint_t, const xdrproc_t);
+extern bool_t xdr_bytes(XDR *, char **, uint_t *, const uint_t);
+extern bool_t xdr_opaque(XDR *, caddr_t, const uint_t);
+extern bool_t xdr_string(XDR *, char **, const uint_t);
+extern bool_t xdr_union(XDR *, enum_t *, char *,
+ const struct xdr_discrim *, const xdrproc_t);
+extern unsigned int xdr_sizeof(xdrproc_t, void *);
+
+extern bool_t xdr_hyper(XDR *, longlong_t *);
+extern bool_t xdr_longlong_t(XDR *, longlong_t *);
+extern bool_t xdr_u_hyper(XDR *, u_longlong_t *);
+extern bool_t xdr_u_longlong_t(XDR *, u_longlong_t *);
+
+extern bool_t xdr_char(XDR *, char *);
+extern bool_t xdr_wrapstring(XDR *, char **);
+extern bool_t xdr_reference(XDR *, caddr_t *, uint_t, const xdrproc_t);
+extern bool_t xdr_pointer(XDR *, char **, uint_t, const xdrproc_t);
+extern void xdr_free(xdrproc_t, char *);
+extern bool_t xdr_time_t(XDR *, time_t *);
+
+extern bool_t xdr_int8_t(XDR *, int8_t *);
+extern bool_t xdr_uint8_t(XDR *, uint8_t *);
+extern bool_t xdr_int16_t(XDR *, int16_t *);
+extern bool_t xdr_uint16_t(XDR *, uint16_t *);
+extern bool_t xdr_int32_t(XDR *, int32_t *);
+extern bool_t xdr_uint32_t(XDR *, uint32_t *);
+#if defined(_INT64_TYPE)
+extern bool_t xdr_int64_t(XDR *, int64_t *);
+extern bool_t xdr_uint64_t(XDR *, uint64_t *);
+#endif
+
+#ifndef _KERNEL
+extern bool_t xdr_u_char(XDR *, uchar_t *);
+extern bool_t xdr_vector(XDR *, char *, const uint_t, const uint_t, const
+xdrproc_t);
+extern bool_t xdr_float(XDR *, float *);
+extern bool_t xdr_double(XDR *, double *);
+extern bool_t xdr_quadruple(XDR *, long double *);
+#endif /* !_KERNEL */
+#else
+extern bool_t xdr_void();
+extern bool_t xdr_int();
+extern bool_t xdr_u_int();
+extern bool_t xdr_long();
+extern bool_t xdr_u_long();
+extern bool_t xdr_short();
+extern bool_t xdr_u_short();
+extern bool_t xdr_bool();
+extern bool_t xdr_enum();
+extern bool_t xdr_array();
+extern bool_t xdr_bytes();
+extern bool_t xdr_opaque();
+extern bool_t xdr_string();
+extern bool_t xdr_union();
+
+extern bool_t xdr_hyper();
+extern bool_t xdr_longlong_t();
+extern bool_t xdr_u_hyper();
+extern bool_t xdr_u_longlong_t();
+extern bool_t xdr_char();
+extern bool_t xdr_reference();
+extern bool_t xdr_pointer();
+extern void xdr_free();
+extern bool_t xdr_wrapstring();
+extern bool_t xdr_time_t();
+
+extern bool_t xdr_int8_t();
+extern bool_t xdr_uint8_t();
+extern bool_t xdr_int16_t();
+extern bool_t xdr_uint16_t();
+extern bool_t xdr_int32_t();
+extern bool_t xdr_uint32_t();
+#if defined(_INT64_TYPE)
+extern bool_t xdr_int64_t();
+extern bool_t xdr_uint64_t();
+#endif
+
+#ifndef _KERNEL
+extern bool_t xdr_u_char();
+extern bool_t xdr_vector();
+extern bool_t xdr_float();
+extern bool_t xdr_double();
+extern bool_t xdr_quadruple();
+#endif /* !_KERNEL */
+#endif
+
+/*
+ * Common opaque bytes objects used by many rpc protocols;
+ * declared here due to commonality.
+ */
+#define MAX_NETOBJ_SZ 1024
+struct netobj {
+ uint_t n_len;
+ char *n_bytes;
+};
+typedef struct netobj netobj;
+
+#ifdef __STDC__
+extern bool_t xdr_netobj(XDR *, netobj *);
+#else
+extern bool_t xdr_netobj();
+#endif
+
+/*
+ * These are XDR control operators
+ */
+
+#define XDR_GET_BYTES_AVAIL 1
+
+struct xdr_bytesrec {
+ bool_t xc_is_last_record;
+ size_t xc_num_avail;
+};
+
+typedef struct xdr_bytesrec xdr_bytesrec;
+
+/*
+ * These are the request arguments to XDR_CONTROL.
+ *
+ * XDR_PEEK - returns the contents of the next XDR unit on the XDR stream.
+ * XDR_SKIPBYTES - skips the next N bytes in the XDR stream.
+ * XDR_RDMAGET - for xdr implementation over RDMA, gets private flags from
+ *		 the XDR stream being moved over RDMA.
+ * XDR_RDMASET - for xdr implementation over RDMA, sets private flags in
+ *		 the XDR stream moving over RDMA.
+ */
+#ifdef _KERNEL
+#define XDR_PEEK 2
+#define XDR_SKIPBYTES 3
+#define XDR_RDMAGET 4
+#define XDR_RDMASET 5
+#endif
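
/*
 * Illustrative sketch (editorial, not part of this header): using
 * XDR_CONTROL to check how much data remains in a decode stream and to
 * skip a blob without copying it.  Only the memory-stream backend in
 * this import (xdr_mem.c) is assumed to implement these requests; the
 * function name is hypothetical and bloblen is assumed non-negative.
 */
static bool_t
example_skip_blob(XDR *xdrs, int32_t bloblen)
{
	xdr_bytesrec rec;

	if (!XDR_CONTROL(xdrs, XDR_GET_BYTES_AVAIL, &rec))
		return (FALSE);
	if (rec.xc_num_avail < (size_t)RNDUP(bloblen))
		return (FALSE);
	return (XDR_CONTROL(xdrs, XDR_SKIPBYTES, &bloblen));
}
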
+
+/*
+ * These are the public routines for the various implementations of
+ * xdr streams.
+ */
+#ifndef _KERNEL
+#ifdef __STDC__
+extern void xdrmem_create(XDR *, const caddr_t, const uint_t, const enum
+xdr_op);
+ /* XDR using memory buffers */
+extern void xdrrec_create(XDR *, const uint_t, const uint_t, const caddr_t,
+int (*) (void *, caddr_t, int), int (*) (void *, caddr_t, int));
+/* XDR pseudo records for tcp */
+extern bool_t xdrrec_endofrecord(XDR *, bool_t);
+/* make end of xdr record */
+extern bool_t xdrrec_skiprecord(XDR *);
+/* move to beginning of next record */
+extern bool_t xdrrec_eof(XDR *);
+extern uint_t xdrrec_readbytes(XDR *, caddr_t, uint_t);
+/* true if no more input */
+#else
+extern void xdrmem_create();
+extern void xdrstdio_create();
+extern void xdrrec_create();
+extern bool_t xdrrec_endofrecord();
+extern bool_t xdrrec_skiprecord();
+extern bool_t xdrrec_eof();
+extern uint_t xdrrec_readbytes();
+#endif
+#else
+
+extern void xdrmem_create(XDR *, caddr_t, uint_t, enum xdr_op);
+
+extern struct xdr_ops xdrmblk_ops;
+
+struct rpc_msg;
+extern bool_t xdr_callmsg(XDR *, struct rpc_msg *);
+extern bool_t xdr_replymsg_body(XDR *, struct rpc_msg *);
+extern bool_t xdr_replymsg_hdr(XDR *, struct rpc_msg *);
+
+#include <sys/malloc.h>
+#ifdef mem_alloc
+#undef mem_alloc
+#define mem_alloc(size) malloc((size), M_TEMP, M_WAITOK | M_ZERO)
+#endif
+#ifdef mem_free
+#undef mem_free
+#define mem_free(ptr, size) free((ptr), M_TEMP)
+#endif
+
+#endif /* !_KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !_RPC_XDR_H */
diff --git a/sys/contrib/opensolaris/uts/common/rpc/xdr_array.c b/sys/contrib/opensolaris/uts/common/rpc/xdr_array.c
new file mode 100644
index 000000000000..3711e53d14c1
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/rpc/xdr_array.c
@@ -0,0 +1,123 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * xdr_array.c, Generic XDR routines implementation.
+ * These are the "non-trivial" xdr primitives used to serialize and de-serialize
+ * arrays. See xdr.h for more info on the interface to xdr.
+ */
+
+#include <sys/param.h>
+#include <sys/cmn_err.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+
+#include <rpc/types.h>
+#include <rpc/xdr.h>
+
+#define LASTUNSIGNED ((uint_t)0-1)
+
+/*
+ * XDR an array of arbitrary elements
+ * *addrp is a pointer to the array, *sizep is the number of elements.
+ * If *addrp is NULL, (*sizep * elsize) bytes are allocated.
+ * elsize is the size (in bytes) of each element, and elproc is the
+ * xdr procedure to call to handle each element of the array.
+ */
+bool_t
+xdr_array(XDR *xdrs, caddr_t *addrp, uint_t *sizep, const uint_t maxsize,
+ const uint_t elsize, const xdrproc_t elproc)
+{
+ uint_t i;
+ caddr_t target = *addrp;
+ uint_t c; /* the actual element count */
+ bool_t stat = TRUE;
+ uint_t nodesize;
+
+ /* like strings, arrays are really counted arrays */
+ if (!xdr_u_int(xdrs, sizep)) {
+#ifdef DEBUG
+ printf("xdr_array: size FAILED\n");
+#endif
+ return (FALSE);
+ }
+ c = *sizep;
+ if ((c > maxsize || LASTUNSIGNED / elsize < c) &&
+ xdrs->x_op != XDR_FREE) {
+#ifdef DEBUG
+ printf("xdr_array: bad size FAILED\n");
+#endif
+ return (FALSE);
+ }
+ nodesize = c * elsize;
+
+ /*
+ * if we are deserializing, we may need to allocate an array.
+ * We also save time by checking for a null array if we are freeing.
+ */
+ if (target == NULL)
+ switch (xdrs->x_op) {
+ case XDR_DECODE:
+ if (c == 0)
+ return (TRUE);
+ *addrp = target = (char *)mem_alloc(nodesize);
+ bzero(target, nodesize);
+ break;
+
+ case XDR_FREE:
+ return (TRUE);
+
+ case XDR_ENCODE:
+ break;
+ }
+
+ /*
+ * now we xdr each element of array
+ */
+ for (i = 0; (i < c) && stat; i++) {
+ stat = (*elproc)(xdrs, target, LASTUNSIGNED);
+ target += elsize;
+ }
+
+ /*
+ * the array may need freeing
+ */
+ if (xdrs->x_op == XDR_FREE) {
+ mem_free(*addrp, nodesize);
+ *addrp = NULL;
+ }
+ return (stat);
+}
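
/*
 * Illustrative sketch (editorial, not part of the imported source): a
 * thin wrapper around xdr_array() for a counted array of ints.  On
 * decode a NULL *listp makes xdr_array() allocate the array; the
 * XDR_FREE operation releases it again.  The wrapper name and the
 * 1024-element limit are hypothetical.
 */
static bool_t
xdr_example_intlist(XDR *xdrs, int **listp, uint_t *countp)
{
	return (xdr_array(xdrs, (caddr_t *)listp, countp, 1024,
	    sizeof (int), (xdrproc_t)xdr_int));
}
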
diff --git a/sys/contrib/opensolaris/uts/common/rpc/xdr_mem.c b/sys/contrib/opensolaris/uts/common/rpc/xdr_mem.c
new file mode 100644
index 000000000000..32ff32daaa07
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/rpc/xdr_mem.c
@@ -0,0 +1,209 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * xdr_mem.c, XDR implementation using memory buffers.
+ *
+ * If you have some data to be interpreted as external data representation
+ * or to be converted to external data representation in a memory buffer,
+ * then this is the package for you.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+
+#include <rpc/types.h>
+#include <rpc/xdr.h>
+
+static struct xdr_ops *xdrmem_ops(void);
+
+/*
+ * The procedure xdrmem_create initializes a stream descriptor for a
+ * memory buffer.
+ */
+void
+xdrmem_create(XDR *xdrs, caddr_t addr, uint_t size, enum xdr_op op)
+{
+ xdrs->x_op = op;
+ xdrs->x_ops = xdrmem_ops();
+ xdrs->x_private = xdrs->x_base = addr;
+ xdrs->x_handy = size;
+ xdrs->x_public = NULL;
+}
+
+/* ARGSUSED */
+static void
+xdrmem_destroy(XDR *xdrs)
+{
+}
+
+static bool_t
+xdrmem_getint32(XDR *xdrs, int32_t *int32p)
+{
+ if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0)
+ return (FALSE);
+ /* LINTED pointer alignment */
+ *int32p = (int32_t)ntohl((uint32_t)(*((int32_t *)(xdrs->x_private))));
+ xdrs->x_private += sizeof (int32_t);
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_putint32(XDR *xdrs, int32_t *int32p)
+{
+ if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0)
+ return (FALSE);
+ /* LINTED pointer alignment */
+ *(int32_t *)xdrs->x_private = (int32_t)htonl((uint32_t)(*int32p));
+ xdrs->x_private += sizeof (int32_t);
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_getbytes(XDR *xdrs, caddr_t addr, int len)
+{
+ if ((xdrs->x_handy -= len) < 0)
+ return (FALSE);
+ bcopy(xdrs->x_private, addr, len);
+ xdrs->x_private += len;
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_putbytes(XDR *xdrs, caddr_t addr, int len)
+{
+ if ((xdrs->x_handy -= len) < 0)
+ return (FALSE);
+ bcopy(addr, xdrs->x_private, len);
+ xdrs->x_private += len;
+ return (TRUE);
+}
+
+static uint_t
+xdrmem_getpos(XDR *xdrs)
+{
+ return ((uint_t)((uintptr_t)xdrs->x_private - (uintptr_t)xdrs->x_base));
+}
+
+static bool_t
+xdrmem_setpos(XDR *xdrs, uint_t pos)
+{
+ caddr_t newaddr = xdrs->x_base + pos;
+ caddr_t lastaddr = xdrs->x_private + xdrs->x_handy;
+ ptrdiff_t diff;
+
+ if (newaddr > lastaddr)
+ return (FALSE);
+ xdrs->x_private = newaddr;
+ diff = lastaddr - newaddr;
+ xdrs->x_handy = (int)diff;
+ return (TRUE);
+}
+
+static rpc_inline_t *
+xdrmem_inline(XDR *xdrs, int len)
+{
+ rpc_inline_t *buf = NULL;
+
+ if (xdrs->x_handy >= len) {
+ xdrs->x_handy -= len;
+ /* LINTED pointer alignment */
+ buf = (rpc_inline_t *)xdrs->x_private;
+ xdrs->x_private += len;
+ }
+ return (buf);
+}
+
+static bool_t
+xdrmem_control(XDR *xdrs, int request, void *info)
+{
+ xdr_bytesrec *xptr;
+ int32_t *int32p;
+ int len;
+
+ switch (request) {
+
+ case XDR_GET_BYTES_AVAIL:
+ xptr = (xdr_bytesrec *)info;
+ xptr->xc_is_last_record = TRUE;
+ xptr->xc_num_avail = xdrs->x_handy;
+ return (TRUE);
+
+ case XDR_PEEK:
+ /*
+ * Return the next 4 byte unit in the XDR stream.
+ */
+ if (xdrs->x_handy < sizeof (int32_t))
+ return (FALSE);
+ int32p = (int32_t *)info;
+ *int32p = (int32_t)ntohl((uint32_t)
+ (*((int32_t *)(xdrs->x_private))));
+ return (TRUE);
+
+ case XDR_SKIPBYTES:
+ /*
+ * Skip the next N bytes in the XDR stream.
+ */
+ int32p = (int32_t *)info;
+ len = RNDUP((int)(*int32p));
+ if ((xdrs->x_handy -= len) < 0)
+ return (FALSE);
+ xdrs->x_private += len;
+ return (TRUE);
+
+ }
+ return (FALSE);
+}
+
+static struct xdr_ops *
+xdrmem_ops(void)
+{
+ static struct xdr_ops ops;
+
+ if (ops.x_getint32 == NULL) {
+ ops.x_getbytes = xdrmem_getbytes;
+ ops.x_putbytes = xdrmem_putbytes;
+ ops.x_getpostn = xdrmem_getpos;
+ ops.x_setpostn = xdrmem_setpos;
+ ops.x_inline = xdrmem_inline;
+ ops.x_destroy = xdrmem_destroy;
+ ops.x_control = xdrmem_control;
+ ops.x_getint32 = xdrmem_getint32;
+ ops.x_putint32 = xdrmem_putint32;
+ }
+ return (&ops);
+}
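
A minimal usage sketch for the xdrmem stream above: encode a host integer into a
buffer and decode it back. It assumes the standard XDR primitive xdr_int() from
<rpc/xdr.h>, which is not defined in this file; the buffer size and value are
illustrative only.

#include <rpc/types.h>
#include <rpc/xdr.h>

void
xdrmem_example(void)
{
	char buf[64];
	XDR xenc, xdec;
	int value = 42, decoded = 0;

	/* encode the host int into big-endian XDR form in buf */
	xdrmem_create(&xenc, buf, sizeof (buf), XDR_ENCODE);
	if (!xdr_int(&xenc, &value))
		return;		/* x_handy went negative: buffer too small */

	/* decode it back from the same buffer */
	xdrmem_create(&xdec, buf, sizeof (buf), XDR_DECODE);
	if (!xdr_int(&xdec, &decoded))
		return;
	/* decoded == 42 */
}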
diff --git a/sys/contrib/opensolaris/uts/common/sys/asm_linkage.h b/sys/contrib/opensolaris/uts/common/sys/asm_linkage.h
new file mode 100644
index 000000000000..66481a69f2f1
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/asm_linkage.h
@@ -0,0 +1,304 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _IA32_SYS_ASM_LINKAGE_H
+#define _IA32_SYS_ASM_LINKAGE_H
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _ASM /* The remainder of this file is only for assembly files */
+
+/*
+ * make annoying differences in assembler syntax go away
+ */
+
+/*
+ * D16 and A16 are used to insert instructions prefixes; the
+ * macros help the assembler code be slightly more portable.
+ */
+#if !defined(__GNUC_AS__)
+/*
+ * /usr/ccs/bin/as prefixes are parsed as separate instructions
+ */
+#define D16 data16;
+#define A16 addr16;
+
+/*
+ * (There are some weird constructs in constant expressions)
+ */
+#define _CONST(const) [const]
+#define _BITNOT(const) -1!_CONST(const)
+#define _MUL(a, b) _CONST(a \* b)
+
+#else
+/*
+ * Why not use the 'data16' and 'addr16' prefixes .. well, the
+ * assembler doesn't quite believe in real mode, and thus argues with
+ * us about what we're trying to do.
+ */
+#define D16 .byte 0x66;
+#define A16 .byte 0x67;
+
+#define _CONST(const) (const)
+#define _BITNOT(const) ~_CONST(const)
+#define _MUL(a, b) _CONST(a * b)
+
+#endif
+
+/*
+ * C pointers are different sizes between i386 and amd64.
+ * These constants can be used to compute offsets into pointer arrays.
+ */
+#if defined(__amd64)
+#define CLONGSHIFT 3
+#define CLONGSIZE 8
+#define CLONGMASK 7
+#elif defined(__i386)
+#define CLONGSHIFT 2
+#define CLONGSIZE 4
+#define CLONGMASK 3
+#endif
+
+/*
+ * Since we know we're either ILP32 or LP64 ..
+ */
+#define CPTRSHIFT CLONGSHIFT
+#define CPTRSIZE CLONGSIZE
+#define CPTRMASK CLONGMASK
+
+#if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT)
+#error "inconsistent shift constants"
+#endif
+
+#if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1)
+#error "inconsistent mask constants"
+#endif
+
+#define ASM_ENTRY_ALIGN 16
+
+/*
+ * SSE register alignment and save areas
+ */
+
+#define XMM_SIZE 16
+#define XMM_ALIGN 16
+
+#if defined(__amd64)
+
+#define SAVE_XMM_PROLOG(sreg, nreg) \
+ subq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp; \
+ movq %rsp, sreg
+
+#define RSTOR_XMM_EPILOG(sreg, nreg) \
+ addq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp
+
+#elif defined(__i386)
+
+#define SAVE_XMM_PROLOG(sreg, nreg) \
+ subl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; \
+ movl %esp, sreg; \
+ addl $XMM_ALIGN, sreg; \
+ andl $_BITNOT(XMM_ALIGN-1), sreg
+
+#define RSTOR_XMM_EPILOG(sreg, nreg) \
+ addl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp;
+
+#endif /* __i386 */
+
+/*
+ * profiling causes definitions of the MCOUNT and RTMCOUNT macros
+ * particular to the profiling type
+ */
+#ifdef GPROF
+
+#define MCOUNT(x) \
+ pushl %ebp; \
+ movl %esp, %ebp; \
+ call _mcount; \
+ popl %ebp
+
+#endif /* GPROF */
+
+#ifdef PROF
+
+#define MCOUNT(x) \
+/* CSTYLED */ \
+ .lcomm .L_/**/x/**/1, 4, 4; \
+ pushl %ebp; \
+ movl %esp, %ebp; \
+/* CSTYLED */ \
+ movl $.L_/**/x/**/1, %edx; \
+ call _mcount; \
+ popl %ebp
+
+#endif /* PROF */
+
+/*
+ * if we are not profiling, MCOUNT should be defined to nothing
+ */
+#if !defined(PROF) && !defined(GPROF)
+#define MCOUNT(x)
+#endif /* !defined(PROF) && !defined(GPROF) */
+
+#define RTMCOUNT(x) MCOUNT(x)
+
+/*
+ * Macro to define weak symbol aliases. These are similar to the ANSI-C
+ * #pragma weak name = _name
+ * except that a compiler can determine the type; the assembler must be told.
+ * Hence, the second parameter must be the type of the symbol (i.e.: function,...)
+ */
+#define ANSI_PRAGMA_WEAK(sym, stype) \
+ .weak sym; \
+ .type sym, @stype; \
+/* CSTYLED */ \
+sym = _/**/sym
+
+/*
+ * Like ANSI_PRAGMA_WEAK(), but for unrelated names, as in:
+ * #pragma weak sym1 = sym2
+ */
+#define ANSI_PRAGMA_WEAK2(sym1, sym2, stype) \
+ .weak sym1; \
+ .type sym1, @stype; \
+sym1 = sym2
+
+/*
+ * ENTRY provides the standard procedure entry code and an easy way to
+ * insert the calls to mcount for profiling. ENTRY_NP is identical, but
+ * never calls mcount.
+ */
+#define ENTRY(x) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x; \
+ .type x, @function; \
+x: MCOUNT(x)
+
+#define ENTRY_NP(x) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x; \
+ .type x, @function; \
+x:
+
+#define RTENTRY(x) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x; \
+ .type x, @function; \
+x: RTMCOUNT(x)
+
+/*
+ * ENTRY2 is identical to ENTRY but provides two labels for the entry point.
+ */
+#define ENTRY2(x, y) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x, y; \
+ .type x, @function; \
+ .type y, @function; \
+/* CSTYLED */ \
+x: ; \
+y: MCOUNT(x)
+
+#define ENTRY_NP2(x, y) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x, y; \
+ .type x, @function; \
+ .type y, @function; \
+/* CSTYLED */ \
+x: ; \
+y:
+
+
+/*
+ * ALTENTRY provides for additional entry points.
+ */
+#define ALTENTRY(x) \
+ .globl x; \
+ .type x, @function; \
+x:
+
+/*
+ * DGDEF and DGDEF2 provide global data declarations.
+ *
+ * DGDEF provides a word aligned word of storage.
+ *
+ * DGDEF2 allocates "sz" bytes of storage with **NO** alignment, which
+ * makes this macro best suited to byte arrays.
+ *
+ * DGDEF3 allocates "sz" bytes of storage with "algn" alignment.
+ */
+#define DGDEF2(name, sz) \
+ .data; \
+ .globl name; \
+ .type name, @object; \
+ .size name, sz; \
+name:
+
+#define DGDEF3(name, sz, algn) \
+ .data; \
+ .align algn; \
+ .globl name; \
+ .type name, @object; \
+ .size name, sz; \
+name:
+
+#define DGDEF(name) DGDEF3(name, 4, 4)
+
+/*
+ * SET_SIZE trails a function and sets the size for the ELF symbol table.
+ */
+#define SET_SIZE(x) \
+ .size x, [.-x]
+
+/*
+ * NWORD provides native word value.
+ */
+#if defined(__amd64)
+
+/*CSTYLED*/
+#define NWORD quad
+
+#elif defined(__i386)
+
+#define NWORD long
+
+#endif /* __i386 */
+
+#endif /* _ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _IA32_SYS_ASM_LINKAGE_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/atomic.h b/sys/contrib/opensolaris/uts/common/sys/atomic.h
new file mode 100644
index 000000000000..de968c598a78
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/atomic.h
@@ -0,0 +1,431 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ATOMIC_H
+#define _SYS_ATOMIC_H
+
+
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_KERNEL) && defined(__GNUC__) && defined(_ASM_INLINES) && \
+ (defined(__i386) || defined(__amd64))
+#include <asm/atomic.h>
+#endif
+
+#if defined(_KERNEL) || defined(__STDC__)
+/*
+ * Increment target.
+ */
+extern void atomic_inc_8(volatile uint8_t *);
+extern void atomic_inc_uchar(volatile uchar_t *);
+extern void atomic_inc_16(volatile uint16_t *);
+extern void atomic_inc_ushort(volatile ushort_t *);
+extern void atomic_inc_32(volatile uint32_t *);
+extern void atomic_inc_uint(volatile uint_t *);
+extern void atomic_inc_ulong(volatile ulong_t *);
+#if defined(_KERNEL) || defined(_INT64_TYPE)
+extern void atomic_inc_64(volatile uint64_t *);
+#endif
+
+/*
+ * Decrement target
+ */
+extern void atomic_dec_8(volatile uint8_t *);
+extern void atomic_dec_uchar(volatile uchar_t *);
+extern void atomic_dec_16(volatile uint16_t *);
+extern void atomic_dec_ushort(volatile ushort_t *);
+extern void atomic_dec_32(volatile uint32_t *);
+extern void atomic_dec_uint(volatile uint_t *);
+extern void atomic_dec_ulong(volatile ulong_t *);
+#if defined(_KERNEL) || defined(_INT64_TYPE)
+extern void atomic_dec_64(volatile uint64_t *);
+#endif
+
+/*
+ * Add delta to target
+ */
+#ifdef __i386__
+#if defined(_KERNEL) || defined(_INT64_TYPE)
+extern void atomic_add_64(volatile uint64_t *, int64_t);
+#endif
+#endif
+
+/*
+ * logical OR bits with target
+ */
+extern void atomic_or_8(volatile uint8_t *, uint8_t);
+extern void atomic_or_uchar(volatile uchar_t *, uchar_t);
+extern void atomic_or_16(volatile uint16_t *, uint16_t);
+extern void atomic_or_ushort(volatile ushort_t *, ushort_t);
+extern void atomic_or_32(volatile uint32_t *, uint32_t);
+extern void atomic_or_uint(volatile uint_t *, uint_t);
+extern void atomic_or_ulong(volatile ulong_t *, ulong_t);
+#if defined(_KERNEL) || defined(_INT64_TYPE)
+extern void atomic_or_64(volatile uint64_t *, uint64_t);
+#endif
+
+/*
+ * logical AND bits with target
+ */
+extern void atomic_and_8(volatile uint8_t *, uint8_t);
+extern void atomic_and_uchar(volatile uchar_t *, uchar_t);
+extern void atomic_and_16(volatile uint16_t *, uint16_t);
+extern void atomic_and_ushort(volatile ushort_t *, ushort_t);
+extern void atomic_and_32(volatile uint32_t *, uint32_t);
+extern void atomic_and_uint(volatile uint_t *, uint_t);
+extern void atomic_and_ulong(volatile ulong_t *, ulong_t);
+#if defined(_KERNEL) || defined(_INT64_TYPE)
+extern void atomic_and_64(volatile uint64_t *, uint64_t);
+#endif
+
+/*
+ * As above, but return the new value. Note that these _nv() variants are
+ * substantially more expensive on some platforms than the no-return-value
+ * versions above, so don't use them unless you really need to know the
+ * new value *atomically* (e.g. when decrementing a reference count and
+ * checking whether it went to zero).
+ */
+
+/*
+ * Increment target and return new value.
+ */
+extern uint8_t atomic_inc_8_nv(volatile uint8_t *);
+extern uchar_t atomic_inc_uchar_nv(volatile uchar_t *);
+extern uint16_t atomic_inc_16_nv(volatile uint16_t *);
+extern ushort_t atomic_inc_ushort_nv(volatile ushort_t *);
+extern uint32_t atomic_inc_32_nv(volatile uint32_t *);
+extern uint_t atomic_inc_uint_nv(volatile uint_t *);
+extern ulong_t atomic_inc_ulong_nv(volatile ulong_t *);
+#if defined(_KERNEL) || defined(_INT64_TYPE)
+extern uint64_t atomic_inc_64_nv(volatile uint64_t *);
+#endif
+
+/*
+ * Decrement target and return new value.
+ */
+extern uint8_t atomic_dec_8_nv(volatile uint8_t *);
+extern uchar_t atomic_dec_uchar_nv(volatile uchar_t *);
+extern uint16_t atomic_dec_16_nv(volatile uint16_t *);
+extern ushort_t atomic_dec_ushort_nv(volatile ushort_t *);
+extern uint32_t atomic_dec_32_nv(volatile uint32_t *);
+extern uint_t atomic_dec_uint_nv(volatile uint_t *);
+extern ulong_t atomic_dec_ulong_nv(volatile ulong_t *);
+#if defined(_KERNEL) || defined(_INT64_TYPE)
+extern uint64_t atomic_dec_64_nv(volatile uint64_t *);
+#endif
+
+/*
+ * Add delta to target
+ */
+extern uint8_t atomic_add_8_nv(volatile uint8_t *, int8_t);
+extern uchar_t atomic_add_char_nv(volatile uchar_t *, signed char);
+extern uint16_t atomic_add_16_nv(volatile uint16_t *, int16_t);
+extern ushort_t atomic_add_short_nv(volatile ushort_t *, short);
+extern uint32_t atomic_add_32_nv(volatile uint32_t *, int32_t);
+extern uint_t atomic_add_int_nv(volatile uint_t *, int);
+extern void *atomic_add_ptr_nv(volatile void *, ssize_t);
+extern ulong_t atomic_add_long_nv(volatile ulong_t *, long);
+#if defined(_KERNEL) || defined(_INT64_TYPE)
+extern uint64_t atomic_add_64_nv(volatile uint64_t *, int64_t);
+#endif
+
+/*
+ * logical OR bits with target and return new value.
+ */
+extern uint8_t atomic_or_8_nv(volatile uint8_t *, uint8_t);
+extern uchar_t atomic_or_uchar_nv(volatile uchar_t *, uchar_t);
+extern uint16_t atomic_or_16_nv(volatile uint16_t *, uint16_t);
+extern ushort_t atomic_or_ushort_nv(volatile ushort_t *, ushort_t);
+extern uint32_t atomic_or_32_nv(volatile uint32_t *, uint32_t);
+extern uint_t atomic_or_uint_nv(volatile uint_t *, uint_t);
+extern ulong_t atomic_or_ulong_nv(volatile ulong_t *, ulong_t);
+#if defined(_KERNEL) || defined(_INT64_TYPE)
+extern uint64_t atomic_or_64_nv(volatile uint64_t *, uint64_t);
+#endif
+
+/*
+ * logical AND bits with target and return new value.
+ */
+extern uint8_t atomic_and_8_nv(volatile uint8_t *, uint8_t);
+extern uchar_t atomic_and_uchar_nv(volatile uchar_t *, uchar_t);
+extern uint16_t atomic_and_16_nv(volatile uint16_t *, uint16_t);
+extern ushort_t atomic_and_ushort_nv(volatile ushort_t *, ushort_t);
+extern uint32_t atomic_and_32_nv(volatile uint32_t *, uint32_t);
+extern uint_t atomic_and_uint_nv(volatile uint_t *, uint_t);
+extern ulong_t atomic_and_ulong_nv(volatile ulong_t *, ulong_t);
+#if defined(_KERNEL) || defined(_INT64_TYPE)
+extern uint64_t atomic_and_64_nv(volatile uint64_t *, uint64_t);
+#endif
+
+/*
+ * If *arg1 == arg2, set *arg1 = arg3; return old value
+ */
+extern uint8_t atomic_cas_8(volatile uint8_t *, uint8_t, uint8_t);
+extern uchar_t atomic_cas_uchar(volatile uchar_t *, uchar_t, uchar_t);
+extern uint16_t atomic_cas_16(volatile uint16_t *, uint16_t, uint16_t);
+extern ushort_t atomic_cas_ushort(volatile ushort_t *, ushort_t, ushort_t);
+extern uint32_t atomic_cas_32(volatile uint32_t *, uint32_t, uint32_t);
+extern uint_t atomic_cas_uint(volatile uint_t *, uint_t, uint_t);
+extern void *atomic_cas_ptr(volatile void *, void *, void *);
+extern ulong_t atomic_cas_ulong(volatile ulong_t *, ulong_t, ulong_t);
+#if defined(_KERNEL) || defined(_INT64_TYPE)
+extern uint64_t atomic_cas_64(volatile uint64_t *, uint64_t, uint64_t);
+#endif
+
+/*
+ * Swap target and return old value
+ */
+extern uint8_t atomic_swap_8(volatile uint8_t *, uint8_t);
+extern uchar_t atomic_swap_uchar(volatile uchar_t *, uchar_t);
+extern uint16_t atomic_swap_16(volatile uint16_t *, uint16_t);
+extern ushort_t atomic_swap_ushort(volatile ushort_t *, ushort_t);
+extern uint32_t atomic_swap_32(volatile uint32_t *, uint32_t);
+extern uint_t atomic_swap_uint(volatile uint_t *, uint_t);
+extern void *atomic_swap_ptr(volatile void *, void *);
+extern ulong_t atomic_swap_ulong(volatile ulong_t *, ulong_t);
+#if defined(_KERNEL) || defined(_INT64_TYPE)
+extern uint64_t atomic_swap_64(volatile uint64_t *, uint64_t);
+#endif
+
+/*
+ * Perform an exclusive atomic bit set/clear on a target.
+ * Returns 0 if bit was successfully set/cleared, or -1
+ * if the bit was already set/cleared.
+ */
+extern int atomic_set_long_excl(volatile ulong_t *, uint_t);
+extern int atomic_clear_long_excl(volatile ulong_t *, uint_t);
+
+/*
+ * Generic memory barrier used during lock entry, placed after the
+ * memory operation that acquires the lock to guarantee that the lock
+ * protects its data. No stores from after the memory barrier will
+ * reach visibility, and no loads from after the barrier will be
+ * resolved, before the lock acquisition reaches global visibility.
+ */
+extern void membar_enter(void);
+
+/*
+ * Generic memory barrier used during lock exit, placed before the
+ * memory operation that releases the lock to guarantee that the lock
+ * protects its data. All loads and stores issued before the barrier
+ * will be resolved before the subsequent lock update reaches visibility.
+ */
+extern void membar_exit(void);
+
+/*
+ * Arrange that all stores issued before this point in the code reach
+ * global visibility before any stores that follow; useful in producer
+ * modules that update a data item, then set a flag that it is available.
+ * The memory barrier guarantees that the available flag is not visible
+ * earlier than the updated data, i.e. it imposes store ordering.
+ */
+extern void membar_producer(void);
+
+/*
+ * Arrange that all loads issued before this point in the code are
+ * completed before any subsequent loads; useful in consumer modules
+ * that check to see if data is available and read the data.
+ * The memory barrier guarantees that the data is not sampled until
+ * after the available flag has been seen, i.e. it imposes load ordering.
+ */
+extern void membar_consumer(void);
+#endif
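
A minimal sketch of the producer/consumer pattern described by the two comments
above. The data and flag variables are illustrative, and the flag is assumed to
be read and written atomically.

#include <sys/atomic.h>

static int shared_data;
static volatile int data_ready;

static void
producer(void)
{
	shared_data = 42;	/* publish the data first */
	membar_producer();	/* order the data store before the flag store */
	data_ready = 1;		/* then raise the flag */
}

static int
consumer(void)
{
	while (data_ready == 0)
		continue;	/* spin until the flag is seen */
	membar_consumer();	/* order the flag load before the data load */
	return (shared_data);	/* guaranteed to observe 42 */
}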
+
+#if !defined(_KERNEL) && !defined(__STDC__)
+extern void atomic_inc_8();
+extern void atomic_inc_uchar();
+extern void atomic_inc_16();
+extern void atomic_inc_ushort();
+extern void atomic_inc_32();
+extern void atomic_inc_uint();
+extern void atomic_inc_ulong();
+#if defined(_INT64_TYPE)
+extern void atomic_inc_64();
+#endif /* defined(_INT64_TYPE) */
+extern void atomic_dec_8();
+extern void atomic_dec_uchar();
+extern void atomic_dec_16();
+extern void atomic_dec_ushort();
+extern void atomic_dec_32();
+extern void atomic_dec_uint();
+extern void atomic_dec_ulong();
+#if defined(_INT64_TYPE)
+extern void atomic_dec_64();
+#endif /* defined(_INT64_TYPE) */
+extern void atomic_add_8();
+extern void atomic_add_char();
+extern void atomic_add_16();
+extern void atomic_add_short();
+extern void atomic_add_32();
+extern void atomic_add_int();
+extern void atomic_add_ptr();
+extern void atomic_add_long();
+#if defined(_INT64_TYPE)
+extern void atomic_add_64();
+#endif /* defined(_INT64_TYPE) */
+extern void atomic_or_8();
+extern void atomic_or_uchar();
+extern void atomic_or_16();
+extern void atomic_or_ushort();
+extern void atomic_or_32();
+extern void atomic_or_uint();
+extern void atomic_or_ulong();
+#if defined(_INT64_TYPE)
+extern void atomic_or_64();
+#endif /* defined(_INT64_TYPE) */
+extern void atomic_and_8();
+extern void atomic_and_uchar();
+extern void atomic_and_16();
+extern void atomic_and_ushort();
+extern void atomic_and_32();
+extern void atomic_and_uint();
+extern void atomic_and_ulong();
+#if defined(_INT64_TYPE)
+extern void atomic_and_64();
+#endif /* defined(_INT64_TYPE) */
+extern uint8_t atomic_inc_8_nv();
+extern uchar_t atomic_inc_uchar_nv();
+extern uint16_t atomic_inc_16_nv();
+extern ushort_t atomic_inc_ushort_nv();
+extern uint32_t atomic_inc_32_nv();
+extern uint_t atomic_inc_uint_nv();
+extern ulong_t atomic_inc_ulong_nv();
+#if defined(_INT64_TYPE)
+extern uint64_t atomic_inc_64_nv();
+#endif /* defined(_INT64_TYPE) */
+extern uint8_t atomic_dec_8_nv();
+extern uchar_t atomic_dec_uchar_nv();
+extern uint16_t atomic_dec_16_nv();
+extern ushort_t atomic_dec_ushort_nv();
+extern uint32_t atomic_dec_32_nv();
+extern uint_t atomic_dec_uint_nv();
+extern ulong_t atomic_dec_ulong_nv();
+#if defined(_INT64_TYPE)
+extern uint64_t atomic_dec_64_nv();
+#endif /* defined(_INT64_TYPE) */
+extern uint8_t atomic_add_8_nv();
+extern uchar_t atomic_add_char_nv();
+extern uint16_t atomic_add_16_nv();
+extern ushort_t atomic_add_short_nv();
+extern uint32_t atomic_add_32_nv();
+extern uint_t atomic_add_int_nv();
+extern void *atomic_add_ptr_nv();
+extern ulong_t atomic_add_long_nv();
+#if defined(_INT64_TYPE)
+extern uint64_t atomic_add_64_nv();
+#endif /* defined(_INT64_TYPE) */
+extern uint8_t atomic_or_8_nv();
+extern uchar_t atomic_or_uchar_nv();
+extern uint16_t atomic_or_16_nv();
+extern ushort_t atomic_or_ushort_nv();
+extern uint32_t atomic_or_32_nv();
+extern uint_t atomic_or_uint_nv();
+extern ulong_t atomic_or_ulong_nv();
+#if defined(_INT64_TYPE)
+extern uint64_t atomic_or_64_nv();
+#endif /* defined(_INT64_TYPE) */
+extern uint8_t atomic_and_8_nv();
+extern uchar_t atomic_and_uchar_nv();
+extern uint16_t atomic_and_16_nv();
+extern ushort_t atomic_and_ushort_nv();
+extern uint32_t atomic_and_32_nv();
+extern uint_t atomic_and_uint_nv();
+extern ulong_t atomic_and_ulong_nv();
+#if defined(_INT64_TYPE)
+extern uint64_t atomic_and_64_nv();
+#endif /* defined(_INT64_TYPE) */
+extern uint8_t atomic_cas_8();
+extern uchar_t atomic_cas_uchar();
+extern uint16_t atomic_cas_16();
+extern ushort_t atomic_cas_ushort();
+extern uint32_t atomic_cas_32();
+extern uint_t atomic_cas_uint();
+extern void *atomic_cas_ptr();
+extern ulong_t atomic_cas_ulong();
+#if defined(_INT64_TYPE)
+extern uint64_t atomic_cas_64();
+#endif /* defined(_INT64_TYPE) */
+extern uint8_t atomic_swap_8();
+extern uchar_t atomic_swap_uchar();
+extern uint16_t atomic_swap_16();
+extern ushort_t atomic_swap_ushort();
+extern uint32_t atomic_swap_32();
+extern uint_t atomic_swap_uint();
+extern void *atomic_swap_ptr();
+extern ulong_t atomic_swap_ulong();
+#if defined(_INT64_TYPE)
+extern uint64_t atomic_swap_64();
+#endif /* defined(_INT64_TYPE) */
+
+
+extern int atomic_set_long_excl();
+extern int atomic_clear_long_excl();
+
+extern void membar_enter();
+extern void membar_exit();
+extern void membar_producer();
+extern void membar_consumer();
+
+#endif
+
+#if defined(_KERNEL)
+
+#if defined(_LP64) || defined(_ILP32)
+#define atomic_add_ip atomic_add_long
+#define atomic_add_ip_nv atomic_add_long_nv
+#define casip atomic_cas_ulong
+#endif
+
+#if defined(__sparc)
+extern uint8_t ldstub(uint8_t *);
+#endif
+
+/*
+ * Legacy kernel interfaces; they will go away (eventually).
+ */
+extern uint8_t cas8(uint8_t *, uint8_t, uint8_t);
+extern uint32_t cas32(uint32_t *, uint32_t, uint32_t);
+extern uint64_t cas64(uint64_t *, uint64_t, uint64_t);
+extern ulong_t caslong(ulong_t *, ulong_t, ulong_t);
+extern void *casptr(void *, void *, void *);
+extern void atomic_and_long(ulong_t *, ulong_t);
+extern void atomic_or_long(ulong_t *, ulong_t);
+#if defined(__sparc)
+extern uint32_t swapl(uint32_t *, uint32_t);
+#endif
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ATOMIC_H */
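
Two short sketches of the interfaces declared above: a reference-count release,
which needs the more expensive _nv variant so that exactly one thread observes
the count reaching zero, and a lock-free "store the maximum" loop built on
compare-and-swap. obj_t and obj_free() are hypothetical.

#include <sys/atomic.h>

typedef struct obj {
	uint32_t o_refcnt;
	/* ... */
} obj_t;

extern void obj_free(obj_t *);	/* hypothetical destructor */

void
obj_rele(obj_t *op)
{
	/* the decrement and the read of the new value must be one atomic op */
	if (atomic_dec_32_nv(&op->o_refcnt) == 0)
		obj_free(op);
}

void
set_max(volatile uint32_t *target, uint32_t val)
{
	uint32_t old;

	do {
		old = *target;
		if (val <= old)
			return;
	} while (atomic_cas_32(target, old, val) != old);
}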
diff --git a/sys/contrib/opensolaris/uts/common/sys/avl.h b/sys/contrib/opensolaris/uts/common/sys/avl.h
new file mode 100644
index 000000000000..bf9af8948a8f
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/avl.h
@@ -0,0 +1,298 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _AVL_H
+#define _AVL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This is a private header file. Applications should not directly include
+ * this file.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/avl_impl.h>
+
+/*
+ * This is a generic implementation of AVL trees for use in the Solaris kernel.
+ * The interfaces provide an efficient way of implementing an ordered set of
+ * data structures.
+ *
+ * AVL trees provide an alternative to using an ordered linked list. Using AVL
+ * trees will usually be faster; however, they require more storage. An ordered
+ * linked list in general requires 2 pointers in each data structure. The
+ * AVL tree implementation uses 3 pointers. The following chart gives the
+ * approximate performance of operations with the different approaches:
+ *
+ * Operation Link List AVL tree
+ * --------- -------- --------
+ * lookup O(n) O(log(n))
+ *
+ * insert 1 node constant constant
+ *
+ * delete 1 node constant between constant and O(log(n))
+ *
+ * delete all nodes O(n) O(n)
+ *
+ * visit the next
+ * or prev node constant between constant and O(log(n))
+ *
+ *
+ * The data structure nodes are anchored at an "avl_tree_t" (the equivalent
+ * of a list header) and the individual nodes will have a field of
+ * type "avl_node_t" (corresponding to list pointers).
+ *
+ * The type "avl_index_t" is used to indicate a position in the list for
+ * certain calls.
+ *
+ * The usage scenario is generally:
+ *
+ * 1. Create the list/tree with: avl_create()
+ *
+ * followed by any mixture of:
+ *
+ * 2a. Insert nodes with: avl_add(), or avl_find() and avl_insert()
+ *
+ * 2b. Visit elements with:
+ * avl_first() - returns the lowest valued node
+ * avl_last() - returns the highest valued node
+ * AVL_NEXT() - given a node go to next higher one
+ * AVL_PREV() - given a node go to previous lower one
+ *
+ * 2c. Find the node with the closest value either less than or greater
+ * than a given value with avl_nearest().
+ *
+ * 2d. Remove individual nodes from the list/tree with avl_remove().
+ *
+ * and finally when the list is being destroyed
+ *
+ * 3. Use avl_destroy_nodes() to quickly process/free up any remaining nodes.
+ * Note that once you use avl_destroy_nodes(), you can no longer
+ * use any routine except avl_destroy_nodes() and avl_destroy().
+ *
+ * 4. Use avl_destroy() to destroy the AVL tree itself.
+ *
+ * Any locking for multiple thread access is up to the user to provide, just
+ * as is needed for any linked list implementation.
+ */
+
+
+/*
+ * Type used for the root of the AVL tree.
+ */
+typedef struct avl_tree avl_tree_t;
+
+/*
+ * The data nodes in the AVL tree must have a field of this type.
+ */
+typedef struct avl_node avl_node_t;
+
+/*
+ * An opaque type used to locate a position in the tree where a node
+ * would be inserted.
+ */
+typedef uintptr_t avl_index_t;
+
+
+/*
+ * Direction constants used for avl_nearest().
+ */
+#define AVL_BEFORE (0)
+#define AVL_AFTER (1)
+
+
+
+/*
+ * Prototypes
+ *
+ * Where not otherwise mentioned, "void *" arguments are a pointer to the
+ * user data structure which must contain a field of type avl_node_t.
+ *
+ * Also assume the user data structures looks like:
+ * struct my_type {
+ * ...
+ * avl_node_t my_link;
+ * ...
+ * };
+ */
+
+/*
+ * Initialize an AVL tree. Arguments are:
+ *
+ * tree - the tree to be initialized
+ * compar - function to compare two nodes; it must return exactly -1, 0, or +1
+ * -1 for <, 0 for ==, and +1 for >
+ * size - the value of sizeof(struct my_type)
+ * offset - the value of OFFSETOF(struct my_type, my_link)
+ */
+extern void avl_create(avl_tree_t *tree,
+ int (*compar) (const void *, const void *), size_t size, size_t offset);
+
+
+/*
+ * Find a node with a matching value in the tree. Returns the matching node
+ * found. If not found, it returns NULL and then if "where" is not NULL it sets
+ * "where" for use with avl_insert() or avl_nearest().
+ *
+ * node - node that has the value being looked for
+ * where - position for use with avl_nearest() or avl_insert(), may be NULL
+ */
+extern void *avl_find(avl_tree_t *tree, void *node, avl_index_t *where);
+
+/*
+ * Insert a node into the tree.
+ *
+ * node - the node to insert
+ * where - position as returned from avl_find()
+ */
+extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where);
+
+/*
+ * Insert "new_data" in "tree" in the given "direction" either after
+ * or before the data "here".
+ *
+ * This might be useful for AVL clients caching recently accessed
+ * data to avoid doing avl_find() again for insertion.
+ *
+ * new_data - new data to insert
+ * here - existing node in "tree"
+ * direction - either AVL_AFTER or AVL_BEFORE the data "here".
+ */
+extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here,
+ int direction);
+
+
+/*
+ * Return the first or last valued node in the tree. Will return NULL
+ * if the tree is empty.
+ *
+ */
+extern void *avl_first(avl_tree_t *tree);
+extern void *avl_last(avl_tree_t *tree);
+
+
+/*
+ * Return the next or previous valued node in the tree.
+ * AVL_NEXT() will return NULL if at the last node.
+ * AVL_PREV() will return NULL if at the first node.
+ *
+ * node - the node from which the next or previous node is found
+ */
+#define AVL_NEXT(tree, node) avl_walk(tree, node, AVL_AFTER)
+#define AVL_PREV(tree, node) avl_walk(tree, node, AVL_BEFORE)
+
+
+/*
+ * Find the node with the nearest value either greater or less than
+ * the value from a previous avl_find(). Returns the node or NULL if
+ * there isn't a matching one.
+ *
+ * where - position as returned from avl_find()
+ * direction - either AVL_BEFORE or AVL_AFTER
+ *
+ * EXAMPLE: get the greatest node that is less than a given value:
+ *
+ * avl_tree_t *tree;
+ * struct my_data look_for_value = {....};
+ * struct my_data *node;
+ * struct my_data *less;
+ * avl_index_t where;
+ *
+ * node = avl_find(tree, &look_for_value, &where);
+ * if (node != NULL)
+ * less = AVL_PREV(tree, node);
+ * else
+ * less = avl_nearest(tree, where, AVL_BEFORE);
+ */
+extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction);
+
+
+/*
+ * Add a single node to the tree.
+ * The node must not be in the tree, and it must not
+ * compare equal to any other node already in the tree.
+ *
+ * node - the node to add
+ */
+extern void avl_add(avl_tree_t *tree, void *node);
+
+
+/*
+ * Remove a single node from the tree. The node must be in the tree.
+ *
+ * node - the node to remove
+ */
+extern void avl_remove(avl_tree_t *tree, void *node);
+
+
+/*
+ * Return the number of nodes in the tree
+ */
+extern ulong_t avl_numnodes(avl_tree_t *tree);
+
+
+/*
+ * Used to destroy any remaining nodes in a tree. The cookie argument should
+ * be initialized to NULL before the first call. Returns a node that has been
+ * removed from the tree and may be free()'d. Returns NULL when the tree is
+ * empty.
+ *
+ * Once you call avl_destroy_nodes(), you can only continue calling it and
+ * finally avl_destroy(). No other AVL routines will be valid.
+ *
+ * cookie - a "void *" used to save state between calls to avl_destroy_nodes()
+ *
+ * EXAMPLE:
+ * avl_tree_t *tree;
+ * struct my_data *node;
+ * void *cookie;
+ *
+ * cookie = NULL;
+ * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
+ * free(node);
+ * avl_destroy(tree);
+ */
+extern void *avl_destroy_nodes(avl_tree_t *tree, void **cookie);
+
+
+/*
+ * Final destroy of an AVL tree. Arguments are:
+ *
+ * tree - the empty tree to destroy
+ */
+extern void avl_destroy(avl_tree_t *tree);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AVL_H */
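
A compact sketch of the usage scenario listed in the header comment above
(create, insert, find, destroy). struct my_data, my_compar() and the freeing
policy are illustrative, and offsetof() is taken from <stddef.h> rather than
the kernel's own offset macro.

#include <stddef.h>
#include <sys/avl.h>

struct my_data {
	int		md_key;
	avl_node_t	md_link;
};

static int
my_compar(const void *a, const void *b)
{
	const struct my_data *l = a;
	const struct my_data *r = b;

	if (l->md_key < r->md_key)
		return (-1);
	return (l->md_key > r->md_key ? 1 : 0);
}

void
avl_example(struct my_data *node)
{
	avl_tree_t tree;
	struct my_data lookup, *found;
	avl_index_t where;
	void *cookie = NULL;

	avl_create(&tree, my_compar, sizeof (struct my_data),
	    offsetof(struct my_data, md_link));
	avl_add(&tree, node);

	/* exact match, or else the nearest smaller element */
	lookup.md_key = node->md_key;
	found = avl_find(&tree, &lookup, &where);
	if (found == NULL)
		found = avl_nearest(&tree, where, AVL_BEFORE);

	/* teardown: only avl_destroy_nodes()/avl_destroy() from here on */
	while (avl_destroy_nodes(&tree, &cookie) != NULL)
		;	/* caller owns and frees each returned node */
	avl_destroy(&tree);
}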
diff --git a/sys/contrib/opensolaris/uts/common/sys/avl_impl.h b/sys/contrib/opensolaris/uts/common/sys/avl_impl.h
new file mode 100644
index 000000000000..620685f370d4
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/avl_impl.h
@@ -0,0 +1,164 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _AVL_IMPL_H
+#define _AVL_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This is a private header file. Applications should not directly include
+ * this file.
+ */
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * generic AVL tree implementation for kernel use
+ *
+ * There are 5 pieces of information stored for each node in an AVL tree
+ *
+ * pointer to less than child
+ * pointer to greater than child
+ * a pointer to the parent of this node
+ * an indication [0/1] of which child I am of my parent
+ * a "balance" (-1, 0, +1) indicating which child tree is taller
+ *
+ * Since they only need 3 bits, the last two fields are packed into the
+ * bottom bits of the parent pointer on 64 bit machines to save on space.
+ */
+
+#ifndef _LP64
+
+struct avl_node {
+ struct avl_node *avl_child[2]; /* left/right children */
+ struct avl_node *avl_parent; /* this node's parent */
+ unsigned short avl_child_index; /* my index in parent's avl_child[] */
+ short avl_balance; /* balance value: -1, 0, +1 */
+};
+
+#define AVL_XPARENT(n) ((n)->avl_parent)
+#define AVL_SETPARENT(n, p) ((n)->avl_parent = (p))
+
+#define AVL_XCHILD(n) ((n)->avl_child_index)
+#define AVL_SETCHILD(n, c) ((n)->avl_child_index = (unsigned short)(c))
+
+#define AVL_XBALANCE(n) ((n)->avl_balance)
+#define AVL_SETBALANCE(n, b) ((n)->avl_balance = (short)(b))
+
+#else /* _LP64 */
+
+/*
+ * for 64 bit machines, avl_pcb contains parent pointer, balance and child_index
+ * values packed in the following manner:
+ *
+ * |63 3| 2 |1 0 |
+ * |-------------------------------------|-----------------|-------------|
+ * | avl_parent hi order bits | avl_child_index | avl_balance |
+ * | | | + 1 |
+ * |-------------------------------------|-----------------|-------------|
+ *
+ */
+struct avl_node {
+ struct avl_node *avl_child[2]; /* left/right children nodes */
+ uintptr_t avl_pcb; /* parent, child_index, balance */
+};
+
+/*
+ * macros to extract/set fields in avl_pcb
+ *
+ * pointer to the parent of the current node is the high order bits
+ */
+#define AVL_XPARENT(n) ((struct avl_node *)((n)->avl_pcb & ~7))
+#define AVL_SETPARENT(n, p) \
+ ((n)->avl_pcb = (((n)->avl_pcb & 7) | (uintptr_t)(p)))
+
+/*
+ * index of this node in its parent's avl_child[]: bit #2
+ */
+#define AVL_XCHILD(n) (((n)->avl_pcb >> 2) & 1)
+#define AVL_SETCHILD(n, c) \
+ ((n)->avl_pcb = (uintptr_t)(((n)->avl_pcb & ~4) | ((c) << 2)))
+
+/*
+ * balance indication for a node, lowest 2 bits. A valid balance is
+ * -1, 0, or +1, and is encoded by adding 1 to the value to get the
+ * unsigned values of 0, 1, 2.
+ */
+#define AVL_XBALANCE(n) ((int)(((n)->avl_pcb & 3) - 1))
+#define AVL_SETBALANCE(n, b) \
+ ((n)->avl_pcb = (uintptr_t)((((n)->avl_pcb & ~3) | ((b) + 1))))
+
+#endif /* _LP64 */
+
+
+
+/*
+ * switch between a node and data pointer for a given tree
+ * the value of "o" is tree->avl_offset
+ */
+#define AVL_NODE2DATA(n, o) ((void *)((uintptr_t)(n) - (o)))
+#define AVL_DATA2NODE(d, o) ((struct avl_node *)((uintptr_t)(d) + (o)))
+
+
+
+/*
+ * macros used to create/access an avl_index_t
+ */
+#define AVL_INDEX2NODE(x) ((avl_node_t *)((x) & ~1))
+#define AVL_INDEX2CHILD(x) ((x) & 1)
+#define AVL_MKINDEX(n, c) ((avl_index_t)(n) | (c))
+
+
+/*
+ * The tree structure. The fields avl_root, avl_compar, and avl_offset come
+ * first since they are needed for avl_find(). We want them to fit into
+ * a single 64 byte cache line to make avl_find() as fast as possible.
+ */
+struct avl_tree {
+ struct avl_node *avl_root; /* root node in tree */
+ int (*avl_compar)(const void *, const void *);
+ size_t avl_offset; /* offsetof(type, avl_link_t field) */
+ ulong_t avl_numnodes; /* number of nodes in the tree */
+ size_t avl_size; /* sizeof user type struct */
+};
+
+
+/*
+ * This will only be used via AVL_NEXT() or AVL_PREV()
+ */
+extern void *avl_walk(struct avl_tree *, void *, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AVL_IMPL_H */
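
The packing above can be exercised through the accessor macros alone, so the
same sketch works for both the 32-bit and the packed 64-bit layout. It assumes
the node is at least 8-byte aligned (true for any avl_node containing
pointers), leaving the low three bits of the parent pointer free; assert()
stands in for the kernel's ASSERT().

#include <string.h>
#include <assert.h>
#include <sys/avl_impl.h>

void
pcb_example(void)
{
	struct avl_node parent, n;

	(void) memset(&n, 0, sizeof (n));
	AVL_SETPARENT(&n, &parent);	/* parent pointer */
	AVL_SETCHILD(&n, 1);		/* child index 1: right child of parent */
	AVL_SETBALANCE(&n, -1);		/* balance -1, stored as 0 in the low bits */

	assert(AVL_XPARENT(&n) == &parent);
	assert(AVL_XCHILD(&n) == 1);
	assert(AVL_XBALANCE(&n) == -1);
}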
diff --git a/sys/contrib/opensolaris/uts/common/sys/bitmap.h b/sys/contrib/opensolaris/uts/common/sys/bitmap.h
new file mode 100644
index 000000000000..d0dd12b68318
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/bitmap.h
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#ifndef _SYS_BITMAP_H
+#define _SYS_BITMAP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/feature_tests.h>
+#if defined(__GNUC__) && defined(_ASM_INLINES) && \
+ (defined(__i386) || defined(__amd64))
+#include <asm/bitmap.h>
+#endif
+
+/*
+ * Operations on bitmaps of arbitrary size
+ * A bitmap is a vector of 1 or more ulong_t's.
+ * The user of the package is responsible for range checks and keeping
+ * track of sizes.
+ */
+
+#ifdef _LP64
+#define BT_ULSHIFT 6 /* log base 2 of BT_NBIPUL, to extract word index */
+#define BT_ULSHIFT32 5 /* log base 2 of BT_NBIPUL32, to extract word index */
+#else
+#define BT_ULSHIFT 5 /* log base 2 of BT_NBIPUL, to extract word index */
+#endif
+
+#define BT_NBIPUL (1 << BT_ULSHIFT) /* n bits per ulong_t */
+#define BT_ULMASK (BT_NBIPUL - 1) /* to extract bit index */
+
+#ifdef _LP64
+#define BT_NBIPUL32 (1 << BT_ULSHIFT32) /* n bits per ulong_t */
+#define BT_ULMASK32 (BT_NBIPUL32 - 1) /* to extract bit index */
+#define BT_ULMAXMASK 0xffffffffffffffff /* used by bt_getlowbit */
+#else
+#define BT_ULMAXMASK 0xffffffff
+#endif
+
+/*
+ * bitmap is a ulong_t *, bitindex an index_t
+ *
+ * The macros BT_WIM and BT_BIW are internal; there is no need
+ * for users of this package to use them.
+ */
+
+/*
+ * word in map
+ */
+#define BT_WIM(bitmap, bitindex) \
+ ((bitmap)[(bitindex) >> BT_ULSHIFT])
+/*
+ * bit in word
+ */
+#define BT_BIW(bitindex) \
+ (1UL << ((bitindex) & BT_ULMASK))
+
+#ifdef _LP64
+#define BT_WIM32(bitmap, bitindex) \
+ ((bitmap)[(bitindex) >> BT_ULSHIFT32])
+
+#define BT_BIW32(bitindex) \
+ (1UL << ((bitindex) & BT_ULMASK32))
+#endif
+
+/*
+ * These are public macros
+ *
+ * BT_BITOUL == n bits to n ulong_t's
+ */
+#define BT_BITOUL(nbits) \
+ (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL)
+#define BT_SIZEOFMAP(nbits) \
+ (BT_BITOUL(nbits) * sizeof (ulong_t))
+#define BT_TEST(bitmap, bitindex) \
+ ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0)
+#define BT_SET(bitmap, bitindex) \
+ { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); }
+#define BT_CLEAR(bitmap, bitindex) \
+ { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); }
+
+#ifdef _LP64
+#define BT_BITOUL32(nbits) \
+ (((nbits) + BT_NBIPUL32 - 1l) / BT_NBIPUL32)
+#define BT_SIZEOFMAP32(nbits) \
+ (BT_BITOUL32(nbits) * sizeof (uint_t))
+#define BT_TEST32(bitmap, bitindex) \
+ ((BT_WIM32((bitmap), (bitindex)) & BT_BIW32(bitindex)) ? 1 : 0)
+#define BT_SET32(bitmap, bitindex) \
+ { BT_WIM32((bitmap), (bitindex)) |= BT_BIW32(bitindex); }
+#define BT_CLEAR32(bitmap, bitindex) \
+ { BT_WIM32((bitmap), (bitindex)) &= ~BT_BIW32(bitindex); }
+#endif /* _LP64 */
+
+
+/*
+ * BIT_ONLYONESET is a private macro not designed for bitmaps of
+ * arbitrary size. u must be an unsigned integer/long. It returns
+ * true if one and only one bit is set in u.
+ */
+#define BIT_ONLYONESET(u) \
+ ((((u) == 0) ? 0 : ((u) & ((u) - 1)) == 0))
+
+#if defined(_KERNEL) && !defined(_ASM)
+#include <sys/atomic.h>
+
+/*
+ * return next available bit index from map with specified number of bits
+ */
+extern index_t bt_availbit(ulong_t *bitmap, size_t nbits);
+/*
+ * find the highest order bit that is on, and is within or below
+ * the word specified by wx
+ */
+extern int bt_gethighbit(ulong_t *mapp, int wx);
+extern int bt_range(ulong_t *bitmap, size_t *pos1, size_t *pos2,
+ size_t end_pos);
+/*
+ * Find highest and lowest one bit set.
+ * Returns bit number + 1 of bit that is set, otherwise returns 0.
+ * Low order bit is 0, high order bit is 31.
+ */
+extern int highbit(ulong_t);
+extern int lowbit(ulong_t);
+extern int bt_getlowbit(ulong_t *bitmap, size_t start, size_t stop);
+extern void bt_copy(ulong_t *, ulong_t *, ulong_t);
+
+/*
+ * find the parity
+ */
+extern int odd_parity(ulong_t);
+
+/*
+ * Atomically set/clear bits
+ * Atomic exclusive operations will set "result" to "-1"
+ * if the bit is already set/cleared. "result" will be set
+ * to 0 otherwise.
+ */
+#define BT_ATOMIC_SET(bitmap, bitindex) \
+ { atomic_or_long(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); }
+#define BT_ATOMIC_CLEAR(bitmap, bitindex) \
+ { atomic_and_long(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); }
+
+#define BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \
+ { result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)), \
+ (bitindex) % BT_NBIPUL); }
+#define BT_ATOMIC_CLEAR_EXCL(bitmap, bitindex, result) \
+ { result = atomic_clear_long_excl(&(BT_WIM(bitmap, bitindex)), \
+ (bitindex) % BT_NBIPUL); }
+
+/*
+ * Extracts bits between index h (high, inclusive) and l (low, exclusive) from
+ * u, which must be an unsigned integer.
+ */
+#define BITX(u, h, l) (((u) >> (l)) & ((1LU << ((h) - (l) + 1LU)) - 1LU))
+
+#endif /* _KERNEL && !_ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BITMAP_H */
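
A small sketch of the basic bitmap macros above; the 200-bit size and the bit
index are arbitrary, and memset() stands in for the kernel's bzero().

#include <string.h>
#include <sys/bitmap.h>

void
bitmap_example(void)
{
	/* enough ulong_t words to hold 200 bits */
	ulong_t map[BT_BITOUL(200)];

	(void) memset(map, 0, BT_SIZEOFMAP(200));

	BT_SET(map, 137);		/* set bit 137 */
	if (BT_TEST(map, 137))		/* evaluates to 1 */
		BT_CLEAR(map, 137);	/* and clear it again */
}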
diff --git a/sys/contrib/opensolaris/uts/common/sys/byteorder.h b/sys/contrib/opensolaris/uts/common/sys/byteorder.h
new file mode 100644
index 000000000000..b80a0f02bd57
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/byteorder.h
@@ -0,0 +1,137 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _SYS_BYTEORDER_H
+#define _SYS_BYTEORDER_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/isa_defs.h>
+#include <sys/int_types.h>
+
+#if defined(__GNUC__) && defined(_ASM_INLINES) && \
+ (defined(__i386) || defined(__amd64))
+#include <asm/byteorder.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * macros for conversion between host and (internet) network byte order
+ */
+
+#if defined(_BIG_ENDIAN) && !defined(ntohl) && !defined(__lint)
+/* big-endian */
+#define ntohl(x) (x)
+#define ntohs(x) (x)
+#define htonl(x) (x)
+#define htons(x) (x)
+
+#elif !defined(ntohl) /* little-endian */
+
+#ifndef _IN_PORT_T
+#define _IN_PORT_T
+typedef uint16_t in_port_t;
+#endif
+
+#ifndef _IN_ADDR_T
+#define _IN_ADDR_T
+typedef uint32_t in_addr_t;
+#endif
+
+#if !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5)
+extern uint32_t htonl(uint32_t);
+extern uint16_t htons(uint16_t);
+extern uint32_t ntohl(uint32_t);
+extern uint16_t ntohs(uint16_t);
+#else
+extern in_addr_t htonl(in_addr_t);
+extern in_port_t htons(in_port_t);
+extern in_addr_t ntohl(in_addr_t);
+extern in_port_t ntohs(in_port_t);
+#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) */
+#endif
+
+#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
+
+/*
+ * Macros to reverse byte order
+ */
+#define BSWAP_8(x) ((x) & 0xff)
+#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
+#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16))
+#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
+
+#define BMASK_8(x) ((x) & 0xff)
+#define BMASK_16(x) ((x) & 0xffff)
+#define BMASK_32(x) ((x) & 0xffffffff)
+#define BMASK_64(x) (x)
+
+/*
+ * Macros to convert from a specific byte order to/from native byte order
+ */
+#ifdef _BIG_ENDIAN
+#define BE_8(x) BMASK_8(x)
+#define BE_16(x) BMASK_16(x)
+#define BE_32(x) BMASK_32(x)
+#define BE_64(x) BMASK_64(x)
+#define LE_8(x) BSWAP_8(x)
+#define LE_16(x) BSWAP_16(x)
+#define LE_32(x) BSWAP_32(x)
+#define LE_64(x) BSWAP_64(x)
+#else
+#define LE_8(x) BMASK_8(x)
+#define LE_16(x) BMASK_16(x)
+#define LE_32(x) BMASK_32(x)
+#define LE_64(x) BMASK_64(x)
+#define BE_8(x) BSWAP_8(x)
+#define BE_16(x) BSWAP_16(x)
+#define BE_32(x) BSWAP_32(x)
+#define BE_64(x) BSWAP_64(x)
+#endif
+
+#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BYTEORDER_H */
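
A short illustration of the byte-swap and native-order macros above. On a
little-endian host BE_32() swaps and LE_32() is the identity; on a big-endian
host the roles reverse, while BSWAP_64() always swaps.

#include <sys/byteorder.h>

void
byteorder_example(void)
{
	uint32_t host32 = 0x11223344;
	uint64_t host64 = 0x1122334455667788ULL;

	uint32_t disk_be = BE_32(host32);	/* big-endian representation */
	uint32_t disk_le = LE_32(host32);	/* little-endian representation */
	uint64_t swapped = BSWAP_64(host64);	/* 0x8877665544332211 */

	(void) disk_be;
	(void) disk_le;
	(void) swapped;
}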
diff --git a/sys/contrib/opensolaris/uts/common/sys/callb.h b/sys/contrib/opensolaris/uts/common/sys/callb.h
new file mode 100644
index 000000000000..b12b2e205c24
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/callb.h
@@ -0,0 +1,214 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CALLB_H
+#define _SYS_CALLB_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/t_lock.h>
+#include <sys/thread.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * definitions of callback classes (c_class)
+ *
+ * Callbacks belong in the same class if (1) their callback routines
+ * do the same kind of processing (ideally, using the same callback function)
+ * and (2) they can/should be executed at the same time in a cpr
+ * suspend/resume operation.
+ *
+ * Note: The DAEMON class, in particular, is for stopping kernel threads
+ * and nothing else. The CALLB_* macros below should be used to deal
+ * with kernel threads, and the callback function should be callb_generic_cpr.
+ * Another idiosyncrasy of the DAEMON class is that if a suspend operation
+ * fails, some callback functions that were never called with SUSPEND
+ * may be called with the RESUME code. Not a problem currently,
+ * but see bug 4201851.
+ */
+#define CB_CL_CPR_DAEMON 0
+#define CB_CL_CPR_VM 1
+#define CB_CL_CPR_CALLOUT 2
+#define CB_CL_CPR_OBP 3
+#define CB_CL_CPR_FB 4
+#define CB_CL_PANIC 5
+#define CB_CL_CPR_RPC 6
+#define CB_CL_CPR_PROMPRINTF 7
+#define CB_CL_UADMIN 8
+#define CB_CL_CPR_PM 9
+#define CB_CL_HALT 10
+#define CB_CL_CPR_DMA 11
+#define CB_CL_CPR_POST_USER 12
+#define CB_CL_UADMIN_PRE_VFS 13
+#define CB_CL_MDBOOT CB_CL_UADMIN
+#define CB_CL_ENTER_DEBUGGER 14
+#define CB_CL_CPR_POST_KERNEL 15
+#define NCBCLASS 16 /* CHANGE ME if classes are added/removed */
+
+/*
+ * CB_CL_CPR_DAEMON class specific definitions are given below:
+ */
+
+/*
+ * code for CPR callb_execute_class
+ */
+#define CB_CODE_CPR_CHKPT 0
+#define CB_CODE_CPR_RESUME 1
+
+typedef void * callb_id_t;
+/*
+ * Per kernel thread structure for CPR daemon callbacks.
+ * Must be protected by either an existing lock in the daemon or
+ * a new lock created for such a purpose.
+ */
+typedef struct callb_cpr {
+ kmutex_t *cc_lockp; /* lock to protect this struct */
+ char cc_events; /* various events for CPR */
+ callb_id_t cc_id; /* callb id address */
+ kcondvar_t cc_callb_cv; /* cv for callback waiting */
+ kcondvar_t cc_stop_cv; /* cv to checkpoint block */
+} callb_cpr_t;
+
+/*
+ * cc_events definitions
+ */
+#define CALLB_CPR_START 1 /* a checkpoint request's started */
+#define CALLB_CPR_SAFE 2 /* thread is safe for CPR */
+#define CALLB_CPR_ALWAYS_SAFE 4 /* thread is ALWAYS safe for CPR */
+
+/*
+ * Used when checking that all kernel threads are stopped.
+ */
+#define CALLB_MAX_RETRY 3 /* when waiting for kthread to sleep */
+#define CALLB_THREAD_DELAY 10 /* ticks allowed to reach sleep */
+#define CPR_KTHREAD_TIMEOUT_SEC 90 /* secs before callback times out -- */
+ /* due to pwr mgmt of disks, make -- */
+ /* big enough for worst spinup time */
+
+#ifdef _KERNEL
+/*
+ * The CALLB_CPR_INIT macro is used by kernel threads to add their entry to
+ * the callback table and perform other initialization. It automatically
+ * adds the thread as being in the callback class CB_CL_CPR_DAEMON.
+ *
+ * cp - ptr to the callb_cpr_t structure for this kernel thread
+ *
+ * lockp - pointer to mutex protecting the callb_cpr_t struct
+ *
+ * func - pointer to the callback function for this kernel thread.
+ * It has the prototype boolean_t <func>(void *arg, int code)
+ * where: arg - ptr to the callb_cpr_t structure
+ * code - not used for this type of callback
+ * returns: B_TRUE if successful; B_FALSE if unsuccessful.
+ *
+ * name - a string giving the name of the kernel thread
+ *
+ * Note: lockp is the lock to protect the callb_cpr_t (cp) structure
+ * later on. No lock needs to be held for this initialization.
+ */
+#define CALLB_CPR_INIT(cp, lockp, func, name) { \
+ bzero((caddr_t)(cp), sizeof (callb_cpr_t)); \
+ (cp)->cc_lockp = lockp; \
+ (cp)->cc_id = callb_add(func, (void *)(cp), \
+ CB_CL_CPR_DAEMON, name); \
+ }
+
+#ifndef __lock_lint
+#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp));
+#else
+#define CALLB_CPR_ASSERT(cp)
+#endif
+/*
+ * Some threads (like the idle threads) do not adhere to the callback
+ * protocol and are always considered safe. Such threads must never exit.
+ * They register their presence by calling this macro during their
+ * initialization.
+ *
+ * Args:
+ * t - thread pointer of the client kernel thread
+ * name - a string giving the name of the kernel thread
+ */
+#define CALLB_CPR_INIT_SAFE(t, name) { \
+ (void) callb_add_thread(callb_generic_cpr_safe, \
+ (void *) &callb_cprinfo_safe, CB_CL_CPR_DAEMON, \
+ name, t); \
+ }
+/*
+ * The lock to protect cp's content must be held before
+ * calling the following two macros.
+ *
+ * Any code region between CALLB_CPR_SAFE_BEGIN and CALLB_CPR_SAFE_END
+ * is safe for checkpoint/resume.
+ */
+#define CALLB_CPR_SAFE_BEGIN(cp) { \
+ CALLB_CPR_ASSERT(cp) \
+ (cp)->cc_events |= CALLB_CPR_SAFE; \
+ if ((cp)->cc_events & CALLB_CPR_START) \
+ cv_signal(&(cp)->cc_callb_cv); \
+ }
+#define CALLB_CPR_SAFE_END(cp, lockp) { \
+ CALLB_CPR_ASSERT(cp) \
+ while ((cp)->cc_events & CALLB_CPR_START) \
+ cv_wait(&(cp)->cc_stop_cv, lockp); \
+ (cp)->cc_events &= ~CALLB_CPR_SAFE; \
+ }
+/*
+ * cv_destroy is nop right now but may be needed in the future.
+ */
+#define CALLB_CPR_EXIT(cp) { \
+ CALLB_CPR_ASSERT(cp) \
+ (cp)->cc_events |= CALLB_CPR_SAFE; \
+ if ((cp)->cc_events & CALLB_CPR_START) \
+ cv_signal(&(cp)->cc_callb_cv); \
+ mutex_exit((cp)->cc_lockp); \
+ (void) callb_delete((cp)->cc_id); \
+ cv_destroy(&(cp)->cc_callb_cv); \
+ cv_destroy(&(cp)->cc_stop_cv); \
+ }
+
+extern callb_cpr_t callb_cprinfo_safe;
+extern callb_id_t callb_add(boolean_t (*)(void *, int), void *, int, char *);
+extern callb_id_t callb_add_thread(boolean_t (*)(void *, int),
+ void *, int, char *, kthread_id_t);
+extern int callb_delete(callb_id_t);
+extern void callb_execute(callb_id_t, int);
+extern void *callb_execute_class(int, int);
+extern boolean_t callb_generic_cpr(void *, int);
+extern boolean_t callb_generic_cpr_safe(void *, int);
+extern boolean_t callb_is_stopped(kthread_id_t, caddr_t *);
+extern void callb_lock_table(void);
+extern void callb_unlock_table(void);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CALLB_H */
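
A sketch of a CPR-aware kernel daemon built from the macros above, using
callb_generic_cpr as the header recommends for the DAEMON class. The mutex,
condition variable and exit flag are hypothetical.

#include <sys/callb.h>

static kmutex_t		daemon_lock;
static kcondvar_t	daemon_cv;
static int		daemon_exit;	/* set by whoever stops the daemon */

static void
my_daemon(void)
{
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &daemon_lock, callb_generic_cpr, "my_daemon");

	mutex_enter(&daemon_lock);
	while (!daemon_exit) {
		CALLB_CPR_SAFE_BEGIN(&cpr);	/* safe to checkpoint while asleep */
		cv_wait(&daemon_cv, &daemon_lock);
		CALLB_CPR_SAFE_END(&cpr, &daemon_lock);

		/* ... perform one unit of work with daemon_lock held ... */
	}
	CALLB_CPR_EXIT(&cpr);	/* drops daemon_lock and removes the callback */
}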
diff --git a/sys/contrib/opensolaris/uts/common/sys/ccompile.h b/sys/contrib/opensolaris/uts/common/sys/ccompile.h
new file mode 100644
index 000000000000..c9857b086575
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/ccompile.h
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CCOMPILE_H
+#define _SYS_CCOMPILE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file contains definitions designed to enable different compilers
+ * to be used harmoniously on Solaris systems.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Allow for version tests for compiler bugs and features.
+ */
+#if defined(__GNUC__)
+#define __GNUC_VERSION \
+ (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#else
+#define __GNUC_VERSION 0
+#endif
+
+#if defined(__ATTRIBUTE_IMPLEMENTED) || defined(__GNUC__)
+
+/*
+ * analogous to lint's PRINTFLIKEn
+ */
+#define __sun_attr___PRINTFLIKE__(__n) \
+ __attribute__((__format__(printf, __n, (__n)+1)))
+#define __sun_attr___VPRINTFLIKE__(__n) \
+ __attribute__((__format__(printf, __n, 0)))
+
+/*
+ * Handle the kernel printf routines that can take '%b' too
+ */
+#if __GNUC_VERSION < 30402
+/*
+ * XX64 at least this doesn't work correctly yet with 3.4.1 anyway!
+ */
+#define __sun_attr___KPRINTFLIKE__ __sun_attr___PRINTFLIKE__
+#define __sun_attr___KVPRINTFLIKE__ __sun_attr___VPRINTFLIKE__
+#else
+#define __sun_attr___KPRINTFLIKE__(__n) \
+ __attribute__((__format__(cmn_err, __n, (__n)+1)))
+#define __sun_attr___KVPRINTFLIKE__(__n) \
+ __attribute__((__format__(cmn_err, __n, 0)))
+#endif
+
+/*
+ * This one's pretty obvious -- the function never returns
+ */
+#define __sun_attr___noreturn__ __attribute__((__noreturn__))
+
+
+/*
+ * This is an appropriate label for functions that do not
+ * modify their arguments, e.g. strlen()
+ */
+#define __sun_attr___pure__ __attribute__((__pure__))
+
+/*
+ * This is a stronger form of __pure__. Can be used for functions
+ * that do not modify their arguments and don't depend on global
+ * memory.
+ */
+#define __sun_attr___const__ __attribute__((__const__))
+
+/*
+ * structure packing like #pragma pack(1)
+ */
+#define __sun_attr___packed__ __attribute__((__packed__))
+
+#define ___sun_attr_inner(__a) __sun_attr_##__a
+#define __sun_attr__(__a) ___sun_attr_inner __a
+
+#else /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */
+
+#define __sun_attr__(__a)
+
+#endif /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */
+
+/*
+ * Shorthand versions for readability
+ */
+
+#define __PRINTFLIKE(__n) __sun_attr__((__PRINTFLIKE__(__n)))
+#define __VPRINTFLIKE(__n) __sun_attr__((__VPRINTFLIKE__(__n)))
+#define __KPRINTFLIKE(__n) __sun_attr__((__KPRINTFLIKE__(__n)))
+#define __KVPRINTFLIKE(__n) __sun_attr__((__KVPRINTFLIKE__(__n)))
+#define __NORETURN __sun_attr__((__noreturn__))
+#define __CONST __sun_attr__((__const__))
+#define __PURE __sun_attr__((__pure__))
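+
+/*
+ * Illustrative use of the shorthand attributes (the function names below are
+ * hypothetical and serve only as an example):
+ *
+ *	extern void ex_log(int level, const char *fmt, ...) __KPRINTFLIKE(2);
+ *	extern void ex_die(const char *fmt, ...) __KPRINTFLIKE(1) __NORETURN;
+ *	extern size_t ex_strlen(const char *) __PURE;
+ */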
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CCOMPILE_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/compress.h b/sys/contrib/opensolaris/uts/common/sys/compress.h
new file mode 100644
index 000000000000..3d79d9511092
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/compress.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1998 by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef _SYS_COMPRESS_H
+#define _SYS_COMPRESS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern size_t compress(void *, void *, size_t);
+extern size_t decompress(void *, void *, size_t, size_t);
+extern uint32_t checksum32(void *, size_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_COMPRESS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/cred.h b/sys/contrib/opensolaris/uts/common/sys/cred.h
new file mode 100644
index 000000000000..c1400b83d71f
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/cred.h
@@ -0,0 +1,154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#ifndef _SYS_CRED_H
+#define _SYS_CRED_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The credential is an opaque kernel private data structure defined in
+ * <sys/cred_impl.h>.
+ */
+
+typedef struct cred cred_t;
+
+#ifdef _KERNEL
+
+#define CRED() curthread->t_cred
+
+struct proc; /* cred.h is included in proc.h */
+struct prcred;
+
+struct auditinfo_addr; /* cred.h is included in audit.h */
+
+extern int ngroups_max;
+/*
+ * kcred is used when you need all privileges.
+ */
+extern struct cred *kcred;
+
+extern void cred_init(void);
+extern void crhold(cred_t *);
+extern void crfree(cred_t *);
+extern cred_t *cralloc(void); /* all but ref uninitialized */
+extern cred_t *crget(void); /* initialized */
+extern cred_t *crcopy(cred_t *);
+extern void crcopy_to(cred_t *, cred_t *);
+extern cred_t *crdup(cred_t *);
+extern void crdup_to(cred_t *, cred_t *);
+extern cred_t *crgetcred(void);
+extern void crset(struct proc *, cred_t *);
+extern int groupmember(gid_t, const cred_t *);
+extern int supgroupmember(gid_t, const cred_t *);
+extern int hasprocperm(const cred_t *, const cred_t *);
+extern int prochasprocperm(struct proc *, struct proc *, const cred_t *);
+extern int crcmp(const cred_t *, const cred_t *);
+extern cred_t *zone_kcred(void);
+
+extern uid_t crgetuid(const cred_t *);
+extern uid_t crgetruid(const cred_t *);
+extern uid_t crgetsuid(const cred_t *);
+extern gid_t crgetgid(const cred_t *);
+extern gid_t crgetrgid(const cred_t *);
+extern gid_t crgetsgid(const cred_t *);
+extern zoneid_t crgetzoneid(const cred_t *);
+extern projid_t crgetprojid(const cred_t *);
+
+
+extern const struct auditinfo_addr *crgetauinfo(const cred_t *);
+extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *);
+
+extern uint_t crgetref(const cred_t *);
+
+extern const gid_t *crgetgroups(const cred_t *);
+
+extern int crgetngroups(const cred_t *);
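+
+/*
+ * Illustrative sketch of a common pattern (ex_gid is hypothetical): kernel
+ * code obtains the invoking thread's credential with CRED() and uses the
+ * accessors above rather than peeking into the opaque structure:
+ *
+ *	cred_t *cr = CRED();
+ *
+ *	if (crgetuid(cr) != 0 && !groupmember(ex_gid, cr))
+ *		return (EPERM);
+ */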
+
+/*
+ * Sets real, effective and/or saved uid/gid;
+ * -1 argument accepted as "no change".
+ */
+extern int crsetresuid(cred_t *, uid_t, uid_t, uid_t);
+extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t);
+
+/*
+ * Sets real, effective and saved uids/gids all to the same
+ * values. Both values must be non-negative and <= MAXUID
+ */
+extern int crsetugid(cred_t *, uid_t, gid_t);
+
+extern int crsetgroups(cred_t *, int, gid_t *);
+
+/*
+ * Private interface for setting zone association of credential.
+ */
+struct zone;
+extern void crsetzone(cred_t *, struct zone *);
+extern struct zone *crgetzone(const cred_t *);
+
+/*
+ * Private interface for setting project id in credential.
+ */
+extern void crsetprojid(cred_t *, projid_t);
+
+/*
+ * Private interface for nfs.
+ */
+extern cred_t *crnetadjust(cred_t *);
+
+/*
+ * Private interface for procfs.
+ */
+extern void cred2prcred(const cred_t *, struct prcred *);
+
+/*
+ * Private interfaces for Rampart Trusted Solaris.
+ */
+struct ts_label_s;
+extern struct ts_label_s *crgetlabel(const cred_t *);
+extern boolean_t crisremote(const cred_t *);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRED_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/debug.h b/sys/contrib/opensolaris/uts/common/sys/debug.h
new file mode 100644
index 000000000000..c87c8848bc03
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/debug.h
@@ -0,0 +1,129 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#ifndef _SYS_DEBUG_H
+#define _SYS_DEBUG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * ASSERT(ex) causes a panic or debugger entry if expression ex is not
+ * true. ASSERT() is included only for debugging, and is a no-op in
+ * production kernels. VERIFY(ex), on the other hand, behaves like
+ * ASSERT and is evaluated on both debug and non-debug kernels.
+ */
+
+#if defined(__STDC__)
+extern int assfail(const char *, const char *, int);
+#define VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
+#if DEBUG
+#define ASSERT(EX) VERIFY(EX)
+#else
+#define ASSERT(x) ((void)0)
+#endif
+#else /* defined(__STDC__) */
+extern int assfail();
+#define VERIFY(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__)))
+#if DEBUG
+#define ASSERT(EX) VERIFY(EX)
+#else
+#define ASSERT(x) ((void)0)
+#endif
+#endif /* defined(__STDC__) */
+
+/*
+ * Assertion variants sensitive to the compilation data model
+ */
+#if defined(_LP64)
+#define ASSERT64(x) ASSERT(x)
+#define ASSERT32(x)
+#else
+#define ASSERT64(x)
+#define ASSERT32(x) ASSERT(x)
+#endif
+
+/*
+ * ASSERT3() behaves like ASSERT() except that it is an explicit conditional,
+ * and prints out the values of the left and right hand expressions as part of
+ * the panic message to ease debugging. The three variants imply the type
+ * of their arguments. ASSERT3S() is for signed data types, ASSERT3U() is
+ * for unsigned, and ASSERT3P() is for pointers. The VERIFY3*() macros
+ * have the same relationship as above.
+ */
+extern void assfail3(const char *, uintmax_t, const char *, uintmax_t,
+ const char *, int);
+#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \
+ const TYPE __left = (TYPE)(LEFT); \
+ const TYPE __right = (TYPE)(RIGHT); \
+ if (!(__left OP __right)) \
+ assfail3(#LEFT " " #OP " " #RIGHT, \
+ (uintmax_t)__left, #OP, (uintmax_t)__right, \
+ __FILE__, __LINE__); \
+_NOTE(CONSTCOND) } while (0)
+
+#define VERIFY3S(x, y, z) VERIFY3_IMPL(x, y, z, int64_t)
+#define VERIFY3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t)
+#define VERIFY3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t)
+#if DEBUG
+#define ASSERT3S(x, y, z) VERIFY3S(x, y, z)
+#define ASSERT3U(x, y, z) VERIFY3U(x, y, z)
+#define ASSERT3P(x, y, z) VERIFY3P(x, y, z)
+#else
+#define ASSERT3S(x, y, z) ((void)0)
+#define ASSERT3U(x, y, z) ((void)0)
+#define ASSERT3P(x, y, z) ((void)0)
+#endif
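+
+/*
+ * Illustrative examples (ex_* variables are hypothetical):
+ *
+ *	VERIFY(ex_ptr != NULL);		checked in all kernels
+ *	ASSERT(ex_refcnt >= 0);		checked only when DEBUG is defined
+ *	ASSERT3U(ex_off, <=, ex_size);	prints both values if it fires
+ */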
+
+#ifdef _KERNEL
+
+extern void abort_sequence_enter(char *);
+extern void debug_enter(char *);
+
+#endif /* _KERNEL */
+
+#if defined(DEBUG) && !defined(__sun)
+/* CSTYLED */
+#define STATIC
+#else
+/* CSTYLED */
+#define STATIC static
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DEBUG_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/dkio.h b/sys/contrib/opensolaris/uts/common/sys/dkio.h
new file mode 100644
index 000000000000..b0ddd079933d
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/dkio.h
@@ -0,0 +1,477 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DKIO_H
+#define _SYS_DKIO_H
+
+#pragma ident "%Z%%M% %I% %E% SMI" /* SunOS-4.0 5.19 */
+
+#include <sys/dklabel.h> /* Needed for NDKMAP define */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Structures and definitions for disk io control commands
+ */
+
+/*
+ * Structures used as data by ioctl calls.
+ */
+
+#define DK_DEVLEN 16 /* device name max length, including */
+ /* unit # & NULL (ie - "xyc1") */
+
+/*
+ * Used for controller info
+ */
+struct dk_cinfo {
+ char dki_cname[DK_DEVLEN]; /* controller name (no unit #) */
+ ushort_t dki_ctype; /* controller type */
+ ushort_t dki_flags; /* flags */
+ ushort_t dki_cnum; /* controller number */
+ uint_t dki_addr; /* controller address */
+ uint_t dki_space; /* controller bus type */
+ uint_t dki_prio; /* interrupt priority */
+ uint_t dki_vec; /* interrupt vector */
+ char dki_dname[DK_DEVLEN]; /* drive name (no unit #) */
+ uint_t dki_unit; /* unit number */
+ uint_t dki_slave; /* slave number */
+ ushort_t dki_partition; /* partition number */
+ ushort_t dki_maxtransfer; /* max. transfer size in DEV_BSIZE */
+};
+
+/*
+ * Controller types
+ */
+#define DKC_UNKNOWN 0
+#define DKC_CDROM 1 /* CD-ROM, SCSI or otherwise */
+#define DKC_WDC2880 2
+#define DKC_XXX_0 3 /* unassigned */
+#define DKC_XXX_1 4 /* unassigned */
+#define DKC_DSD5215 5
+#define DKC_ACB4000 7
+#define DKC_MD21 8
+#define DKC_XXX_2 9 /* unassigned */
+#define DKC_NCRFLOPPY 10
+#define DKC_SMSFLOPPY 12
+#define DKC_SCSI_CCS 13 /* SCSI CCS compatible */
+#define DKC_INTEL82072 14 /* native floppy chip */
+#define DKC_MD 16 /* meta-disk (virtual-disk) driver */
+#define DKC_INTEL82077 19 /* 82077 floppy disk controller */
+#define DKC_DIRECT 20 /* Intel direct attached device i.e. IDE */
+#define DKC_PCMCIA_MEM 21 /* PCMCIA memory disk-like type */
+#define DKC_PCMCIA_ATA 22 /* PCMCIA AT Attached type */
+
+/*
+ * Sun reserves up through 1023
+ */
+
+#define DKC_CUSTOMER_BASE 1024
+
+/*
+ * Flags
+ */
+#define DKI_BAD144 0x01 /* use DEC std 144 bad sector fwding */
+#define DKI_MAPTRK 0x02 /* controller does track mapping */
+#define DKI_FMTTRK 0x04 /* formats only full track at a time */
+#define DKI_FMTVOL 0x08 /* formats only full volume at a time */
+#define DKI_FMTCYL 0x10 /* formats only full cylinders at a time */
+#define DKI_HEXUNIT 0x20 /* unit number is printed as 3 hex digits */
+#define DKI_PCMCIA_PFD 0x40 /* PCMCIA pseudo-floppy memory card */
+
+/*
+ * Used for all partitions
+ */
+struct dk_allmap {
+ struct dk_map dka_map[NDKMAP];
+};
+
+#if defined(_SYSCALL32)
+struct dk_allmap32 {
+ struct dk_map32 dka_map[NDKMAP];
+};
+#endif /* _SYSCALL32 */
+
+/*
+ * Definition of a disk's geometry
+ */
+struct dk_geom {
+ unsigned short dkg_ncyl; /* # of data cylinders */
+ unsigned short dkg_acyl; /* # of alternate cylinders */
+ unsigned short dkg_bcyl; /* cyl offset (for fixed head area) */
+ unsigned short dkg_nhead; /* # of heads */
+ unsigned short dkg_obs1; /* obsolete */
+ unsigned short dkg_nsect; /* # of data sectors per track */
+ unsigned short dkg_intrlv; /* interleave factor */
+ unsigned short dkg_obs2; /* obsolete */
+ unsigned short dkg_obs3; /* obsolete */
+ unsigned short dkg_apc; /* alternates per cyl (SCSI only) */
+ unsigned short dkg_rpm; /* revolutions per minute */
+ unsigned short dkg_pcyl; /* # of physical cylinders */
+ unsigned short dkg_write_reinstruct; /* # sectors to skip, writes */
+ unsigned short dkg_read_reinstruct; /* # sectors to skip, reads */
+ unsigned short dkg_extra[7]; /* for compatible expansion */
+};
+
+/*
+ * These defines are for historic compatibility with old drivers.
+ */
+#define dkg_bhead dkg_obs1 /* used to be head offset */
+#define dkg_gap1 dkg_obs2 /* used to be gap1 */
+#define dkg_gap2 dkg_obs3 /* used to be gap2 */
+
+/*
+ * Disk io control commands
+ * Warning: some other ioctls with the DIOC prefix exist elsewhere.
+ * The Generic DKIOC numbers are from 0 - 50.
+ * The Floppy Driver uses 51 - 100.
+ * The Hard Disk (except SCSI) 101 - 106. (these are obsolete)
+ * The CDROM Driver 151 - 200.
+ * The USCSI ioctl 201 - 250.
+ */
+#define DKIOC (0x04 << 8)
+
+/*
+ * The following ioctls are generic in nature and need to be
+ * supported as appropriate by all disk drivers
+ */
+#define DKIOCGGEOM (DKIOC|1) /* Get geometry */
+#define DKIOCINFO (DKIOC|3) /* Get info */
+#define DKIOCEJECT (DKIOC|6) /* Generic 'eject' */
+#define DKIOCGVTOC (DKIOC|11) /* Get VTOC */
+#define DKIOCSVTOC (DKIOC|12) /* Set VTOC & Write to Disk */
+
+/*
+ * Disk Cache Controls. These ioctls should be supported by
+ * all disk drivers.
+ *
+ * DKIOCFLUSHWRITECACHE when used from user-mode ignores the ioctl
+ * argument, but it should be passed as NULL to allow for future
+ * reinterpretation. From user-mode, this ioctl request is synchronous.
+ *
+ * When invoked from within the kernel, the arg can be NULL to indicate
+ * a synchronous request or can be the address of a struct dk_callback
+ * to request an asynchronous callback when the flush request is complete.
+ * In this case, the flag to the ioctl must include FKIOCTL and the
+ * dkc_callback field of the pointed to struct must be non-null or the
+ * request is made synchronously.
+ *
+ * In the callback case: if the ioctl returns 0, a callback WILL be performed.
+ * If the ioctl returns non-zero, a callback will NOT be performed.
+ * NOTE: In some cases, the callback may be done BEFORE the ioctl call
+ * returns. The caller's locking strategy should be prepared for this case.
+ */
+#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */
+
+struct dk_callback {
+ void (*dkc_callback)(void *dkc_cookie, int error);
+ void *dkc_cookie;
+};
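+
+/*
+ * Illustrative sketches (ex_* names and fd are hypothetical).  From
+ * user-mode the flush is synchronous and the argument should be NULL:
+ *
+ *	(void) ioctl(fd, DKIOCFLUSHWRITECACHE, NULL);
+ *
+ * From the kernel an asynchronous flush passes a dk_callback and includes
+ * FKIOCTL in the flags, e.g. via ldi_ioctl(9F):
+ *
+ *	struct dk_callback dkc;
+ *
+ *	dkc.dkc_callback = ex_flush_done;
+ *	dkc.dkc_cookie = ex_arg;
+ *	error = ldi_ioctl(ex_lh, DKIOCFLUSHWRITECACHE, (intptr_t)&dkc,
+ *	    FKIOCTL, kcred, NULL);
+ */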
+
+#define DKIOCGETWCE (DKIOC|36) /* Get current write cache */
+ /* enablement status */
+#define DKIOCSETWCE (DKIOC|37) /* Enable/Disable write cache */
+
+/*
+ * The following ioctls are used by Sun drivers to communicate
+ * with their associated format routines. Support of these ioctls
+ * is not required of foreign drivers
+ */
+#define DKIOCSGEOM (DKIOC|2) /* Set geometry */
+#define DKIOCSAPART (DKIOC|4) /* Set all partitions */
+#define DKIOCGAPART (DKIOC|5) /* Get all partitions */
+#define DKIOCG_PHYGEOM (DKIOC|32) /* get physical geometry */
+#define DKIOCG_VIRTGEOM (DKIOC|33) /* get virtual geometry */
+
+/*
+ * The following ioctl's are removable media support
+ */
+#define DKIOCLOCK (DKIOC|7) /* Generic 'lock' */
+#define DKIOCUNLOCK (DKIOC|8) /* Generic 'unlock' */
+#define DKIOCSTATE (DKIOC|13) /* Inquire insert/eject state */
+#define DKIOCREMOVABLE (DKIOC|16) /* is media removable */
+
+
+/*
+ * ioctl for hotpluggable devices
+ */
+#define DKIOCHOTPLUGGABLE (DKIOC|35) /* is hotpluggable */
+
+/*
+ * Ioctl to force driver to re-read the alternate partition and rebuild
+ * the internal defect map.
+ */
+#define DKIOCADDBAD (DKIOC|20) /* Re-read the alternate map (IDE) */
+#define DKIOCGETDEF (DKIOC|21) /* read defect list (IDE) */
+
+/*
+ * Used by applications to get disk defect information from IDE
+ * drives.
+ */
+#ifdef _SYSCALL32
+struct defect_header32 {
+ int head;
+ caddr32_t buffer;
+};
+#endif /* _SYSCALL32 */
+
+struct defect_header {
+ int head;
+ caddr_t buffer;
+};
+
+#define DKIOCPARTINFO (DKIOC|22) /* Get partition or slice parameters */
+
+/*
+ * Used by applications to get partition or slice information
+ */
+#ifdef _SYSCALL32
+struct part_info32 {
+ daddr32_t p_start;
+ int p_length;
+};
+#endif /* _SYSCALL32 */
+
+struct part_info {
+ daddr_t p_start;
+ int p_length;
+};
+
+/* The following ioctls are for Optical Memory Device */
+#define DKIOC_EBP_ENABLE (DKIOC|40) /* enable by pass erase on write */
+#define DKIOC_EBP_DISABLE (DKIOC|41) /* disable by pass erase on write */
+
+/*
+ * This state enum is the argument passed to the DKIOCSTATE ioctl.
+ */
+enum dkio_state { DKIO_NONE, DKIO_EJECTED, DKIO_INSERTED, DKIO_DEV_GONE };
+
+#define DKIOCGMEDIAINFO (DKIOC|42) /* get information about the media */
+
+/*
+ * ioctls to read/write mboot info.
+ */
+#define DKIOCGMBOOT (DKIOC|43) /* get mboot info */
+#define DKIOCSMBOOT (DKIOC|44) /* set mboot info */
+
+/*
+ * ioctl to get the device temperature.
+ */
+#define DKIOCGTEMPERATURE (DKIOC|45) /* get temperature */
+
+/*
+ * Used for providing the temperature.
+ */
+
+struct dk_temperature {
+ uint_t dkt_flags; /* Flags */
+ short dkt_cur_temp; /* Current disk temperature */
+ short dkt_ref_temp; /* reference disk temperature */
+};
+
+#define DKT_BYPASS_PM 0x1
+#define DKT_INVALID_TEMP 0xFFFF
+
+
+/*
+ * Used for Media info or the current profile info
+ */
+struct dk_minfo {
+ uint_t dki_media_type; /* Media type or profile info */
+ uint_t dki_lbsize; /* Logical blocksize of media */
+ diskaddr_t dki_capacity; /* Capacity as # of dki_lbsize blks */
+};
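+
+/*
+ * Illustrative sketch (fd is a hypothetical open device): the media
+ * capacity in bytes can be derived from the two fields returned by
+ * DKIOCGMEDIAINFO:
+ *
+ *	struct dk_minfo mi;
+ *
+ *	if (ioctl(fd, DKIOCGMEDIAINFO, &mi) == 0)
+ *		nbytes = (uint64_t)mi.dki_capacity * mi.dki_lbsize;
+ */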
+
+/*
+ * Media types or profiles known
+ */
+#define DK_UNKNOWN 0x00 /* Media inserted - type unknown */
+
+
+/*
+ * SFF 8090 Specification Version 3, media types 0x01 - 0xfffe are retained to
+ * maintain compatibility with SFF8090. The following define the
+ * optical media type.
+ */
+#define DK_REMOVABLE_DISK 0x02 /* Removable Disk */
+#define DK_MO_ERASABLE 0x03 /* MO Erasable */
+#define DK_MO_WRITEONCE 0x04 /* MO Write once */
+#define DK_AS_MO 0x05 /* AS MO */
+#define DK_CDROM 0x08 /* CDROM */
+#define DK_CDR 0x09 /* CD-R */
+#define DK_CDRW 0x0A /* CD-RW */
+#define DK_DVDROM 0x10 /* DVD-ROM */
+#define DK_DVDR 0x11 /* DVD-R */
+#define DK_DVDRAM 0x12 /* DVD_RAM or DVD-RW */
+
+/*
+ * Media types for other rewritable magnetic media
+ */
+#define DK_FIXED_DISK 0x10001 /* Fixed disk SCSI or otherwise */
+#define DK_FLOPPY 0x10002 /* Floppy media */
+#define DK_ZIP 0x10003 /* IOMEGA ZIP media */
+#define DK_JAZ 0x10004 /* IOMEGA JAZ media */
+
+#define DKIOCSETEFI (DKIOC|17) /* Set EFI info */
+#define DKIOCGETEFI (DKIOC|18) /* Get EFI info */
+
+#define DKIOCPARTITION (DKIOC|9) /* Get partition info */
+
+/*
+ * Ioctls to get/set volume capabilities related to Logical Volume Managers.
+ * They include the ability to get/set capabilities and to issue a read to a
+ * specific underlying device of a replicated device.
+ */
+
+#define DKIOCGETVOLCAP (DKIOC | 25) /* Get volume capabilities */
+#define DKIOCSETVOLCAP (DKIOC | 26) /* Set volume capabilities */
+#define DKIOCDMR (DKIOC | 27) /* Issue a directed read */
+
+typedef uint_t volcapinfo_t;
+
+typedef uint_t volcapset_t;
+
+#define DKV_ABR_CAP 0x00000001 /* Support Appl.Based Recovery */
+#define DKV_DMR_CAP 0x00000002 /* Support Directed Mirror Read */
+
+typedef struct volcap {
+ volcapinfo_t vc_info; /* Capabilities available */
+ volcapset_t vc_set; /* Capabilities set */
+} volcap_t;
+
+#define VOL_SIDENAME 256
+
+typedef struct vol_directed_rd {
+ int vdr_flags;
+ offset_t vdr_offset;
+ size_t vdr_nbytes;
+ size_t vdr_bytesread;
+ void *vdr_data;
+ int vdr_side;
+ char vdr_side_name[VOL_SIDENAME];
+} vol_directed_rd_t;
+
+#define DKV_SIDE_INIT (-1)
+#define DKV_DMR_NEXT_SIDE 0x00000001
+#define DKV_DMR_DONE 0x00000002
+#define DKV_DMR_ERROR 0x00000004
+#define DKV_DMR_SUCCESS 0x00000008
+#define DKV_DMR_SHORT 0x00000010
+
+#ifdef _MULTI_DATAMODEL
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+typedef struct vol_directed_rd32 {
+ int32_t vdr_flags;
+ offset_t vdr_offset; /* 64-bit element on 32-bit alignment */
+ size32_t vdr_nbytes;
+ size32_t vdr_bytesread;
+ caddr32_t vdr_data;
+ int32_t vdr_side;
+ char vdr_side_name[VOL_SIDENAME];
+} vol_directed_rd32_t;
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+#endif /* _MULTI_DATAMODEL */
+
+/*
+ * The ioctl is used to fetch the disk's device type, vendor ID,
+ * model number/product ID, firmware revision and serial number together.
+ *
+ * Currently there are two device types - DKD_ATA_TYPE which means the
+ * disk is driven by cmdk/ata or dad/uata driver, and DKD_SCSI_TYPE
+ * which means the disk is driven by sd/scsi hba driver.
+ */
+#define DKIOC_GETDISKID (DKIOC|46)
+
+/* These two labels are for dkd_dtype of dk_disk_id_t */
+#define DKD_ATA_TYPE 0x01 /* ATA disk or legacy mode SATA disk */
+#define DKD_SCSI_TYPE 0x02 /* SCSI disk or native mode SATA disk */
+
+#define DKD_ATA_MODEL 40 /* model number length */
+#define DKD_ATA_FWVER 8 /* firmware revision length */
+#define DKD_ATA_SERIAL 20 /* serial number length */
+
+#define DKD_SCSI_VENDOR 8 /* vendor ID length */
+#define DKD_SCSI_PRODUCT 16 /* product ID length */
+#define DKD_SCSI_REVLEVEL 4 /* revision level length */
+#define DKD_SCSI_SERIAL 12 /* serial number length */
+
+/*
+ * The argument type for DKIOC_GETDISKID ioctl.
+ */
+typedef struct dk_disk_id {
+ uint_t dkd_dtype;
+ union {
+ struct {
+ char dkd_amodel[DKD_ATA_MODEL]; /* 40 bytes */
+ char dkd_afwver[DKD_ATA_FWVER]; /* 8 bytes */
+ char dkd_aserial[DKD_ATA_SERIAL]; /* 20 bytes */
+ } ata_disk_id;
+ struct {
+ char dkd_svendor[DKD_SCSI_VENDOR]; /* 8 bytes */
+ char dkd_sproduct[DKD_SCSI_PRODUCT]; /* 16 bytes */
+ char dkd_sfwver[DKD_SCSI_REVLEVEL]; /* 4 bytes */
+ char dkd_sserial[DKD_SCSI_SERIAL]; /* 12 bytes */
+ } scsi_disk_id;
+ } disk_id;
+} dk_disk_id_t;
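+
+/*
+ * Illustrative sketch (fd is a hypothetical open device).  The identity
+ * strings are fixed-width and not necessarily NUL-terminated, so print
+ * them with an explicit length:
+ *
+ *	dk_disk_id_t id;
+ *
+ *	if (ioctl(fd, DKIOC_GETDISKID, &id) == 0 &&
+ *	    id.dkd_dtype == DKD_SCSI_TYPE)
+ *		(void) printf("%.*s\n", DKD_SCSI_PRODUCT,
+ *		    id.disk_id.scsi_disk_id.dkd_sproduct);
+ */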
+
+/*
+ * The ioctl is used to update the firmware of a device.
+ */
+#define DKIOC_UPDATEFW (DKIOC|47)
+
+/* The argument type for DKIOC_UPDATEFW ioctl */
+typedef struct dk_updatefw {
+ caddr_t dku_ptrbuf; /* pointer to firmware buf */
+ uint_t dku_size; /* firmware buf length */
+ uint8_t dku_type; /* firmware update type */
+} dk_updatefw_t;
+
+#ifdef _SYSCALL32
+typedef struct dk_updatefw_32 {
+ caddr32_t dku_ptrbuf; /* pointer to firmware buf */
+ uint_t dku_size; /* firmware buf length */
+ uint8_t dku_type; /* firmware update type */
+} dk_updatefw_32_t;
+#endif /* _SYSCALL32 */
+
+/*
+ * firmware update type - temporary or permanent use
+ */
+#define FW_TYPE_TEMP 0x0 /* temporary use */
+#define FW_TYPE_PERM 0x1 /* permanent use */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DKIO_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/dklabel.h b/sys/contrib/opensolaris/uts/common/sys/dklabel.h
new file mode 100644
index 000000000000..92cb47af01b4
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/dklabel.h
@@ -0,0 +1,268 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1990-2002 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DKLABEL_H
+#define _SYS_DKLABEL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/isa_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Miscellaneous defines
+ */
+#define DKL_MAGIC 0xDABE /* magic number */
+#define FKL_MAGIC 0xff /* magic number for DOS floppies */
+
+#if defined(_SUNOS_VTOC_16)
+#define NDKMAP 16 /* # of logical partitions */
+#define DK_LABEL_LOC 1 /* location of disk label */
+#elif defined(_SUNOS_VTOC_8)
+#define NDKMAP 8 /* # of logical partitions */
+#define DK_LABEL_LOC 0 /* location of disk label */
+#else
+#error "No VTOC format defined."
+#endif
+
+#define LEN_DKL_ASCII 128 /* length of dkl_asciilabel */
+#define LEN_DKL_VVOL 8 /* length of v_volume */
+#define DK_LABEL_SIZE 512 /* size of disk label */
+#define DK_MAX_BLOCKS 0x7fffffff /* max # of blocks handled */
+
+/*
+ * Reserve two cylinders on SCSI disks.
+ * One is for the backup disk label and the other is for the deviceid.
+ *
+ * IPI disks only reserve one cylinder, but they will go away soon.
+ * CDROMs do not reserve any cylinders.
+ */
+#define DK_ACYL 2
+
+/*
+ * Format of a Sun disk label.
+ * Resides in cylinder 0, head 0, sector 0.
+ *
+ * sizeof (struct dk_label) should be 512 (the current sector size),
+ * but should the sector size increase, this structure should remain
+ * at the beginning of the sector.
+ */
+
+/*
+ * partition headers: section 1
+ * Returned in struct dk_allmap by ioctl DKIOC[SG]APART (dkio(7I))
+ */
+struct dk_map {
+ daddr_t dkl_cylno; /* starting cylinder */
+ daddr_t dkl_nblk; /* number of blocks; if == 0, */
+ /* partition is undefined */
+};
+
+/*
+ * partition headers: section 1
+ * Fixed size for on-disk dk_label
+ */
+struct dk_map32 {
+ daddr32_t dkl_cylno; /* starting cylinder */
+ daddr32_t dkl_nblk; /* number of blocks; if == 0, */
+ /* partition is undefined */
+};
+
+/*
+ * partition headers: section 2,
+ * brought over from AT&T SVr4 vtoc structure.
+ */
+struct dk_map2 {
+ uint16_t p_tag; /* ID tag of partition */
+ uint16_t p_flag; /* permission flag */
+};
+
+struct dkl_partition {
+ uint16_t p_tag; /* ID tag of partition */
+	uint16_t	p_flag;		/* permission flags */
+ daddr32_t p_start; /* start sector no of partition */
+ int32_t p_size; /* # of blocks in partition */
+};
+
+
+/*
+ * VTOC inclusions from AT&T SVr4
+ * Fixed sized types for on-disk VTOC
+ */
+
+struct dk_vtoc {
+#if defined(_SUNOS_VTOC_16)
+ uint32_t v_bootinfo[3]; /* info for mboot (unsupported) */
+ uint32_t v_sanity; /* to verify vtoc sanity */
+ uint32_t v_version; /* layout version */
+ char v_volume[LEN_DKL_VVOL]; /* volume name */
+ uint16_t v_sectorsz; /* sector size in bytes */
+ uint16_t v_nparts; /* number of partitions */
+ uint32_t v_reserved[10]; /* free space */
+ struct dkl_partition v_part[NDKMAP]; /* partition headers */
+ time32_t timestamp[NDKMAP]; /* partition timestamp (unsupported) */
+ char v_asciilabel[LEN_DKL_ASCII]; /* for compatibility */
+#elif defined(_SUNOS_VTOC_8)
+ uint32_t v_version; /* layout version */
+ char v_volume[LEN_DKL_VVOL]; /* volume name */
+ uint16_t v_nparts; /* number of partitions */
+ struct dk_map2 v_part[NDKMAP]; /* partition hdrs, sec 2 */
+ uint32_t v_bootinfo[3]; /* info needed by mboot */
+ uint32_t v_sanity; /* to verify vtoc sanity */
+ uint32_t v_reserved[10]; /* free space */
+ time32_t v_timestamp[NDKMAP]; /* partition timestamp */
+#else
+#error "No VTOC format defined."
+#endif
+};
+
+/*
+ * define the amount of disk label padding needed to make
+ * the entire structure occupy 512 bytes.
+ */
+#if defined(_SUNOS_VTOC_16)
+#define LEN_DKL_PAD (DK_LABEL_SIZE - \
+ ((sizeof (struct dk_vtoc) + \
+ (4 * sizeof (uint32_t)) + \
+ (12 * sizeof (uint16_t)) + \
+ (2 * (sizeof (uint16_t))))))
+#elif defined(_SUNOS_VTOC_8)
+#define LEN_DKL_PAD (DK_LABEL_SIZE \
+ - ((LEN_DKL_ASCII) + \
+ (sizeof (struct dk_vtoc)) + \
+ (sizeof (struct dk_map32) * NDKMAP) + \
+ (14 * (sizeof (uint16_t))) + \
+ (2 * (sizeof (uint16_t)))))
+#else
+#error "No VTOC format defined."
+#endif
+
+
+struct dk_label {
+#if defined(_SUNOS_VTOC_16)
+ struct dk_vtoc dkl_vtoc; /* vtoc inclusions from AT&T SVr4 */
+ uint32_t dkl_pcyl; /* # of physical cylinders */
+ uint32_t dkl_ncyl; /* # of data cylinders */
+ uint16_t dkl_acyl; /* # of alternate cylinders */
+ uint16_t dkl_bcyl; /* cyl offset (for fixed head area) */
+ uint32_t dkl_nhead; /* # of heads */
+ uint32_t dkl_nsect; /* # of data sectors per track */
+ uint16_t dkl_intrlv; /* interleave factor */
+ uint16_t dkl_skew; /* skew factor */
+ uint16_t dkl_apc; /* alternates per cyl (SCSI only) */
+ uint16_t dkl_rpm; /* revolutions per minute */
+ uint16_t dkl_write_reinstruct; /* # sectors to skip, writes */
+ uint16_t dkl_read_reinstruct; /* # sectors to skip, reads */
+ uint16_t dkl_extra[4]; /* for compatible expansion */
+ char dkl_pad[LEN_DKL_PAD]; /* unused part of 512 bytes */
+#elif defined(_SUNOS_VTOC_8)
+ char dkl_asciilabel[LEN_DKL_ASCII]; /* for compatibility */
+ struct dk_vtoc dkl_vtoc; /* vtoc inclusions from AT&T SVr4 */
+ uint16_t dkl_write_reinstruct; /* # sectors to skip, writes */
+ uint16_t dkl_read_reinstruct; /* # sectors to skip, reads */
+ char dkl_pad[LEN_DKL_PAD]; /* unused part of 512 bytes */
+ uint16_t dkl_rpm; /* rotations per minute */
+ uint16_t dkl_pcyl; /* # physical cylinders */
+ uint16_t dkl_apc; /* alternates per cylinder */
+ uint16_t dkl_obs1; /* obsolete */
+ uint16_t dkl_obs2; /* obsolete */
+ uint16_t dkl_intrlv; /* interleave factor */
+ uint16_t dkl_ncyl; /* # of data cylinders */
+ uint16_t dkl_acyl; /* # of alternate cylinders */
+ uint16_t dkl_nhead; /* # of heads in this partition */
+ uint16_t dkl_nsect; /* # of 512 byte sectors per track */
+ uint16_t dkl_obs3; /* obsolete */
+ uint16_t dkl_obs4; /* obsolete */
+ struct dk_map32 dkl_map[NDKMAP]; /* logical partition headers */
+#else
+#error "No VTOC format defined."
+#endif
+ uint16_t dkl_magic; /* identifies this label format */
+ uint16_t dkl_cksum; /* xor checksum of sector */
+};
+
+#if defined(_SUNOS_VTOC_16)
+#define dkl_asciilabel dkl_vtoc.v_asciilabel
+#define v_timestamp timestamp
+
+#elif defined(_SUNOS_VTOC_8)
+
+/*
+ * These defines are for historic compatibility with old drivers.
+ */
+#define dkl_gap1 dkl_obs1 /* used to be gap1 */
+#define dkl_gap2 dkl_obs2 /* used to be gap2 */
+#define dkl_bhead dkl_obs3 /* used to be label head offset */
+#define	dkl_ppart	dkl_obs4	/* used to be physical partition */
+#else
+#error "No VTOC format defined."
+#endif
+
+struct fk_label { /* DOS floppy label */
+ uchar_t fkl_type;
+ uchar_t fkl_magich;
+ uchar_t fkl_magicl;
+ uchar_t filler;
+};
+
+/*
+ * Layout of stored fabricated device id (on-disk)
+ */
+#define DK_DEVID_BLKSIZE (512)
+#define DK_DEVID_SIZE (DK_DEVID_BLKSIZE - ((sizeof (uchar_t) * 7)))
+#define DK_DEVID_REV_MSB (0)
+#define DK_DEVID_REV_LSB (1)
+
+struct dk_devid {
+ uchar_t dkd_rev_hi; /* revision (MSB) */
+ uchar_t dkd_rev_lo; /* revision (LSB) */
+ uchar_t dkd_flags; /* flags (not used yet) */
+ uchar_t dkd_devid[DK_DEVID_SIZE]; /* devid stored here */
+ uchar_t dkd_checksum3; /* checksum (MSB) */
+ uchar_t dkd_checksum2;
+ uchar_t dkd_checksum1;
+ uchar_t dkd_checksum0; /* checksum (LSB) */
+};
+
+#define DKD_GETCHKSUM(dkd) ((dkd)->dkd_checksum3 << 24) + \
+ ((dkd)->dkd_checksum2 << 16) + \
+ ((dkd)->dkd_checksum1 << 8) + \
+ ((dkd)->dkd_checksum0)
+
+#define DKD_FORMCHKSUM(c, dkd) (dkd)->dkd_checksum3 = hibyte(hiword((c))); \
+ (dkd)->dkd_checksum2 = lobyte(hiword((c))); \
+ (dkd)->dkd_checksum1 = hibyte(loword((c))); \
+ (dkd)->dkd_checksum0 = lobyte(loword((c)));
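+
+/*
+ * Illustrative sketch (ex_chksum is a hypothetical value computed over the
+ * devid sector): the checksum is stored byte by byte, MSB first, and later
+ * reassembled with the matching accessor:
+ *
+ *	DKD_FORMCHKSUM(ex_chksum, dkd);
+ *	ASSERT(DKD_GETCHKSUM(dkd) == ex_chksum);
+ */
+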
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DKLABEL_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/dnlc.h b/sys/contrib/opensolaris/uts/common/sys/dnlc.h
new file mode 100644
index 000000000000..88b7509ec382
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/dnlc.h
@@ -0,0 +1,232 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _SYS_DNLC_H
+#define _SYS_DNLC_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/kstat.h>
+
+/*
+ * DNLC - Directory name lookup cache.
+ * There are now two sorts of name caching:
+ *
+ * Standard dnlc: This original cache holds recent mappings
+ * of <directory vnode, name> to vnode mappings.
+ *
+ * Directory caches: Entire large directories can be cached, subject to
+ * memory availability and tunables. A directory cache
+ * anchor point must be provided in the xxnode for
+ * a directory.
+ */
+
+
+/*
+ * Standard dnlc
+ * =============
+ */
+
+/*
+ * This structure describes the elements in the cache of recent
+ * names looked up.
+ *
+ * Note namlen is a uchar_t to conserve space
+ * and alignment padding. The max length of any
+ * pathname component is defined as MAXNAMELEN
+ * which is 256 (including the terminating null).
+ * So provided this doesn't change, we don't include the null,
+ * we always use bcmp to compare strings, and we don't start
+ * storing full names, then we are ok. The space savings are worth it.
+ */
+typedef struct ncache {
+ struct ncache *hash_next; /* hash chain, MUST BE FIRST */
+ struct ncache *hash_prev;
+ struct vnode *vp; /* vnode the name refers to */
+ struct vnode *dp; /* vnode of parent of name */
+ int hash; /* hash signature */
+ uchar_t namlen; /* length of name */
+ char name[1]; /* segment name - null terminated */
+} ncache_t;
+
+/*
+ * Hash table bucket structure of name cache entries for fast lookup.
+ */
+typedef struct nc_hash {
+ ncache_t *hash_next;
+ ncache_t *hash_prev;
+ kmutex_t hash_lock;
+} nc_hash_t;
+
+/*
+ * Statistics on name cache
+ * Not protected by locks
+ */
+/*
+ * ncstats has been deprecated, due to the integer size of the counters
+ * which can easily overflow in the dnlc.
+ * It is maintained (at some expense) for compatibility.
+ * The preferred interface is the kstat accessible nc_stats below, which
+ * is actually shared with directory caching.
+ */
+struct ncstats {
+ int hits; /* hits that we can really use */
+ int misses; /* cache misses */
+ int enters; /* number of enters done */
+ int dbl_enters; /* number of enters tried when already cached */
+ int long_enter; /* deprecated, no longer accounted */
+ int long_look; /* deprecated, no longer accounted */
+ int move_to_front; /* entry moved to front of hash chain */
+ int purges; /* number of purges of cache */
+};
+
+struct nc_stats {
+ kstat_named_t ncs_hits; /* cache hits */
+ kstat_named_t ncs_misses; /* cache misses */
+ kstat_named_t ncs_neg_hits; /* negative cache hits */
+ kstat_named_t ncs_enters; /* enters */
+ kstat_named_t ncs_dbl_enters; /* enters when entry already cached */
+	kstat_named_t ncs_purge_total;	/* total entries purged */
+ kstat_named_t ncs_purge_all; /* dnlc_purge() calls */
+ kstat_named_t ncs_purge_vp; /* dnlc_purge_vp() calls */
+ kstat_named_t ncs_purge_vfs; /* dnlc_purge_vfs() calls */
+ kstat_named_t ncs_purge_fs1; /* dnlc_purge_fs1() calls */
+ kstat_named_t ncs_pick_free; /* found a free ncache */
+ kstat_named_t ncs_pick_heur; /* found ncache w/ NULL vpages */
+ kstat_named_t ncs_pick_last; /* found last ncache on chain */
+};
+
+/*
+ * The dnlc hashing function.
+ * Although really a kernel macro we export it to allow validation
+ * of ncache_t entries by mdb. Note, mdb can handle the ASSERT.
+ *
+ * 'hash' and 'namlen' must be l-values. A check is made to ensure
+ * the name length fits into an unsigned char (see ncache_t).
+ */
+#define DNLCHASH(name, dvp, hash, namlen) \
+ { \
+ char Xc, *Xcp; \
+ hash = (int)((uintptr_t)(dvp)) >> 8; \
+ for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \
+ (hash) = ((hash) << 4) + (hash) + Xc; \
+ ASSERT((Xcp - (name)) <= ((1 << NBBY) - 1)); \
+ (namlen) = Xcp - (name); \
+ }
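+
+/*
+ * Illustrative sketch (dvp is a hypothetical parent directory vnode):
+ *
+ *	int hash;
+ *	uchar_t namlen;
+ *
+ *	DNLCHASH("passwd", dvp, hash, namlen);	namlen is now 6
+ */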
+
+#if defined(_KERNEL)
+
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+
+extern int ncsize; /* set in param_init() # of dnlc entries */
+extern vnode_t negative_cache_vnode;
+#define DNLC_NO_VNODE &negative_cache_vnode
+
+void dnlc_update(vnode_t *, char *, vnode_t *);
+vnode_t *dnlc_lookup(vnode_t *, char *);
+int dnlc_purge_vfsp(vfs_t *, int);
+void dnlc_remove(vnode_t *, char *);
+void dnlc_reduce_cache(void *);
+
+#endif /* defined(_KERNEL) */
+
+
+/*
+ * Directory caching interfaces
+ * ============================
+ */
+
+/*
+ * Typically for large directories, the file names will be the same or
+ * at least similar lengths. So there's no point in anything more elaborate
+ * than a simple unordered linked list of free space entries.
+ * For small directories the name length distribution doesn't really matter.
+ */
+typedef struct dcfree {
+ uint64_t df_handle; /* fs supplied handle */
+ struct dcfree *df_next; /* link to next free entry in bucket */
+ uint_t df_len; /* length of free entry */
+} dcfree_t;
+
+typedef struct dcentry {
+ uint64_t de_handle; /* fs supplied and returned data */
+ struct dcentry *de_next; /* link to next name entry in bucket */
+ int de_hash; /* hash signature */
+ uchar_t de_namelen; /* length of name excluding null */
+ char de_name[1]; /* null terminated name */
+} dcentry_t;
+
+typedef struct dircache {
+ struct dircache *dc_next; /* chain - for purge purposes */
+ struct dircache *dc_prev; /* chain - for purge purposes */
+ int64_t dc_actime; /* dir access time, from lbolt64 */
+ dcentry_t **dc_namehash; /* entry hash table pointer */
+ dcfree_t **dc_freehash; /* free entry hash table pointer */
+ uint_t dc_num_entries; /* no of named entries */
+ uint_t dc_num_free; /* no of free space entries */
+ uint_t dc_nhash_mask; /* name hash table size - 1 */
+ uint_t dc_fhash_mask; /* free space hash table size - 1 */
+ struct dcanchor *dc_anchor; /* back ptr to anchor */
+ boolean_t dc_complete; /* cache complete boolean */
+} dircache_t;
+
+typedef struct dcanchor {
+ void *dca_dircache; /* pointer to directory cache */
+ kmutex_t dca_lock; /* protects the directory cache */
+} dcanchor_t;
+
+/*
+ * Head struct for doubly linked chain of dircache_t
+ * The next and prev fields must match those of a dircache_t
+ */
+typedef struct {
+ dircache_t *dch_next; /* next in chain */
+ dircache_t *dch_prev; /* prev in chain */
+ kmutex_t dch_lock; /* lock for the chain */
+} dchead_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DNLC_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/errorq.h b/sys/contrib/opensolaris/uts/common/sys/errorq.h
new file mode 100644
index 000000000000..971b19e6ccd8
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/errorq.h
@@ -0,0 +1,83 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ERRORQ_H
+#define _ERRORQ_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/nvpair.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct errorq errorq_t;
+typedef struct errorq_elem errorq_elem_t;
+typedef void (*errorq_func_t)(void *, const void *, const errorq_elem_t *);
+
+/*
+ * Public flags for errorq_create(): bit range 0-15
+ */
+#define ERRORQ_VITAL 0x0001 /* drain queue automatically on system reset */
+
+/*
+ * Public flags for errorq_dispatch():
+ */
+#define ERRORQ_ASYNC 0 /* schedule async queue drain for caller */
+#define ERRORQ_SYNC 1 /* do not schedule drain; caller will drain */
+
+#ifdef _KERNEL
+
+extern errorq_t *errorq_create(const char *, errorq_func_t, void *,
+ ulong_t, size_t, uint_t, uint_t);
+
+extern errorq_t *errorq_nvcreate(const char *, errorq_func_t, void *,
+ ulong_t, size_t, uint_t, uint_t);
+
+extern void errorq_destroy(errorq_t *);
+extern void errorq_dispatch(errorq_t *, const void *, size_t, uint_t);
+extern void errorq_drain(errorq_t *);
+extern void errorq_init(void);
+extern void errorq_panic(void);
+extern errorq_elem_t *errorq_reserve(errorq_t *);
+extern void errorq_commit(errorq_t *, errorq_elem_t *, uint_t);
+extern void errorq_cancel(errorq_t *, errorq_elem_t *);
+extern nvlist_t *errorq_elem_nvl(errorq_t *, const errorq_elem_t *);
+extern nv_alloc_t *errorq_elem_nva(errorq_t *, const errorq_elem_t *);
+extern void *errorq_elem_dup(errorq_t *, const errorq_elem_t *,
+ errorq_elem_t **);
+extern void errorq_dump();
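+
+/*
+ * Illustrative sketch (ex_errq, ex_drain, ex_err_t and ex_err are
+ * hypothetical): a subsystem creates a queue once, then dispatches
+ * fixed-size records into it from error context:
+ *
+ *	ex_errq = errorq_create("ex_errq", ex_drain, NULL, 64,
+ *	    sizeof (ex_err_t), 1, ERRORQ_VITAL);
+ *	...
+ *	errorq_dispatch(ex_errq, &ex_err, sizeof (ex_err), ERRORQ_ASYNC);
+ */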
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ERRORQ_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/feature_tests.h b/sys/contrib/opensolaris/uts/common/sys/feature_tests.h
new file mode 100644
index 000000000000..bb79cb8398a2
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/feature_tests.h
@@ -0,0 +1,397 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FEATURE_TESTS_H
+#define _SYS_FEATURE_TESTS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/ccompile.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Values of _POSIX_C_SOURCE
+ *
+ * undefined not a POSIX compilation
+ * 1 POSIX.1-1990 compilation
+ * 2 POSIX.2-1992 compilation
+ * 199309L POSIX.1b-1993 compilation (Real Time)
+ * 199506L POSIX.1c-1995 compilation (POSIX Threads)
+ * 200112L POSIX.1-2001 compilation (Austin Group Revision)
+ */
+#if defined(_POSIX_SOURCE) && !defined(_POSIX_C_SOURCE)
+#define _POSIX_C_SOURCE 1
+#endif
+
+/*
+ * The feature test macros __XOPEN_OR_POSIX, _STRICT_STDC, and _STDC_C99
+ * are Sun implementation specific macros created in order to compress
+ * common standards specified feature test macros for easier reading.
+ * These macros should not be used by the application developer as
+ * unexpected results may occur. Instead, the user should reference
+ * standards(5) for correct usage of the standards feature test macros.
+ *
+ * __XOPEN_OR_POSIX Used in cases where a symbol is defined by both
+ * X/Open or POSIX or in the negative, when neither
+ * X/Open or POSIX defines a symbol.
+ *
+ * _STRICT_STDC __STDC__ is specified by the C Standards and defined
+ * by the compiler. For Sun compilers the value of
+ * __STDC__ is either 1, 0, or not defined based on the
+ * compilation mode (see cc(1)). When the value of
+ * __STDC__ is 1 and in the absence of any other feature
+ * test macros, the namespace available to the application
+ * is limited to only those symbols defined by the C
+ * Standard. _STRICT_STDC provides a more readable means
+ * of identifying symbols defined by the standard, or in
+ * the negative, symbols that are extensions to the C
+ * Standard. See additional comments for GNU C differences.
+ *
+ * _STDC_C99 __STDC_VERSION__ is specified by the C standards and
+ * defined by the compiler and indicates the version of
+ * the C standard. A value of 199901L indicates a
+ * compiler that complies with ISO/IEC 9899:1999, other-
+ * wise known as the C99 standard.
+ */
+
+#if defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE)
+#define __XOPEN_OR_POSIX
+#endif
+
+/*
+ * ISO/IEC 9899:1990 and its revision, ISO/IEC 9899:1999 specify the
+ * following predefined macro name:
+ *
+ * __STDC__ The integer constant 1, intended to indicate a conforming
+ * implementation.
+ *
+ * Furthermore, a strictly conforming program shall use only those features
+ * of the language and library specified in these standards. A conforming
+ * implementation shall accept any strictly conforming program.
+ *
+ * Based on these requirements, Sun's C compiler defines __STDC__ to 1 for
+ * strictly conforming environments and __STDC__ to 0 for environments that
+ * use ANSI C semantics but allow extensions to the C standard. For non-ANSI
+ * C semantics, Sun's C compiler does not define __STDC__.
+ *
+ * The GNU C project interpretation is that __STDC__ should always be defined
+ * to 1 for compilation modes that accept ANSI C syntax regardless of whether
+ * or not extensions to the C standard are used. Violations of conforming
+ * behavior are conditionally flagged as warnings via the use of the
+ * -pedantic option. In addition to defining __STDC__ to 1, the GNU C
+ * compiler also defines __STRICT_ANSI__ as a means of specifying strictly
+ * conforming environments using the -ansi or -std=<standard> options.
+ *
+ * In the absence of any other compiler options, Sun and GNU set the value
+ * of __STDC__ as follows when using the following options:
+ *
+ * Value of __STDC__ __STRICT_ANSI__
+ *
+ * cc -Xa (default) 0 undefined
+ * cc -Xt (transitional) 0 undefined
+ * cc -Xc (strictly conforming) 1 undefined
+ * cc -Xs (K&R C) undefined undefined
+ *
+ * gcc (default) 1 undefined
+ *	gcc -ansi, -std={c89, c99, ...}		1	defined
+ * gcc -traditional (K&R) undefined undefined
+ *
+ * The default compilation modes for Sun C compilers versus GNU C compilers
+ * results in a differing value for __STDC__ which results in a more
+ * restricted namespace when using Sun compilers. To allow both GNU and Sun
+ * interpretations to peacefully co-exist, we use the following Sun
+ * implementation _STRICT_STDC macro:
+ */
+
+#if (__STDC__ - 0 == 1 && !defined(__GNUC__)) || \
+ (defined(__GNUC__) && defined(__STRICT_ANSI__))
+#define _STRICT_STDC
+#else
+#undef _STRICT_STDC
+#endif
+
+/*
+ * Compiler complies with ISO/IEC 9899:1999
+ */
+
+#if __STDC_VERSION__ - 0 >= 199901L
+#ifndef _STDC_C99
+#define _STDC_C99
+#endif
+#endif
+
+/*
+ * Large file interfaces:
+ *
+ * _LARGEFILE_SOURCE
+ * 1 large file-related additions to POSIX
+ * interfaces requested (fseeko, etc.)
+ * _LARGEFILE64_SOURCE
+ * 1 transitional large-file-related interfaces
+ * requested (seek64, stat64, etc.)
+ *
+ * The corresponding announcement macros are respectively:
+ * _LFS_LARGEFILE
+ * _LFS64_LARGEFILE
+ * (These are set in <unistd.h>.)
+ *
+ * Requesting _LARGEFILE64_SOURCE implies requesting _LARGEFILE_SOURCE as
+ * well.
+ *
+ * The large file interfaces are made visible regardless of the initial values
+ * of the feature test macros under certain circumstances:
+ * - If no explicit standards-conforming environment is requested (neither
+ * of _POSIX_SOURCE nor _XOPEN_SOURCE is defined and the value of
+ * __STDC__ does not imply standards conformance).
+ * - Extended system interfaces are explicitly requested (__EXTENSIONS__
+ * is defined).
+ * - Access to in-kernel interfaces is requested (_KERNEL or _KMEMUSER is
+ * defined). (Note that this dependency is an artifact of the current
+ * kernel implementation and may change in future releases.)
+ */
+#if (!defined(_STRICT_STDC) && !defined(__XOPEN_OR_POSIX)) || \
+ defined(_KERNEL) || defined(_KMEMUSER) || \
+ defined(__EXTENSIONS__)
+#undef _LARGEFILE64_SOURCE
+#define _LARGEFILE64_SOURCE 1
+#endif
+#if _LARGEFILE64_SOURCE - 0 == 1
+#undef _LARGEFILE_SOURCE
+#define _LARGEFILE_SOURCE 1
+#endif
+
+/*
+ * Large file compilation environment control:
+ *
+ * The setting of _FILE_OFFSET_BITS controls the size of various file-related
+ * types and governs the mapping between file-related source function symbol
+ * names and the corresponding binary entry points.
+ *
+ * In the 32-bit environment, the default value is 32; if not set, set it to
+ * the default here, to simplify tests in other headers.
+ *
+ * In the 64-bit compilation environment, the only value allowed is 64.
+ */
+#if defined(_LP64)
+#ifndef _FILE_OFFSET_BITS
+#define _FILE_OFFSET_BITS 64
+#endif
+#if _FILE_OFFSET_BITS - 0 != 64
+#error "invalid _FILE_OFFSET_BITS value specified"
+#endif
+#else /* _LP64 */
+#ifndef _FILE_OFFSET_BITS
+#define _FILE_OFFSET_BITS 32
+#endif
+#if _FILE_OFFSET_BITS - 0 != 32 && _FILE_OFFSET_BITS - 0 != 64
+#error "invalid _FILE_OFFSET_BITS value specified"
+#endif
+#endif /* _LP64 */
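+
+/*
+ * For example (purely illustrative), a 32-bit application that wants 64-bit
+ * file offsets sets the macro before any system header is included, either
+ * in the source or on the command line:
+ *
+ *	#define _FILE_OFFSET_BITS 64
+ * or
+ *	cc -D_FILE_OFFSET_BITS=64 ex_app.c
+ */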
+
+/*
+ * Use of _XOPEN_SOURCE
+ *
+ * The following X/Open specifications are supported:
+ *
+ * X/Open Portability Guide, Issue 3 (XPG3)
+ * X/Open CAE Specification, Issue 4 (XPG4)
+ * X/Open CAE Specification, Issue 4, Version 2 (XPG4v2)
+ * X/Open CAE Specification, Issue 5 (XPG5)
+ * Open Group Technical Standard, Issue 6 (XPG6), also referred to as
+ * IEEE Std. 1003.1-2001 and ISO/IEC 9945:2002.
+ *
+ * XPG4v2 is also referred to as UNIX 95 (SUS or SUSv1).
+ * XPG5 is also referred to as UNIX 98 or the Single Unix Specification,
+ * Version 2 (SUSv2)
+ * XPG6 is the result of a merge of the X/Open and POSIX specifications
+ * and as such is also referred to as IEEE Std. 1003.1-2001 in
+ * addition to UNIX 03 and SUSv3.
+ *
+ * When writing a conforming X/Open application, as per the specification
+ * requirements, the appropriate feature test macros must be defined at
+ * compile time. These are as follows. For more info, see standards(5).
+ *
+ * Feature Test Macro Specification
+ * ------------------------------------------------ -------------
+ * _XOPEN_SOURCE XPG3
+ * _XOPEN_SOURCE && _XOPEN_VERSION = 4 XPG4
+ * _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED = 1 XPG4v2
+ * _XOPEN_SOURCE = 500 XPG5
+ * _XOPEN_SOURCE = 600 (or _POSIX_C_SOURCE=200112L) XPG6
+ *
+ * In order to simplify the guards within the headers, the following
+ * implementation private test macros have been created. Applications
+ * must NOT use these private test macros as unexpected results will
+ * occur.
+ *
+ * Note that in general, the use of these private macros is cumulative.
+ * For example, the use of _XPG3 with no other restrictions on the X/Open
+ * namespace will make the symbols visible for XPG3 through XPG6
+ * compilation environments. The use of _XPG4_2 with no other X/Open
+ * namespace restrictions indicates that the symbols were introduced in
+ * XPG4v2 and are therefore visible for XPG4v2 through XPG6 compilation
+ * environments, but not for XPG3 or XPG4 compilation environments.
+ *
+ * _XPG3 X/Open Portability Guide, Issue 3 (XPG3)
+ * _XPG4 X/Open CAE Specification, Issue 4 (XPG4)
+ * _XPG4_2 X/Open CAE Specification, Issue 4, Version 2 (XPG4v2/UNIX 95/SUS)
+ * _XPG5 X/Open CAE Specification, Issue 5 (XPG5/UNIX 98/SUSv2)
+ * _XPG6 Open Group Technical Standard, Issue 6 (XPG6/UNIX 03/SUSv3)
+ */
+
+/* X/Open Portability Guide, Issue 3 */
+#if defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE - 0 < 500) && \
+ (_XOPEN_VERSION - 0 < 4) && !defined(_XOPEN_SOURCE_EXTENDED)
+#define _XPG3
+/* X/Open CAE Specification, Issue 4 */
+#elif (defined(_XOPEN_SOURCE) && _XOPEN_VERSION - 0 == 4)
+#define _XPG4
+#define _XPG3
+/* X/Open CAE Specification, Issue 4, Version 2 */
+#elif (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE_EXTENDED - 0 == 1)
+#define _XPG4_2
+#define _XPG4
+#define _XPG3
+/* X/Open CAE Specification, Issue 5 */
+#elif (_XOPEN_SOURCE - 0 == 500)
+#define _XPG5
+#define _XPG4_2
+#define _XPG4
+#define _XPG3
+#undef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 199506L
+/* Open Group Technical Standard, Issue 6 */
+#elif (_XOPEN_SOURCE - 0 == 600) || (_POSIX_C_SOURCE - 0 == 200112L)
+#define _XPG6
+#define _XPG5
+#define _XPG4_2
+#define _XPG4
+#define _XPG3
+#undef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200112L
+#undef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600
+#endif
+
+/*
+ * _XOPEN_VERSION is defined by the X/Open specifications and is not
+ * normally defined by the application, except in the case of an XPG4
+ * application. On the implementation side, _XOPEN_VERSION defined with
+ * the value of 3 indicates an XPG3 application. _XOPEN_VERSION defined
+ * with the value of 4 indicates an XPG4 or XPG4v2 (UNIX 95) application.
+ * _XOPEN_VERSION defined with a value of 500 indicates an XPG5 (UNIX 98)
+ * application and with a value of 600 indicates an XPG6 (UNIX 03)
+ * application. The appropriate version is determined by the use of the
+ * feature test macros described earlier. Otherwise, _XOPEN_VERSION
+ * defaults to 3, indicating support for XPG3 applications.
+ */
+#ifndef _XOPEN_VERSION
+#ifdef _XPG6
+#define _XOPEN_VERSION 600
+#elif defined(_XPG5)
+#define _XOPEN_VERSION 500
+#elif defined(_XPG4_2)
+#define _XOPEN_VERSION 4
+#else
+#define _XOPEN_VERSION 3
+#endif
+#endif
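A hypothetical conforming application selects one of the environments in the table above by defining the corresponding feature test macro before any include; for example, requesting XPG6/UNIX 03:

    /* Sketch of an XPG6 application source file. */
    #define _XOPEN_SOURCE 600           /* must precede all system headers */
    #include <unistd.h>
    /* The guards above then define _XPG6 through _XPG3, force
     * _POSIX_C_SOURCE to 200112L, and default _XOPEN_VERSION to 600. */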
+
+/*
+ * ANSI C and ISO 9899:1990 say the type long long doesn't exist in strictly
+ * conforming environments. ISO 9899:1999 says it does.
+ *
+ * The presence of _LONGLONG_TYPE says "long long exists"; it is therefore
+ * defined in all but strictly conforming environments that disallow it.
+ */
+#if !defined(_STDC_C99) && defined(_STRICT_STDC) && !defined(__GNUC__)
+/*
+ * Resist attempts to force the definition of long long in this case.
+ */
+#if defined(_LONGLONG_TYPE)
+#error "No long long in strictly conforming ANSI C & 1990 ISO C environments"
+#endif
+#else
+#if !defined(_LONGLONG_TYPE)
+#define _LONGLONG_TYPE
+#endif
+#endif
+
+/*
+ * It is invalid to compile an XPG3, XPG4, XPG4v2, or XPG5 application
+ * using c99. The same is true for POSIX.1-1990, POSIX.2-1992, POSIX.1b,
+ * and POSIX.1c applications. Likewise, it is invalid to compile an XPG6
+ * or a POSIX.1-2001 application with anything other than a c99 or later
+ * compiler. Therefore, we force an error in both cases.
+ */
+#if defined(_STDC_C99) && (defined(__XOPEN_OR_POSIX) && !defined(_XPG6))
+#error "Compiler or options invalid for pre-UNIX 03 X/Open applications \
+ and pre-2001 POSIX applications"
+#elif !defined(_STDC_C99) && \
+ (defined(__XOPEN_OR_POSIX) && defined(_XPG6))
+#error "Compiler or options invalid; UNIX 03 and POSIX.1-2001 applications \
+ require the use of c99"
+#endif
+
+/*
+ * The following macro defines a value for the ISO C99 restrict
+ * keyword so that _RESTRICT_KYWD resolves to "restrict" if
+ * an ISO C99 compiler is used and "" (null string) if any other
+ * compiler is used. This allows for the use of single prototype
+ * declarations regardless of compiler version.
+ */
+#if (defined(__STDC__) && defined(_STDC_C99))
+#define _RESTRICT_KYWD restrict
+#else
+#define _RESTRICT_KYWD
+#endif
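A single prototype written with _RESTRICT_KYWD works under both C99 and pre-C99 compilers, since the macro expands to either "restrict" or nothing; the function name below is hypothetical and only illustrates the pattern:

    /* One declaration serves both C99 and non-C99 compilation. */
    extern void *example_copy(void *_RESTRICT_KYWD dst,
        const void *_RESTRICT_KYWD src, size_t len);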
+
+/*
+ * The following macro indicates header support for the ANSI C++
+ * standard. The ISO/IEC designation for this is ISO/IEC FDIS 14882.
+ */
+#define _ISO_CPP_14882_1998
+
+/*
+ * The following macro indicates header support for the C99 standard,
+ * ISO/IEC 9899:1999, Programming Languages - C.
+ */
+#define _ISO_C_9899_1999
+
+/*
+ * The following macro indicates header support for DTrace. The value is an
+ * integer that corresponds to the major version number for DTrace.
+ */
+#define _DTRACE_VERSION 1
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FEATURE_TESTS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h b/sys/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
new file mode 100644
index 000000000000..aa5c7ee0d788
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FM_FS_ZFS_H
+#define _SYS_FM_FS_ZFS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_ERROR_CLASS "fs.zfs"
+
+#define FM_EREPORT_ZFS_CHECKSUM "checksum"
+#define FM_EREPORT_ZFS_IO "io"
+#define FM_EREPORT_ZFS_DATA "data"
+#define FM_EREPORT_ZFS_POOL "zpool"
+#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown"
+#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed"
+#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data"
+#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas"
+#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum"
+#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small"
+#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label"
+
+#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
+#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid"
+#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size"
+#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
+
+#define FM_RESOURCE_OK "ok"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FM_FS_ZFS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/fm/protocol.h b/sys/contrib/opensolaris/uts/common/sys/fm/protocol.h
new file mode 100644
index 000000000000..a9980feda074
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/fm/protocol.h
@@ -0,0 +1,301 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FM_PROTOCOL_H
+#define _SYS_FM_PROTOCOL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+#include <sys/varargs.h>
+#include <sys/nvpair.h>
+#else
+#include <libnvpair.h>
+#include <stdarg.h>
+#endif
+
+/* FM common member names */
+#define FM_CLASS "class"
+#define FM_VERSION "version"
+
+/* FM event class values */
+#define FM_EREPORT_CLASS "ereport"
+#define FM_FAULT_CLASS "fault"
+#define FM_RSRC_CLASS "resource"
+#define FM_LIST_EVENT "list"
+
+/* FM list.* event class values */
+#define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect"
+#define FM_LIST_ISOLATED_CLASS FM_LIST_EVENT ".isolated"
+#define FM_LIST_REPAIRED_CLASS FM_LIST_EVENT ".repaired"
+
+/* ereport class subcategory values */
+#define FM_ERROR_CPU "cpu"
+#define FM_ERROR_IO "io"
+
+/* ereport version and payload member names */
+#define FM_EREPORT_VERS0 0
+#define FM_EREPORT_VERSION FM_EREPORT_VERS0
+
+/* ereport payload member names */
+#define FM_EREPORT_DETECTOR "detector"
+#define FM_EREPORT_ENA "ena"
+
+/* list.* event payload member names */
+#define FM_LIST_EVENT_SIZE "list-sz"
+
+/* list.suspect, isolated, and repaired versions and payload member names */
+#define FM_SUSPECT_UUID "uuid"
+#define FM_SUSPECT_DIAG_CODE "code"
+#define FM_SUSPECT_DIAG_TIME "diag-time"
+#define FM_SUSPECT_DE "de"
+#define FM_SUSPECT_FAULT_LIST "fault-list"
+#define FM_SUSPECT_FAULT_SZ "fault-list-sz"
+#define FM_SUSPECT_FAULT_STATUS "fault-status"
+#define FM_SUSPECT_MESSAGE "message"
+
+#define FM_SUSPECT_VERS0 0
+#define FM_SUSPECT_VERSION FM_SUSPECT_VERS0
+
+/* fault event versions and payload member names */
+#define FM_FAULT_VERS0 0
+#define FM_FAULT_VERSION FM_FAULT_VERS0
+
+#define FM_FAULT_ASRU "asru"
+#define FM_FAULT_FRU "fru"
+#define FM_FAULT_FRU_LABEL "fru-label"
+#define FM_FAULT_CERTAINTY "certainty"
+#define FM_FAULT_RESOURCE "resource"
+#define FM_FAULT_LOCATION "location"
+
+/* resource event versions and payload member names */
+#define FM_RSRC_VERS0 0
+#define FM_RSRC_VERSION FM_RSRC_VERS0
+#define FM_RSRC_RESOURCE "resource"
+
+/* resource.fm.asru.* payload member names */
+#define FM_RSRC_ASRU_UUID "uuid"
+#define FM_RSRC_ASRU_CODE "code"
+#define FM_RSRC_ASRU_FAULTY "faulty"
+#define FM_RSRC_ASRU_UNUSABLE "unusable"
+#define FM_RSRC_ASRU_EVENT "event"
+
+/* resource.fm.xprt.* versions and payload member names */
+#define FM_RSRC_XPRT_VERS0 0
+#define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0
+#define FM_RSRC_XPRT_UUID "uuid"
+#define FM_RSRC_XPRT_SUBCLASS "subclass"
+
+/*
+ * FM ENA Format Macros
+ */
+#define ENA_FORMAT_MASK 0x3
+#define ENA_FORMAT(ena) ((ena) & ENA_FORMAT_MASK)
+
+/* ENA format types */
+#define FM_ENA_FMT0 0
+#define FM_ENA_FMT1 1
+#define FM_ENA_FMT2 2
+
+/* Format 1 */
+#define ENA_FMT1_GEN_MASK 0x00000000000003FCull
+#define ENA_FMT1_ID_MASK 0xFFFFFFFFFFFFFC00ull
+#define ENA_FMT1_CPUID_MASK 0x00000000000FFC00ull
+#define ENA_FMT1_TIME_MASK 0xFFFFFFFFFFF00000ull
+#define ENA_FMT1_GEN_SHFT 2
+#define ENA_FMT1_ID_SHFT 10
+#define ENA_FMT1_CPUID_SHFT ENA_FMT1_ID_SHFT
+#define ENA_FMT1_TIME_SHFT 20
+
+/* Format 2 */
+#define ENA_FMT2_GEN_MASK 0x00000000000003FCull
+#define ENA_FMT2_ID_MASK 0xFFFFFFFFFFFFFC00ull
+#define ENA_FMT2_TIME_MASK ENA_FMT2_ID_MASK
+#define ENA_FMT2_GEN_SHFT 2
+#define ENA_FMT2_ID_SHFT 10
+#define ENA_FMT2_TIME_SHFT ENA_FMT2_ID_SHFT
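The masks and shifts above decompose an ENA once its format has been identified; a minimal sketch for a Format 1 ENA (the variable names are illustrative, and the fm_ena_*_get() accessors declared later in this header provide the same information):

    uint64_t ena = /* ENA taken from an ereport's "ena" member */ 0;
    if (ENA_FORMAT(ena) == FM_ENA_FMT1) {
            uint64_t gen  = (ena & ENA_FMT1_GEN_MASK)   >> ENA_FMT1_GEN_SHFT;
            uint64_t cpu  = (ena & ENA_FMT1_CPUID_MASK) >> ENA_FMT1_CPUID_SHFT;
            uint64_t time = (ena & ENA_FMT1_TIME_MASK)  >> ENA_FMT1_TIME_SHFT;
    }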
+
+/* Common FMRI type names */
+#define FM_FMRI_AUTHORITY "authority"
+#define FM_FMRI_SCHEME "scheme"
+#define FM_FMRI_SVC_AUTHORITY "svc-authority"
+
+/* FMRI authority-type member names */
+#define FM_FMRI_AUTH_CHASSIS "chassis-id"
+#define FM_FMRI_AUTH_PRODUCT "product-id"
+#define FM_FMRI_AUTH_DOMAIN "domain-id"
+#define FM_FMRI_AUTH_SERVER "server-id"
+#define FM_FMRI_AUTH_HOST "host-id"
+
+#define FM_AUTH_VERS0 0
+#define FM_FMRI_AUTH_VERSION FM_AUTH_VERS0
+
+/* scheme name values */
+#define FM_FMRI_SCHEME_FMD "fmd"
+#define FM_FMRI_SCHEME_DEV "dev"
+#define FM_FMRI_SCHEME_HC "hc"
+#define FM_FMRI_SCHEME_SVC "svc"
+#define FM_FMRI_SCHEME_CPU "cpu"
+#define FM_FMRI_SCHEME_MEM "mem"
+#define FM_FMRI_SCHEME_MOD "mod"
+#define FM_FMRI_SCHEME_PKG "pkg"
+#define FM_FMRI_SCHEME_LEGACY "legacy-hc"
+#define FM_FMRI_SCHEME_ZFS "zfs"
+
+/* Scheme versions */
+#define FMD_SCHEME_VERSION0 0
+#define FM_FMD_SCHEME_VERSION FMD_SCHEME_VERSION0
+#define DEV_SCHEME_VERSION0 0
+#define FM_DEV_SCHEME_VERSION DEV_SCHEME_VERSION0
+#define FM_HC_VERS0 0
+#define FM_HC_SCHEME_VERSION FM_HC_VERS0
+#define CPU_SCHEME_VERSION0 0
+#define CPU_SCHEME_VERSION1 1
+#define FM_CPU_SCHEME_VERSION CPU_SCHEME_VERSION1
+#define MEM_SCHEME_VERSION0 0
+#define FM_MEM_SCHEME_VERSION MEM_SCHEME_VERSION0
+#define MOD_SCHEME_VERSION0 0
+#define FM_MOD_SCHEME_VERSION MOD_SCHEME_VERSION0
+#define PKG_SCHEME_VERSION0 0
+#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0
+#define LEGACY_SCHEME_VERSION0 0
+#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0
+#define ZFS_SCHEME_VERSION0 0
+#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0
+
+/* hc scheme member names */
+#define FM_FMRI_HC_SERIAL_ID "serial"
+#define FM_FMRI_HC_PART "part"
+#define FM_FMRI_HC_REVISION "revision"
+#define FM_FMRI_HC_ROOT "hc-root"
+#define FM_FMRI_HC_LIST_SZ "hc-list-sz"
+#define FM_FMRI_HC_LIST "hc-list"
+#define FM_FMRI_HC_SPECIFIC "hc-specific"
+
+/* hc-list version and member names */
+#define FM_FMRI_HC_NAME "hc-name"
+#define FM_FMRI_HC_ID "hc-id"
+
+#define HC_LIST_VERSION0 0
+#define FM_HC_LIST_VERSION HC_LIST_VERSION0
+
+/* hc-specific member names */
+#define FM_FMRI_HC_SPECIFIC_OFFSET "offset"
+
+/* fmd module scheme member names */
+#define FM_FMRI_FMD_NAME "mod-name"
+#define FM_FMRI_FMD_VERSION "mod-version"
+
+/* dev scheme member names */
+#define FM_FMRI_DEV_ID "devid"
+#define FM_FMRI_DEV_PATH "device-path"
+
+/* pkg scheme member names */
+#define FM_FMRI_PKG_BASEDIR "pkg-basedir"
+#define FM_FMRI_PKG_INST "pkg-inst"
+#define FM_FMRI_PKG_VERSION "pkg-version"
+
+/* svc scheme member names */
+#define FM_FMRI_SVC_NAME "service-name"
+#define FM_FMRI_SVC_VERSION "service-version"
+#define FM_FMRI_SVC_INSTANCE "instance"
+#define FM_FMRI_SVC_CONTRACT_ID "contract-id"
+
+/* svc-authority member names */
+#define FM_FMRI_SVC_AUTH_SCOPE "scope"
+#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-FQN"
+
+/* cpu scheme member names */
+#define FM_FMRI_CPU_ID "cpuid"
+#define FM_FMRI_CPU_SERIAL_ID "serial"
+#define FM_FMRI_CPU_MASK "cpumask"
+#define FM_FMRI_CPU_VID "cpuvid"
+#define FM_FMRI_CPU_CPUFRU "cpufru"
+
+/* legacy-hc scheme member names */
+#define FM_FMRI_LEGACY_HC "component"
+#define FM_FMRI_LEGACY_HC_PREFIX FM_FMRI_SCHEME_HC":///" \
+ FM_FMRI_LEGACY_HC"="
+
+/* mem scheme member names */
+#define FM_FMRI_MEM_UNUM "unum"
+#define FM_FMRI_MEM_SERIAL_ID "serial"
+#define FM_FMRI_MEM_PHYSADDR "physaddr"
+#define FM_FMRI_MEM_MEMCONFIG "memconfig"
+#define FM_FMRI_MEM_OFFSET "offset"
+
+/* mod scheme member names */
+#define FM_FMRI_MOD_PKG "mod-pkg"
+#define FM_FMRI_MOD_NAME "mod-name"
+#define FM_FMRI_MOD_ID "mod-id"
+#define FM_FMRI_MOD_DESC "mod-desc"
+
+/* zfs scheme member names */
+#define FM_FMRI_ZFS_POOL "pool"
+#define FM_FMRI_ZFS_VDEV "vdev"
+
+extern nv_alloc_t *fm_nva_xcreate(char *, size_t);
+extern void fm_nva_xdestroy(nv_alloc_t *);
+
+extern nvlist_t *fm_nvlist_create(nv_alloc_t *);
+extern void fm_nvlist_destroy(nvlist_t *, int);
+
+#define FM_NVA_FREE 0 /* free allocator on nvlist_destroy */
+#define FM_NVA_RETAIN 1 /* keep allocator on nvlist_destroy */
+
+extern void fm_ereport_set(nvlist_t *, int, const char *, uint64_t,
+ const nvlist_t *, ...);
+extern void fm_payload_set(nvlist_t *, ...);
+extern int i_fm_payload_set(nvlist_t *, const char *, va_list);
+extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *,
+ int, ...);
+extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *,
+ const char *);
+extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *);
+extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t,
+ uint8_t *, const char *);
+extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *,
+ const char *, uint64_t);
+extern void fm_authority_set(nvlist_t *, int, const char *, const char *,
+ const char *, const char *);
+extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t);
+
+extern uint64_t fm_ena_increment(uint64_t);
+extern uint64_t fm_ena_generate(uint64_t, uchar_t);
+extern uint64_t fm_ena_generation_get(uint64_t);
+extern uchar_t fm_ena_format_get(uint64_t);
+extern uint64_t fm_ena_id_get(uint64_t);
+extern uint64_t fm_ena_time_get(uint64_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FM_PROTOCOL_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/fm/util.h b/sys/contrib/opensolaris/uts/common/sys/fm/util.h
new file mode 100644
index 000000000000..f65e0ab4b474
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/fm/util.h
@@ -0,0 +1,103 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FM_UTIL_H
+#define _SYS_FM_UTIL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/nvpair.h>
+#include <sys/errorq.h>
+
+/*
+ * Shared user/kernel definitions for class length, error channel name,
+ * and kernel event publisher string.
+ */
+#define FM_MAX_CLASS 100
+#define FM_ERROR_CHAN "com.sun:fm:error"
+#define FM_PUB "fm"
+
+/*
+ * ereport dump device transport support
+ *
+ * Ereports are written out to the dump device at a prescribed offset from the
+ * end, similar to in-transit log messages. Each ereport is represented as an
+ * erpt_dump_t header followed by ed_size bytes of packed native nvlist data.
+ *
+ * NOTE: All of these constants and the header must be defined so they have the
+ * same representation for *both* 32-bit and 64-bit producers and consumers.
+ */
+#define ERPT_MAGIC 0xf00d4eddU
+#define ERPT_MAX_ERRS 16
+#define ERPT_DATA_SZ (6 * 1024)
+#define ERPT_EVCH_MAX 256
+#define ERPT_HIWAT 64
+
+typedef struct erpt_dump {
+ uint32_t ed_magic; /* ERPT_MAGIC or zero to indicate end */
+ uint32_t ed_chksum; /* checksum32() of packed nvlist data */
+ uint32_t ed_size; /* ereport (nvl) fixed buf size */
+ uint32_t ed_pad; /* reserved for future use */
+ hrtime_t ed_hrt_nsec; /* hrtime of this ereport */
+ hrtime_t ed_hrt_base; /* hrtime sample corresponding to ed_tod_base */
+ struct {
+ uint64_t sec; /* seconds since gettimeofday() Epoch */
+ uint64_t nsec; /* nanoseconds past ed_tod_base.sec */
+ } ed_tod_base;
+} erpt_dump_t;
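A consumer walking the dumped ereport area might proceed as in the sketch below; it assumes the records are laid out back to back and terminated by a zero ed_magic, and that a checksum32() helper matching the kernel's is available (both assumptions, not guaranteed by this header):

    char *p = buf;                      /* buf: start of the ereport region */
    for (;;) {
            erpt_dump_t *ed = (erpt_dump_t *)p;
            if (ed->ed_magic != ERPT_MAGIC)
                    break;              /* zero magic marks the end */
            /* ed->ed_chksum should match checksum32() over the ed_size
             * bytes of packed nvlist data that follow the header. */
            p = (char *)(ed + 1) + ed->ed_size;
    }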
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+
+#define FM_STK_DEPTH 20 /* maximum stack depth */
+#define FM_SYM_SZ 64 /* maximum symbol size */
+#define FM_ERR_PIL 2 /* PIL for ereport_errorq drain processing */
+
+#define FM_EREPORT_PAYLOAD_NAME_STACK "stack"
+
+extern errorq_t *ereport_errorq;
+extern void *ereport_dumpbuf;
+extern size_t ereport_dumplen;
+
+extern void fm_init(void);
+extern void fm_nvprint(nvlist_t *);
+extern void fm_panic(const char *, ...);
+extern void fm_banner(void);
+
+extern void fm_ereport_dump(void);
+extern void fm_ereport_post(nvlist_t *, int);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FM_UTIL_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/contrib/opensolaris/uts/common/sys/fs/zfs.h
new file mode 100644
index 000000000000..715537df7155
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -0,0 +1,434 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_H
+#define _SYS_FS_ZFS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/ioccom.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Types and constants shared between userland and the kernel.
+ */
+
+/*
+ * Each dataset can be one of the following types. These constants can be
+ * combined into masks that can be passed to various functions.
+ */
+typedef enum {
+ ZFS_TYPE_FILESYSTEM = 0x1,
+ ZFS_TYPE_SNAPSHOT = 0x2,
+ ZFS_TYPE_VOLUME = 0x4,
+ ZFS_TYPE_POOL = 0x8
+} zfs_type_t;
+
+#define ZFS_TYPE_ANY \
+ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
+
+/*
+ * Properties are identified by these constants and must be added to the
+ * end of this list to ensure that external consumers are not affected
+ * by the change. The property list also determines how 'zfs get' will
+ * display them. If you make any changes to this list, be sure to update
+ * the property table in usr/src/common/zfs/zfs_prop.c.
+ */
+typedef enum {
+ ZFS_PROP_CONT = -2,
+ ZFS_PROP_INVAL = -1,
+ ZFS_PROP_TYPE,
+ ZFS_PROP_CREATION,
+ ZFS_PROP_USED,
+ ZFS_PROP_AVAILABLE,
+ ZFS_PROP_REFERENCED,
+ ZFS_PROP_COMPRESSRATIO,
+ ZFS_PROP_MOUNTED,
+ ZFS_PROP_ORIGIN,
+ ZFS_PROP_QUOTA,
+ ZFS_PROP_RESERVATION,
+ ZFS_PROP_VOLSIZE,
+ ZFS_PROP_VOLBLOCKSIZE,
+ ZFS_PROP_RECORDSIZE,
+ ZFS_PROP_MOUNTPOINT,
+ ZFS_PROP_SHARENFS,
+ ZFS_PROP_CHECKSUM,
+ ZFS_PROP_COMPRESSION,
+ ZFS_PROP_ATIME,
+ ZFS_PROP_DEVICES,
+ ZFS_PROP_EXEC,
+ ZFS_PROP_SETUID,
+ ZFS_PROP_READONLY,
+ ZFS_PROP_ZONED,
+ ZFS_PROP_SNAPDIR,
+ ZFS_PROP_ACLMODE,
+ ZFS_PROP_ACLINHERIT,
+ ZFS_PROP_CREATETXG, /* not exposed to the user */
+ ZFS_PROP_NAME, /* not exposed to the user */
+ ZFS_PROP_CANMOUNT,
+ ZFS_PROP_SHAREISCSI,
+ ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */
+ ZFS_PROP_XATTR,
+ ZFS_PROP_NUMCLONES, /* not exposed to the user */
+ ZFS_PROP_COPIES,
+ ZFS_PROP_BOOTFS
+} zfs_prop_t;
+
+typedef zfs_prop_t zpool_prop_t;
+
+#define ZFS_PROP_VALUE "value"
+#define ZFS_PROP_SOURCE "source"
+
+typedef enum {
+ ZFS_SRC_NONE = 0x1,
+ ZFS_SRC_DEFAULT = 0x2,
+ ZFS_SRC_TEMPORARY = 0x4,
+ ZFS_SRC_LOCAL = 0x8,
+ ZFS_SRC_INHERITED = 0x10
+} zfs_source_t;
+
+#define ZFS_SRC_ALL 0x1f
+
+/*
+ * The following functions are shared between libzfs and the kernel.
+ */
+zfs_prop_t zfs_name_to_prop(const char *);
+zpool_prop_t zpool_name_to_prop(const char *);
+boolean_t zfs_prop_user(const char *);
+int zfs_prop_readonly(zfs_prop_t);
+const char *zfs_prop_default_string(zfs_prop_t);
+const char *zfs_prop_to_name(zfs_prop_t);
+const char *zpool_prop_to_name(zfs_prop_t);
+uint64_t zfs_prop_default_numeric(zfs_prop_t);
+int zfs_prop_inheritable(zfs_prop_t);
+int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
+int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
+
+/*
+ * Property Iterator
+ */
+typedef zfs_prop_t (*zfs_prop_f)(zfs_prop_t, void *);
+typedef zfs_prop_f zpool_prop_f;
+extern zfs_prop_t zfs_prop_iter(zfs_prop_f, void *, boolean_t);
+extern zpool_prop_t zpool_prop_iter(zpool_prop_f, void *, boolean_t);
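These shared routines let libzfs and the kernel translate between property names and zfs_prop_t values; a short sketch of the expected usage (the property name is just an example):

    zfs_prop_t prop = zfs_name_to_prop("compression");
    if (prop != ZFS_PROP_INVAL) {
            const char *canonical = zfs_prop_to_name(prop);
            int ro = zfs_prop_readonly(prop);   /* non-zero for read-only props */
    }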
+
+/*
+ * On-disk version number.
+ */
+#define ZFS_VERSION_1 1ULL
+#define ZFS_VERSION_2 2ULL
+#define ZFS_VERSION_3 3ULL
+#define ZFS_VERSION_4 4ULL
+#define ZFS_VERSION_5 5ULL
+#define ZFS_VERSION_6 6ULL
+/*
+ * When bumping up ZFS_VERSION, make sure GRUB ZFS understands the on-disk
+ * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*},
+ * and make the appropriate changes.
+ */
+#define ZFS_VERSION ZFS_VERSION_6
+#define ZFS_VERSION_STRING "6"
+
+/*
+ * Symbolic names for the changes that caused a ZFS_VERSION switch.
+ * Used in the code when checking for presence or absence of a feature.
+ * Feel free to define multiple symbolic names for each version if there
+ * were multiple changes to on-disk structures during that version.
+ *
+ * NOTE: When checking the current ZFS_VERSION in your code, be sure
+ * to use spa_version() since it reports the version of the
+ * last synced uberblock. Checking the in-flight version can
+ * be dangerous in some cases.
+ */
+#define ZFS_VERSION_INITIAL ZFS_VERSION_1
+#define ZFS_VERSION_DITTO_BLOCKS ZFS_VERSION_2
+#define ZFS_VERSION_SPARES ZFS_VERSION_3
+#define ZFS_VERSION_RAID6 ZFS_VERSION_3
+#define ZFS_VERSION_BPLIST_ACCOUNT ZFS_VERSION_3
+#define ZFS_VERSION_RAIDZ_DEFLATE ZFS_VERSION_3
+#define ZFS_VERSION_DNODE_BYTES ZFS_VERSION_3
+#define ZFS_VERSION_ZPOOL_HISTORY ZFS_VERSION_4
+#define ZFS_VERSION_GZIP_COMPRESSION ZFS_VERSION_5
+#define ZFS_VERSION_BOOTFS ZFS_VERSION_6
+
+/*
+ * The following are configuration names used in the nvlist describing a pool's
+ * configuration.
+ */
+#define ZPOOL_CONFIG_VERSION "version"
+#define ZPOOL_CONFIG_POOL_NAME "name"
+#define ZPOOL_CONFIG_POOL_STATE "state"
+#define ZPOOL_CONFIG_POOL_TXG "txg"
+#define ZPOOL_CONFIG_POOL_GUID "pool_guid"
+#define ZPOOL_CONFIG_CREATE_TXG "create_txg"
+#define ZPOOL_CONFIG_TOP_GUID "top_guid"
+#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree"
+#define ZPOOL_CONFIG_TYPE "type"
+#define ZPOOL_CONFIG_CHILDREN "children"
+#define ZPOOL_CONFIG_ID "id"
+#define ZPOOL_CONFIG_GUID "guid"
+#define ZPOOL_CONFIG_PATH "path"
+#define ZPOOL_CONFIG_DEVID "devid"
+#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
+#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
+#define ZPOOL_CONFIG_ASHIFT "ashift"
+#define ZPOOL_CONFIG_ASIZE "asize"
+#define ZPOOL_CONFIG_DTL "DTL"
+#define ZPOOL_CONFIG_STATS "stats"
+#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
+#define ZPOOL_CONFIG_OFFLINE "offline"
+#define ZPOOL_CONFIG_ERRCOUNT "error_count"
+#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
+#define ZPOOL_CONFIG_SPARES "spares"
+#define ZPOOL_CONFIG_IS_SPARE "is_spare"
+#define ZPOOL_CONFIG_NPARITY "nparity"
+
+#define VDEV_TYPE_ROOT "root"
+#define VDEV_TYPE_MIRROR "mirror"
+#define VDEV_TYPE_REPLACING "replacing"
+#define VDEV_TYPE_RAIDZ "raidz"
+#define VDEV_TYPE_DISK "disk"
+#define VDEV_TYPE_FILE "file"
+#define VDEV_TYPE_MISSING "missing"
+#define VDEV_TYPE_SPARE "spare"
+
+/*
+ * This is needed in userland to report the minimum necessary device size.
+ */
+#define SPA_MINDEVSIZE (64ULL << 20)
+
+/*
+ * The location of the pool configuration repository, shared between kernel and
+ * userland.
+ */
+#define ZPOOL_CACHE_DIR "/etc/zfs"
+#define ZPOOL_CACHE_FILE "zpool.cache"
+#define ZPOOL_CACHE_TMP ".zpool.cache"
+
+#define ZPOOL_CACHE ZPOOL_CACHE_DIR "/" ZPOOL_CACHE_FILE
+
+/*
+ * vdev states are ordered from least to most healthy.
+ * A vdev that's CANT_OPEN or below is considered unusable.
+ */
+typedef enum vdev_state {
+ VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
+ VDEV_STATE_CLOSED, /* Not currently open */
+ VDEV_STATE_OFFLINE, /* Not allowed to open */
+ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */
+ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */
+ VDEV_STATE_HEALTHY /* Presumed good */
+} vdev_state_t;
+
+/*
+ * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field
+ * of the vdev stats structure uses these constants to distinguish why.
+ */
+typedef enum vdev_aux {
+ VDEV_AUX_NONE, /* no error */
+ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */
+ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */
+ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */
+ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */
+ VDEV_AUX_TOO_SMALL, /* vdev size is too small */
+ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */
+ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */
+ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
+ VDEV_AUX_SPARED /* hot spare used in another pool */
+} vdev_aux_t;
+
+/*
+ * pool state. The following states are written to disk as part of the normal
+ * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are
+ * software abstractions used at various levels to communicate pool state.
+ */
+typedef enum pool_state {
+ POOL_STATE_ACTIVE = 0, /* In active use */
+ POOL_STATE_EXPORTED, /* Explicitly exported */
+ POOL_STATE_DESTROYED, /* Explicitly destroyed */
+ POOL_STATE_SPARE, /* Reserved for hot spare use */
+ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
+ POOL_STATE_UNAVAIL, /* Internal libzfs state */
+ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */
+} pool_state_t;
+
+/*
+ * Scrub types.
+ */
+typedef enum pool_scrub_type {
+ POOL_SCRUB_NONE,
+ POOL_SCRUB_RESILVER,
+ POOL_SCRUB_EVERYTHING,
+ POOL_SCRUB_TYPES
+} pool_scrub_type_t;
+
+/*
+ * ZIO types. Needed to interpret vdev statistics below.
+ */
+typedef enum zio_type {
+ ZIO_TYPE_NULL = 0,
+ ZIO_TYPE_READ,
+ ZIO_TYPE_WRITE,
+ ZIO_TYPE_FREE,
+ ZIO_TYPE_CLAIM,
+ ZIO_TYPE_IOCTL,
+ ZIO_TYPES
+} zio_type_t;
+
+/*
+ * Vdev statistics. Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct vdev_stat {
+ hrtime_t vs_timestamp; /* time since vdev load */
+ uint64_t vs_state; /* vdev state */
+ uint64_t vs_aux; /* see vdev_aux_t */
+ uint64_t vs_alloc; /* space allocated */
+ uint64_t vs_space; /* total capacity */
+ uint64_t vs_dspace; /* deflated capacity */
+ uint64_t vs_rsize; /* replaceable dev size */
+ uint64_t vs_ops[ZIO_TYPES]; /* operation count */
+ uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */
+ uint64_t vs_read_errors; /* read errors */
+ uint64_t vs_write_errors; /* write errors */
+ uint64_t vs_checksum_errors; /* checksum errors */
+ uint64_t vs_self_healed; /* self-healed bytes */
+ uint64_t vs_scrub_type; /* pool_scrub_type_t */
+ uint64_t vs_scrub_complete; /* completed? */
+ uint64_t vs_scrub_examined; /* bytes examined; top */
+ uint64_t vs_scrub_repaired; /* bytes repaired; leaf */
+ uint64_t vs_scrub_errors; /* errors during scrub */
+ uint64_t vs_scrub_start; /* UTC scrub start time */
+ uint64_t vs_scrub_end; /* UTC scrub end time */
+} vdev_stat_t;
+
+#define ZFS_DRIVER "zfs"
+#define ZFS_DEV_NAME "zfs"
+#define ZFS_DEV "/dev/" ZFS_DEV_NAME
+
+/*
+ * zvol paths. Irritatingly, the devfsadm interfaces want all these
+ * paths without the /dev prefix, but for some things, we want the
+ * /dev prefix. Below are the names without /dev.
+ */
+#define ZVOL_DEV_DIR "zvol"
+
+/*
+ * And here are the things we need with /dev, etc. in front of them.
+ */
+#define ZVOL_PSEUDO_DEV "/devices/pseudo/zvol@0:"
+#define ZVOL_FULL_DEV_DIR "/dev/" ZVOL_DEV_DIR
+
+#define ZVOL_PROP_NAME "name"
+
+/*
+ * /dev/zfs ioctl numbers.
+ */
+typedef unsigned long zfs_ioc_t;
+
+#define ZFS_IOC(ioreq) ((ioreq) & 0xff)
+
+#define ZFS_IOC_POOL_CREATE _IOWR('Z', 0, struct zfs_cmd)
+#define ZFS_IOC_POOL_DESTROY _IOWR('Z', 1, struct zfs_cmd)
+#define ZFS_IOC_POOL_IMPORT _IOWR('Z', 2, struct zfs_cmd)
+#define ZFS_IOC_POOL_EXPORT _IOWR('Z', 3, struct zfs_cmd)
+#define ZFS_IOC_POOL_CONFIGS _IOWR('Z', 4, struct zfs_cmd)
+#define ZFS_IOC_POOL_STATS _IOWR('Z', 5, struct zfs_cmd)
+#define ZFS_IOC_POOL_TRYIMPORT _IOWR('Z', 6, struct zfs_cmd)
+#define ZFS_IOC_POOL_SCRUB _IOWR('Z', 7, struct zfs_cmd)
+#define ZFS_IOC_POOL_FREEZE _IOWR('Z', 8, struct zfs_cmd)
+#define ZFS_IOC_POOL_UPGRADE _IOWR('Z', 9, struct zfs_cmd)
+#define ZFS_IOC_POOL_GET_HISTORY _IOWR('Z', 10, struct zfs_cmd)
+#define ZFS_IOC_POOL_LOG_HISTORY _IOWR('Z', 11, struct zfs_cmd)
+#define ZFS_IOC_VDEV_ADD _IOWR('Z', 12, struct zfs_cmd)
+#define ZFS_IOC_VDEV_REMOVE _IOWR('Z', 13, struct zfs_cmd)
+#define ZFS_IOC_VDEV_ONLINE _IOWR('Z', 14, struct zfs_cmd)
+#define ZFS_IOC_VDEV_OFFLINE _IOWR('Z', 15, struct zfs_cmd)
+#define ZFS_IOC_VDEV_ATTACH _IOWR('Z', 16, struct zfs_cmd)
+#define ZFS_IOC_VDEV_DETACH _IOWR('Z', 17, struct zfs_cmd)
+#define ZFS_IOC_VDEV_SETPATH _IOWR('Z', 18, struct zfs_cmd)
+#define ZFS_IOC_OBJSET_STATS _IOWR('Z', 19, struct zfs_cmd)
+#define ZFS_IOC_DATASET_LIST_NEXT _IOWR('Z', 20, struct zfs_cmd)
+#define ZFS_IOC_SNAPSHOT_LIST_NEXT _IOWR('Z', 21, struct zfs_cmd)
+#define ZFS_IOC_SET_PROP _IOWR('Z', 22, struct zfs_cmd)
+#define ZFS_IOC_CREATE_MINOR _IOWR('Z', 23, struct zfs_cmd)
+#define ZFS_IOC_REMOVE_MINOR _IOWR('Z', 24, struct zfs_cmd)
+#define ZFS_IOC_CREATE _IOWR('Z', 25, struct zfs_cmd)
+#define ZFS_IOC_DESTROY _IOWR('Z', 26, struct zfs_cmd)
+#define ZFS_IOC_ROLLBACK _IOWR('Z', 27, struct zfs_cmd)
+#define ZFS_IOC_RENAME _IOWR('Z', 28, struct zfs_cmd)
+#define ZFS_IOC_RECVBACKUP _IOWR('Z', 29, struct zfs_cmd)
+#define ZFS_IOC_SENDBACKUP _IOWR('Z', 30, struct zfs_cmd)
+#define ZFS_IOC_INJECT_FAULT _IOWR('Z', 31, struct zfs_cmd)
+#define ZFS_IOC_CLEAR_FAULT _IOWR('Z', 32, struct zfs_cmd)
+#define ZFS_IOC_INJECT_LIST_NEXT _IOWR('Z', 33, struct zfs_cmd)
+#define ZFS_IOC_ERROR_LOG _IOWR('Z', 34, struct zfs_cmd)
+#define ZFS_IOC_CLEAR _IOWR('Z', 35, struct zfs_cmd)
+#define ZFS_IOC_PROMOTE _IOWR('Z', 36, struct zfs_cmd)
+#define ZFS_IOC_DESTROY_SNAPS _IOWR('Z', 37, struct zfs_cmd)
+#define ZFS_IOC_SNAPSHOT _IOWR('Z', 38, struct zfs_cmd)
+#define ZFS_IOC_DSOBJ_TO_DSNAME _IOWR('Z', 39, struct zfs_cmd)
+#define ZFS_IOC_OBJ_TO_PATH _IOWR('Z', 40, struct zfs_cmd)
+#define ZFS_IOC_POOL_SET_PROPS _IOWR('Z', 41, struct zfs_cmd)
+#define ZFS_IOC_POOL_GET_PROPS _IOWR('Z', 42, struct zfs_cmd)
+#define ZFS_IOC_JAIL _IOWR('Z', 43, struct zfs_cmd)
+#define ZFS_IOC_UNJAIL _IOWR('Z', 44, struct zfs_cmd)
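Each request code embeds its per-command ordinal in the low byte, which ZFS_IOC() recovers; the sketch below assumes struct zfs_cmd (defined in the ZFS ioctl header, not shown here) is visible so the _IOWR() expansions compile:

    zfs_ioc_t req = ZFS_IOC_POOL_STATS;     /* _IOWR('Z', 5, struct zfs_cmd) */
    int cmd_index = ZFS_IOC(req);           /* yields 5 */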
+
+/*
+ * Internal SPA load state. Used by FMA diagnosis engine.
+ */
+typedef enum {
+ SPA_LOAD_NONE, /* no load in progress */
+ SPA_LOAD_OPEN, /* normal open */
+ SPA_LOAD_IMPORT, /* import in progress */
+ SPA_LOAD_TRYIMPORT /* tryimport in progress */
+} spa_load_state_t;
+
+/*
+ * Bookmark name values.
+ */
+#define ZPOOL_ERR_LIST "error list"
+#define ZPOOL_ERR_DATASET "dataset"
+#define ZPOOL_ERR_OBJECT "object"
+
+#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1)
+
+/*
+ * The following are names used in the nvlist describing
+ * the pool's history log.
+ */
+#define ZPOOL_HIST_RECORD "history record"
+#define ZPOOL_HIST_TIME "history time"
+#define ZPOOL_HIST_CMD "history command"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/gfs.h b/sys/contrib/opensolaris/uts/common/sys/gfs.h
new file mode 100644
index 000000000000..8e70f2993ecf
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/gfs.h
@@ -0,0 +1,139 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_GFS_H
+#define _SYS_GFS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/vnode.h>
+#include <sys/mutex.h>
+#include <sys/dirent.h>
+#include <sys/uio.h>
+#include <sys/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GFS_CACHE_VNODE 0x1
+
+typedef struct gfs_dirent {
+ char *gfse_name; /* entry name */
+ vnode_t *(*gfse_ctor)(vnode_t *); /* constructor */
+ int gfse_flags; /* flags */
+ list_node_t gfse_link; /* dynamic list */
+ vnode_t *gfse_vnode; /* cached vnode */
+} gfs_dirent_t;
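A gfs-based pseudo-filesystem typically describes its fixed directory contents with a NULL-terminated table of these entries; the names and constructor functions below are hypothetical and only show the shape of such a table (each constructor has the vnode_t *(*)(vnode_t *) signature above):

    static gfs_dirent_t example_entries[] = {
            { "config", example_config_ctor, GFS_CACHE_VNODE },  /* cache vnode */
            { "stats",  example_stats_ctor,  0 },
            { NULL }
    };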
+
+typedef enum gfs_type {
+ GFS_DIR,
+ GFS_FILE
+} gfs_type_t;
+
+typedef struct gfs_file {
+ vnode_t *gfs_vnode; /* current vnode */
+ vnode_t *gfs_parent; /* parent vnode */
+ size_t gfs_size; /* size of private data structure */
+ gfs_type_t gfs_type; /* type of vnode */
+ int gfs_index; /* index in parent dir */
+ ino64_t gfs_ino; /* inode for this vnode */
+} gfs_file_t;
+
+typedef int (*gfs_readdir_cb)(vnode_t *, struct dirent64 *, int *, offset_t *,
+ offset_t *, void *);
+typedef int (*gfs_lookup_cb)(vnode_t *, const char *, vnode_t **, ino64_t *);
+typedef ino64_t (*gfs_inode_cb)(vnode_t *, int);
+
+typedef struct gfs_dir {
+ gfs_file_t gfsd_file; /* generic file attributes */
+ gfs_dirent_t *gfsd_static; /* statically defined entries */
+ int gfsd_nstatic; /* # static entries */
+ kmutex_t gfsd_lock; /* protects entries */
+ int gfsd_maxlen; /* maximum name length */
+ gfs_readdir_cb gfsd_readdir; /* readdir() callback */
+ gfs_lookup_cb gfsd_lookup; /* lookup() callback */
+ gfs_inode_cb gfsd_inode; /* get an inode number */
+} gfs_dir_t;
+
+struct vfs;
+
+extern vnode_t *gfs_file_create(size_t, vnode_t *, vfs_t *, vnodeops_t *);
+extern vnode_t *gfs_dir_create(size_t, vnode_t *, vfs_t *, vnodeops_t *,
+ gfs_dirent_t *, gfs_inode_cb, int, gfs_readdir_cb, gfs_lookup_cb);
+extern vnode_t *gfs_root_create(size_t, vfs_t *, vnodeops_t *, ino64_t,
+ gfs_dirent_t *, gfs_inode_cb, int, gfs_readdir_cb, gfs_lookup_cb);
+extern vnode_t *gfs_root_create_file(size_t, struct vfs *, vnodeops_t *,
+ ino64_t);
+
+extern void *gfs_file_inactive(vnode_t *);
+extern void *gfs_dir_inactive(vnode_t *);
+
+extern int gfs_dir_lookup(vnode_t *, const char *, vnode_t **);
+extern int gfs_dir_readdir(vnode_t *, uio_t *, int *, int *, u_long **, void *);
+
+#define gfs_dir_lock(gd) mutex_enter(&(gd)->gfsd_lock)
+#define gfs_dir_unlock(gd) mutex_exit(&(gd)->gfsd_lock)
+
+#define gfs_file_parent(vp) (((gfs_file_t *)(vp)->v_data)->gfs_parent)
+
+#define gfs_file_index(vp) (((gfs_file_t *)(vp)->v_data)->gfs_index)
+#define gfs_file_set_index(vp, idx) \
+ (((gfs_file_t *)(vp)->v_data)->gfs_index = (idx))
+
+#define gfs_file_inode(vp) (((gfs_file_t *)(vp)->v_data)->gfs_ino)
+#define gfs_file_set_inode(vp, ino) \
+ (((gfs_file_t *)(vp)->v_data)->gfs_ino = (ino))
+
+typedef struct gfs_readdir_state {
+ struct dirent64 *grd_dirent; /* directory entry buffer */
+ size_t grd_namlen; /* max file name length */
+ size_t grd_ureclen; /* exported record size */
+ ssize_t grd_oresid; /* original uio_resid */
+ ino64_t grd_parent; /* inode of parent */
+ ino64_t grd_self; /* inode of self */
+} gfs_readdir_state_t;
+
+extern int gfs_readdir_init(gfs_readdir_state_t *, int, int, uio_t *, ino64_t,
+ ino64_t);
+extern int gfs_readdir_emit(gfs_readdir_state_t *, uio_t *, offset_t, ino64_t,
+ const char *, int *, u_long **);
+extern int gfs_readdir_pred(gfs_readdir_state_t *, uio_t *, offset_t *, int *,
+ u_long **);
+extern int gfs_readdir_fini(gfs_readdir_state_t *, int, int *, int);
+
+extern int gfs_lookup_dot(vnode_t **, vnode_t *, vnode_t *, const char *);
+
+extern int gfs_vop_readdir(struct vop_readdir_args *);
+extern int gfs_vop_inactive(struct vop_inactive_args *);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_GFS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/isa_defs.h b/sys/contrib/opensolaris/uts/common/sys/isa_defs.h
new file mode 100644
index 000000000000..08fcbd02e5c0
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/isa_defs.h
@@ -0,0 +1,479 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ISA_DEFS_H
+#define _SYS_ISA_DEFS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This header file serves to group a set of well known defines and to
+ * set these for each instruction set architecture. These defines may
+ * be divided into two groups; characteristics of the processor and
+ * implementation choices for Solaris on a processor.
+ *
+ * Processor Characteristics:
+ *
+ * _LITTLE_ENDIAN / _BIG_ENDIAN:
+ * The natural byte order of the processor. A pointer to an int points
+ * to the least/most significant byte of that int.
+ *
+ * _STACK_GROWS_UPWARD / _STACK_GROWS_DOWNWARD:
+ * The processor specific direction of stack growth. A push onto the
+ * stack increases/decreases the stack pointer, so it stores data at
+ * successively higher/lower addresses. (Stackless machines ignored
+ * without regrets).
+ *
+ * _LONG_LONG_HTOL / _LONG_LONG_LTOH:
+ * A pointer to a long long points to the most/least significant long
+ * within that long long.
+ *
+ * _BIT_FIELDS_HTOL / _BIT_FIELDS_LTOH:
+ * The C compiler assigns bit fields from the high/low to the low/high end
+ * of an int (most to least significant vs. least to most significant).
+ *
+ * _IEEE_754:
+ * The processor (or supported implementations of the processor)
+ * supports the ieee-754 floating point standard. No other floating
+ * point standards are supported (or significant). Any other supported
+ * floating point formats are expected to be cased on the ISA processor
+ * symbol.
+ *
+ * _CHAR_IS_UNSIGNED / _CHAR_IS_SIGNED:
+ * The C Compiler implements objects of type `char' as `unsigned' or
+ * `signed' respectively. This is really an implementation choice of
+ * the compiler writer, but it is specified in the ABI and tends to
+ * be uniform across compilers for an instruction set architecture.
+ * Hence, it has the properties of a processor characteristic.
+ *
+ * _CHAR_ALIGNMENT / _SHORT_ALIGNMENT / _INT_ALIGNMENT / _LONG_ALIGNMENT /
+ * _LONG_LONG_ALIGNMENT / _DOUBLE_ALIGNMENT / _LONG_DOUBLE_ALIGNMENT /
+ * _POINTER_ALIGNMENT / _FLOAT_ALIGNMENT:
+ * The ABI defines alignment requirements of each of the primitive
+ * object types. Some, if not all, may be hardware requirements as
+ * well. The values are expressed in "byte-alignment" units.
+ *
+ * _MAX_ALIGNMENT:
+ * The most stringent alignment requirement as specified by the ABI.
+ * Equal to the maximum of all the above _XXX_ALIGNMENT values.
+ *
+ * _ALIGNMENT_REQUIRED:
+ * True or false (1 or 0) whether or not the hardware requires the ABI
+ * alignment.
+ *
+ * _LONG_LONG_ALIGNMENT_32
+ * The 32-bit ABI supported by a 64-bit kernel may have different
+ * alignment requirements for primitive object types. The value of this
+ * identifier is expressed in "byte-alignment" units.
+ *
+ * _HAVE_CPUID_INSN
+ * This indicates that the architecture supports the 'cpuid'
+ * instruction as defined by Intel. (Intel allows other vendors
+ * to extend the instruction for their own purposes.)
+ *
+ *
+ * Implementation Choices:
+ *
+ * _ILP32 / _LP64:
+ * This specifies the compiler data type implementation as specified in
+ * the relevant ABI. The choice between these is strongly influenced
+ * by the underlying hardware, but is not absolutely tied to it.
+ * Currently only two data type models are supported:
+ *
+ * _ILP32:
+ * Int/Long/Pointer are 32 bits. This is the historical UNIX
+ * and Solaris implementation. Due to its historical standing,
+ * this is the default case.
+ *
+ * _LP64:
+ * Long/Pointer are 64 bits, Int is 32 bits. This is the chosen
+ * implementation for 64-bit ABIs such as SPARC V9.
+ *
+ * _I32LPx:
+ * A compilation environment where 'int' is 32-bit, and
+ * longs and pointers are simply the same size.
+ *
+ * In all cases, Char is 8 bits and Short is 16 bits.
+ *
+ * _SUNOS_VTOC_8 / _SUNOS_VTOC_16 / _SVR4_VTOC_16:
+ * This specifies the form of the disk VTOC (or label):
+ *
+ * _SUNOS_VTOC_8:
+ * This is a VTOC form which is upwardly compatible with the
+ * SunOS 4.x disk label and allows 8 partitions per disk.
+ *
+ * _SUNOS_VTOC_16:
+ * In this format the incore vtoc image matches the ondisk
+ * version. It allows 16 slices per disk, and is not
+ * compatible with the SunOS 4.x disk label.
+ *
+ * Note that these are not the only two VTOC forms possible and
+ * additional forms may be added. One possible form would be the
+ * SVr4 VTOC form. The symbol for that is reserved now, although
+ * it is not implemented.
+ *
+ * _SVR4_VTOC_16:
+ * This VTOC form is compatible with the System V Release 4
+ * VTOC (as implemented on the SVr4 Intel and 3b ports) with
+ * 16 partitions per disk.
+ *
+ *
+ * _DMA_USES_PHYSADDR / _DMA_USES_VIRTADDR
+ * This describes the type of addresses used by system DMA:
+ *
+ * _DMA_USES_PHYSADDR:
+ * This type of DMA, used in the x86 implementation,
+ * requires physical addresses for DMA buffers. The 24-bit
+ * addresses used by some legacy boards are the source of the
+ * "low-memory" (<16MB) requirement for some devices using DMA.
+ *
+ * _DMA_USES_VIRTADDR:
+ * This method of DMA allows the use of virtual addresses for
+ * DMA transfers.
+ *
+ * _FIRMWARE_NEEDS_FDISK / _NO_FDISK_PRESENT
+ * This indicates the presence/absence of an fdisk table.
+ *
+ * _FIRMWARE_NEEDS_FDISK
+ * The fdisk table is required by system firmware. If present,
+ * it allows a disk to be subdivided into multiple fdisk
+ * partitions, each of which is equivalent to a separate,
+ * virtual disk. This enables the co-existence of multiple
+ * operating systems on a shared hard disk.
+ *
+ * _NO_FDISK_PRESENT
+ * If the fdisk table is absent, it is assumed that the entire
+ * media is allocated for a single operating system.
+ *
+ * _HAVE_TEM_FIRMWARE
+ * Defined if this architecture has the (fallback) option of
+ * using prom_* calls for doing I/O if a suitable kernel driver
+ * is not available to do it.
+ *
+ * _DONT_USE_1275_GENERIC_NAMES
+ * Controls whether or not device tree node names should
+ * comply with the IEEE 1275 "Generic Names" Recommended
+ * Practice. With _DONT_USE_1275_GENERIC_NAMES, device-specific
+ * names identifying the particular device will be used.
+ *
+ * __i386_COMPAT
+ * This indicates whether the i386 ABI is supported as a *non-native*
+ * mode for the platform. When this symbol is defined:
+ * - 32-bit xstat-style system calls are enabled
+ * - 32-bit xmknod-style system calls are enabled
+ * - 32-bit system calls use i386 sizes -and- alignments
+ *
+ * Note that this is NOT defined for the i386 native environment!
+ *
+ * __x86
+ * This is ONLY a synonym for defined(__i386) || defined(__amd64)
+ * which is useful only insofar as these two architectures share
+ * common attributes. Analogous to __sparc.
+ *
+ * _PSM_MODULES
+ * This indicates whether or not the implementation uses PSM
+ * modules for processor support, reading /etc/mach from inside
+ * the kernel to extract a list.
+ *
+ * _RTC_CONFIG
+ * This indicates whether or not the implementation uses /etc/rtc_config
+ * to configure the real-time clock in the kernel.
+ *
+ * _UNIX_KRTLD
+ * This indicates that the implementation uses a dynamically
+ * linked unix + krtld to form the core kernel image at boot
+ * time, or (in the absence of this symbol) a prelinked kernel image.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The following set of definitions characterize Solaris on AMD's
+ * 64-bit systems.
+ */
+#if defined(__x86_64) || defined(__amd64)
+
+#if !defined(__amd64)
+#define __amd64 /* preferred guard */
+#endif
+
+#if !defined(__x86)
+#define __x86
+#endif
+
+/*
+ * Define the appropriate "processor characteristics"
+ */
+#define _LITTLE_ENDIAN
+#define _STACK_GROWS_DOWNWARD
+#define _LONG_LONG_LTOH
+#define _BIT_FIELDS_LTOH
+#define _IEEE_754
+#define _CHAR_IS_SIGNED
+#define _BOOL_ALIGNMENT 1
+#define _CHAR_ALIGNMENT 1
+#define _SHORT_ALIGNMENT 2
+#define _INT_ALIGNMENT 4
+#define _FLOAT_ALIGNMENT 4
+#define _FLOAT_COMPLEX_ALIGNMENT 4
+#define _LONG_ALIGNMENT 8
+#define _LONG_LONG_ALIGNMENT 8
+#define _DOUBLE_ALIGNMENT 8
+#define _DOUBLE_COMPLEX_ALIGNMENT 8
+#define _LONG_DOUBLE_ALIGNMENT 16
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16
+#define _POINTER_ALIGNMENT 8
+#define _MAX_ALIGNMENT 16
+#define _ALIGNMENT_REQUIRED 1
+
+/*
+ * Different alignment constraints for the i386 ABI in compatibility mode
+ */
+#define _LONG_LONG_ALIGNMENT_32 4
+
+/*
+ * Define the appropriate "implementation choices".
+ */
+#if !defined(_LP64)
+#define _LP64
+#endif
+#if !defined(_I32LPx) && defined(_KERNEL)
+#define _I32LPx
+#endif
+#define _MULTI_DATAMODEL
+#define _SUNOS_VTOC_16
+#define _DMA_USES_PHYSADDR
+#define _FIRMWARE_NEEDS_FDISK
+#define __i386_COMPAT
+#define _PSM_MODULES
+#define _RTC_CONFIG
+#define _DONT_USE_1275_GENERIC_NAMES
+#define _HAVE_CPUID_INSN
+
+/*
+ * The feature test macro __i386 is generic for all processors implementing
+ * the Intel 386 instruction set or a superset of it. Specifically, this
+ * includes all members of the 386, 486, and Pentium family of processors.
+ */
+#elif defined(__i386) || defined(__i386__)
+
+#if !defined(__i386)
+#define __i386
+#endif
+
+#if !defined(__x86)
+#define __x86
+#endif
+
+/*
+ * Define the appropriate "processor characteristics"
+ */
+#define _LITTLE_ENDIAN
+#define _STACK_GROWS_DOWNWARD
+#define _LONG_LONG_LTOH
+#define _BIT_FIELDS_LTOH
+#define _IEEE_754
+#define _CHAR_IS_SIGNED
+#define _BOOL_ALIGNMENT 1
+#define _CHAR_ALIGNMENT 1
+#define _SHORT_ALIGNMENT 2
+#define _INT_ALIGNMENT 4
+#define _FLOAT_ALIGNMENT 4
+#define _FLOAT_COMPLEX_ALIGNMENT 4
+#define _LONG_ALIGNMENT 4
+#define _LONG_LONG_ALIGNMENT 4
+#define _DOUBLE_ALIGNMENT 4
+#define _DOUBLE_COMPLEX_ALIGNMENT 4
+#define _LONG_DOUBLE_ALIGNMENT 4
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4
+#define _POINTER_ALIGNMENT 4
+#define _MAX_ALIGNMENT 4
+#define _ALIGNMENT_REQUIRED 0
+
+#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT
+
+/*
+ * Define the appropriate "implementation choices".
+ */
+#define _ILP32
+#if !defined(_I32LPx) && defined(_KERNEL)
+#define _I32LPx
+#endif
+#define _SUNOS_VTOC_16
+#define _DMA_USES_PHYSADDR
+#define _FIRMWARE_NEEDS_FDISK
+#define _PSM_MODULES
+#define _RTC_CONFIG
+#define _DONT_USE_1275_GENERIC_NAMES
+#define _HAVE_CPUID_INSN
+
+/*
+ * The following set of definitions characterize the Solaris on SPARC systems.
+ *
+ * The symbol __sparc indicates any of the SPARC family of processor
+ * architectures. This includes SPARC V7, SPARC V8 and SPARC V9.
+ *
+ * The symbol __sparcv8 indicates the 32-bit SPARC V8 architecture as defined
+ * by Version 8 of the SPARC Architecture Manual. (SPARC V7 is close enough
+ * to SPARC V8 for the former to be subsumed into the latter definition.)
+ *
+ * The symbol __sparcv9 indicates the 64-bit SPARC V9 architecture as defined
+ * by Version 9 of the SPARC Architecture Manual.
+ *
+ * The symbols __sparcv8 and __sparcv9 are mutually exclusive, and are only
+ * relevant when the symbol __sparc is defined.
+ */
+/*
+ * XXX Due to the existence of 5110166, "defined(__sparcv9)" needs to be added
+ * to support backwards builds. This workaround should be removed in s10_71.
+ */
+#elif defined(__sparc) || defined(__sparcv9) || defined(__sparc__)
+#if !defined(__sparc)
+#define __sparc
+#endif
+
+/*
+ * You can be 32-bit or 64-bit, but not both at the same time.
+ */
+#if defined(__sparcv8) && defined(__sparcv9)
+#error "SPARC Versions 8 and 9 are mutually exclusive choices"
+#endif
+
+/*
+ * Existing compilers do not set __sparcv8. Years will transpire before
+ * the compilers can be depended on to set the feature test macro. In
+ * the interim, we'll set it here on the basis of historical behaviour;
+ * if you haven't asked for SPARC V9, then you must've meant SPARC V8.
+ */
+#if !defined(__sparcv9) && !defined(__sparcv8)
+#define __sparcv8
+#endif
+
+/*
+ * Define the appropriate "processor characteristics" shared between
+ * all Solaris on SPARC systems.
+ */
+#define _BIG_ENDIAN
+#define _STACK_GROWS_DOWNWARD
+#define _LONG_LONG_HTOL
+#define _BIT_FIELDS_HTOL
+#define _IEEE_754
+#define _CHAR_IS_SIGNED
+#define _BOOL_ALIGNMENT 1
+#define _CHAR_ALIGNMENT 1
+#define _SHORT_ALIGNMENT 2
+#define _INT_ALIGNMENT 4
+#define _FLOAT_ALIGNMENT 4
+#define _FLOAT_COMPLEX_ALIGNMENT 4
+#define _LONG_LONG_ALIGNMENT 8
+#define _DOUBLE_ALIGNMENT 8
+#define _DOUBLE_COMPLEX_ALIGNMENT 8
+#define _ALIGNMENT_REQUIRED 1
+
+/*
+ * Define the appropriate "implementation choices" shared between versions.
+ */
+#define _SUNOS_VTOC_8
+#define _DMA_USES_VIRTADDR
+#define _NO_FDISK_PRESENT
+#define _HAVE_TEM_FIRMWARE
+#define _UNIX_KRTLD
+
+/*
+ * The following set of definitions characterize the implementation of
+ * 32-bit Solaris on SPARC V8 systems.
+ */
+#if defined(__sparcv8)
+
+/*
+ * Define the appropriate "processor characteristics"
+ */
+#define _LONG_ALIGNMENT 4
+#define _LONG_DOUBLE_ALIGNMENT 8
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8
+#define _POINTER_ALIGNMENT 4
+#define _MAX_ALIGNMENT 8
+
+#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT
+
+/*
+ * Define the appropriate "implementation choices"
+ */
+#define _ILP32
+#if !defined(_I32LPx) && defined(_KERNEL)
+#define _I32LPx
+#endif
+
+/*
+ * The following set of definitions characterize the implementation of
+ * 64-bit Solaris on SPARC V9 systems.
+ */
+#elif defined(__sparcv9)
+
+/*
+ * Define the appropriate "processor characteristics"
+ */
+#define _LONG_ALIGNMENT 8
+#define _LONG_DOUBLE_ALIGNMENT 16
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16
+#define _POINTER_ALIGNMENT 8
+#define _MAX_ALIGNMENT 16
+
+#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT
+
+/*
+ * Define the appropriate "implementation choices"
+ */
+#if !defined(_LP64)
+#define _LP64
+#endif
+#if !defined(_I32LPx)
+#define _I32LPx
+#endif
+#define _MULTI_DATAMODEL
+
+#else
+#error "unknown SPARC version"
+#endif
+
+/*
+ * #error is strictly ansi-C, but works as well as anything for K&R systems.
+ */
+#else
+#error "ISA not supported"
+#endif
+
+#if defined(_ILP32) && defined(_LP64)
+#error "Both _ILP32 and _LP64 are defined"
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ISA_DEFS_H */
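The data-model and endianness symbols defined above are consumed with plain preprocessor tests throughout the tree. A minimal sketch, not part of the import, using only macros declared in this header (my_uintptr_t and MY_SWAP_NEEDED are hypothetical names):

    #include <sys/isa_defs.h>

    #if defined(_LP64)
    typedef unsigned long my_uintptr_t;     /* pointers are 64 bits wide */
    #elif defined(_ILP32)
    typedef unsigned int my_uintptr_t;      /* pointers are 32 bits wide */
    #else
    #error "neither _ILP32 nor _LP64 is defined"
    #endif

    #if defined(_BIG_ENDIAN)
    #define MY_SWAP_NEEDED  0               /* already in network byte order */
    #else
    #define MY_SWAP_NEEDED  1
    #endif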
diff --git a/sys/contrib/opensolaris/uts/common/sys/list.h b/sys/contrib/opensolaris/uts/common/sys/list.h
new file mode 100644
index 000000000000..7e9d9aaaf750
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/list.h
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_LIST_H
+#define _SYS_LIST_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/list_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct list_node list_node_t;
+typedef struct list list_t;
+
+void list_create(list_t *, size_t, size_t);
+void list_destroy(list_t *);
+
+void list_insert_after(list_t *, void *, void *);
+void list_insert_before(list_t *, void *, void *);
+void list_insert_head(list_t *, void *);
+void list_insert_tail(list_t *, void *);
+void list_remove(list_t *, void *);
+void list_move_tail(list_t *, list_t *);
+
+void *list_head(list_t *);
+void *list_tail(list_t *);
+void *list_next(list_t *, void *);
+void *list_prev(list_t *, void *);
+
+int list_link_active(list_node_t *);
+int list_is_empty(list_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LIST_H */
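The list interfaces above implement the usual Solaris embedded-node pattern: the caller embeds a list_node_t in its own structure and tells list_create() the element size and the offset of that node. A minimal usage sketch, not part of the import (struct foo and the identifiers are hypothetical; offsetof comes from <sys/sysmacros.h> in the kernel or <stddef.h> in userland):

    #include <sys/list.h>
    #include <stddef.h>

    struct foo {
            int             foo_value;
            list_node_t     foo_node;       /* embedded linkage */
    };

    static void
    foo_list_example(void)
    {
            list_t lst;
            struct foo a, b, *fp;

            /* element size and offset of the embedded list_node_t */
            list_create(&lst, sizeof (struct foo),
                offsetof(struct foo, foo_node));

            list_insert_tail(&lst, &a);
            list_insert_tail(&lst, &b);

            /* walk from head to tail */
            for (fp = list_head(&lst); fp != NULL; fp = list_next(&lst, fp))
                    (void) fp->foo_value;   /* use the element */

            list_remove(&lst, &a);
            list_remove(&lst, &b);
            list_destroy(&lst);             /* list must be empty by now */
    }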
diff --git a/sys/contrib/opensolaris/uts/common/sys/list_impl.h b/sys/contrib/opensolaris/uts/common/sys/list_impl.h
new file mode 100644
index 000000000000..9c42f8832023
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/list_impl.h
@@ -0,0 +1,53 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_LIST_IMPL_H
+#define _SYS_LIST_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct list_node {
+ struct list_node *list_next;
+ struct list_node *list_prev;
+};
+
+struct list {
+ size_t list_size;
+ size_t list_offset;
+ struct list_node list_head;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LIST_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/note.h b/sys/contrib/opensolaris/uts/common/sys/note.h
new file mode 100644
index 000000000000..2cb7fd89b7dd
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/note.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1994 by Sun Microsystems, Inc.
+ */
+
+/*
+ * sys/note.h: interface for annotating source with info for tools
+ *
+ * This is the underlying interface; NOTE (/usr/include/note.h) is the
+ * preferred interface, but all exported header files should include this
+ * file directly and use _NOTE so as not to take "NOTE" from the user's
+ * namespace. For consistency, *all* kernel source should use _NOTE.
+ *
+ * By default, annotations expand to nothing. This file implements
+ * that. Tools using annotations will interpose a different version
+ * of this file that will expand annotations as needed.
+ */
+
+#ifndef _SYS_NOTE_H
+#define _SYS_NOTE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _NOTE
+#define _NOTE(s)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_NOTE_H */
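Because _NOTE() expands to nothing by default, annotations cost nothing at compile time and only acquire meaning when an analysis tool interposes its own version of this header. An illustrative sketch, not part of the import (the annotation name and function are hypothetical examples of the style used elsewhere in the OpenSolaris sources):

    #include <sys/note.h>

    /* tell analysis tools that the argument is deliberately unused */
    static void
    foo_callback(void *arg)
    {
            _NOTE(ARGUNUSED(arg))
    }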
diff --git a/sys/contrib/opensolaris/uts/common/sys/nvpair.h b/sys/contrib/opensolaris/uts/common/sys/nvpair.h
new file mode 100644
index 000000000000..306e30f3a77a
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/nvpair.h
@@ -0,0 +1,260 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_NVPAIR_H
+#define _SYS_NVPAIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/errno.h>
+
+#if defined(_KERNEL) && !defined(_BOOT)
+#include <sys/kmem.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ DATA_TYPE_UNKNOWN = 0,
+ DATA_TYPE_BOOLEAN,
+ DATA_TYPE_BYTE,
+ DATA_TYPE_INT16,
+ DATA_TYPE_UINT16,
+ DATA_TYPE_INT32,
+ DATA_TYPE_UINT32,
+ DATA_TYPE_INT64,
+ DATA_TYPE_UINT64,
+ DATA_TYPE_STRING,
+ DATA_TYPE_BYTE_ARRAY,
+ DATA_TYPE_INT16_ARRAY,
+ DATA_TYPE_UINT16_ARRAY,
+ DATA_TYPE_INT32_ARRAY,
+ DATA_TYPE_UINT32_ARRAY,
+ DATA_TYPE_INT64_ARRAY,
+ DATA_TYPE_UINT64_ARRAY,
+ DATA_TYPE_STRING_ARRAY,
+ DATA_TYPE_HRTIME,
+ DATA_TYPE_NVLIST,
+ DATA_TYPE_NVLIST_ARRAY,
+ DATA_TYPE_BOOLEAN_VALUE,
+ DATA_TYPE_INT8,
+ DATA_TYPE_UINT8,
+ DATA_TYPE_BOOLEAN_ARRAY,
+ DATA_TYPE_INT8_ARRAY,
+ DATA_TYPE_UINT8_ARRAY
+} data_type_t;
+
+typedef struct nvpair {
+ int32_t nvp_size; /* size of this nvpair */
+ int16_t nvp_name_sz; /* length of name string */
+ int16_t nvp_reserve; /* not used */
+ int32_t nvp_value_elem; /* number of elements for array types */
+ data_type_t nvp_type; /* type of value */
+ /* name string */
+ /* aligned ptr array for string arrays */
+ /* aligned array of data for value */
+} nvpair_t;
+
+/* nvlist header */
+typedef struct nvlist {
+ int32_t nvl_version;
+ uint32_t nvl_nvflag; /* persistent flags */
+ uint64_t nvl_priv; /* ptr to private data if not packed */
+ uint32_t nvl_flag;
+ int32_t nvl_pad; /* currently not used, for alignment */
+} nvlist_t;
+
+/* nvp implementation version */
+#define NV_VERSION 0
+
+/* nvlist pack encoding */
+#define NV_ENCODE_NATIVE 0
+#define NV_ENCODE_XDR 1
+
+/* nvlist persistent unique name flags, stored in nvl_nvflag */
+#define NV_UNIQUE_NAME 0x1
+#define NV_UNIQUE_NAME_TYPE 0x2
+
+/* nvlist lookup pairs related flags */
+#define NV_FLAG_NOENTOK 0x1
+
+/* convenience macros */
+#define NV_ALIGN(x) (((ulong_t)(x) + 7ul) & ~7ul)
+#define NV_ALIGN4(x) (((x) + 3) & ~3)
+
+#define NVP_SIZE(nvp) ((nvp)->nvp_size)
+#define NVP_NAME(nvp) ((char *)(nvp) + sizeof (nvpair_t))
+#define NVP_TYPE(nvp) ((nvp)->nvp_type)
+#define NVP_NELEM(nvp) ((nvp)->nvp_value_elem)
+#define NVP_VALUE(nvp) ((char *)(nvp) + NV_ALIGN(sizeof (nvpair_t) \
+ + (nvp)->nvp_name_sz))
+
+#define NVL_VERSION(nvl) ((nvl)->nvl_version)
+#define NVL_SIZE(nvl) ((nvl)->nvl_size)
+#define NVL_FLAG(nvl) ((nvl)->nvl_flag)
+
+/* NV allocator framework */
+typedef struct nv_alloc_ops nv_alloc_ops_t;
+
+typedef struct nv_alloc {
+ const nv_alloc_ops_t *nva_ops;
+ void *nva_arg;
+} nv_alloc_t;
+
+struct nv_alloc_ops {
+ int (*nv_ao_init)(nv_alloc_t *, __va_list);
+ void (*nv_ao_fini)(nv_alloc_t *);
+ void *(*nv_ao_alloc)(nv_alloc_t *, size_t);
+ void (*nv_ao_free)(nv_alloc_t *, void *, size_t);
+ void (*nv_ao_reset)(nv_alloc_t *);
+};
+
+extern const nv_alloc_ops_t *nv_fixed_ops;
+extern nv_alloc_t *nv_alloc_nosleep;
+
+#if defined(_KERNEL) && !defined(_BOOT)
+extern nv_alloc_t *nv_alloc_sleep;
+#endif
+
+int nv_alloc_init(nv_alloc_t *, const nv_alloc_ops_t *, /* args */ ...);
+void nv_alloc_reset(nv_alloc_t *);
+void nv_alloc_fini(nv_alloc_t *);
+
+/* list management */
+int nvlist_alloc(nvlist_t **, uint_t, int);
+void nvlist_free(nvlist_t *);
+int nvlist_size(nvlist_t *, size_t *, int);
+int nvlist_pack(nvlist_t *, char **, size_t *, int, int);
+int nvlist_unpack(char *, size_t, nvlist_t **, int);
+int nvlist_dup(nvlist_t *, nvlist_t **, int);
+int nvlist_merge(nvlist_t *, nvlist_t *, int);
+
+int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *);
+int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *);
+int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *);
+int nvlist_xdup(nvlist_t *, nvlist_t **, nv_alloc_t *);
+nv_alloc_t *nvlist_lookup_nv_alloc(nvlist_t *);
+
+int nvlist_add_nvpair(nvlist_t *, nvpair_t *);
+int nvlist_add_boolean(nvlist_t *, const char *);
+int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t);
+int nvlist_add_byte(nvlist_t *, const char *, uchar_t);
+int nvlist_add_int8(nvlist_t *, const char *, int8_t);
+int nvlist_add_uint8(nvlist_t *, const char *, uint8_t);
+int nvlist_add_int16(nvlist_t *, const char *, int16_t);
+int nvlist_add_uint16(nvlist_t *, const char *, uint16_t);
+int nvlist_add_int32(nvlist_t *, const char *, int32_t);
+int nvlist_add_uint32(nvlist_t *, const char *, uint32_t);
+int nvlist_add_int64(nvlist_t *, const char *, int64_t);
+int nvlist_add_uint64(nvlist_t *, const char *, uint64_t);
+int nvlist_add_string(nvlist_t *, const char *, const char *);
+int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *);
+int nvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t);
+int nvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t);
+int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t);
+int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t);
+int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t);
+int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t);
+int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t);
+int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t);
+int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t);
+int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t);
+int nvlist_add_string_array(nvlist_t *, const char *, char *const *, uint_t);
+int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t);
+int nvlist_add_hrtime(nvlist_t *, const char *, hrtime_t);
+
+int nvlist_remove(nvlist_t *, const char *, data_type_t);
+int nvlist_remove_all(nvlist_t *, const char *);
+
+int nvlist_lookup_boolean(nvlist_t *, const char *);
+int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *);
+int nvlist_lookup_byte(nvlist_t *, const char *, uchar_t *);
+int nvlist_lookup_int8(nvlist_t *, const char *, int8_t *);
+int nvlist_lookup_uint8(nvlist_t *, const char *, uint8_t *);
+int nvlist_lookup_int16(nvlist_t *, const char *, int16_t *);
+int nvlist_lookup_uint16(nvlist_t *, const char *, uint16_t *);
+int nvlist_lookup_int32(nvlist_t *, const char *, int32_t *);
+int nvlist_lookup_uint32(nvlist_t *, const char *, uint32_t *);
+int nvlist_lookup_int64(nvlist_t *, const char *, int64_t *);
+int nvlist_lookup_uint64(nvlist_t *, const char *, uint64_t *);
+int nvlist_lookup_string(nvlist_t *, const char *, char **);
+int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **);
+int nvlist_lookup_boolean_array(nvlist_t *, const char *,
+ boolean_t **, uint_t *);
+int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, uint_t *);
+int nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, uint_t *);
+int nvlist_lookup_uint8_array(nvlist_t *, const char *, uint8_t **, uint_t *);
+int nvlist_lookup_int16_array(nvlist_t *, const char *, int16_t **, uint_t *);
+int nvlist_lookup_uint16_array(nvlist_t *, const char *, uint16_t **, uint_t *);
+int nvlist_lookup_int32_array(nvlist_t *, const char *, int32_t **, uint_t *);
+int nvlist_lookup_uint32_array(nvlist_t *, const char *, uint32_t **, uint_t *);
+int nvlist_lookup_int64_array(nvlist_t *, const char *, int64_t **, uint_t *);
+int nvlist_lookup_uint64_array(nvlist_t *, const char *, uint64_t **, uint_t *);
+int nvlist_lookup_string_array(nvlist_t *, const char *, char ***, uint_t *);
+int nvlist_lookup_nvlist_array(nvlist_t *, const char *,
+ nvlist_t ***, uint_t *);
+int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *);
+int nvlist_lookup_pairs(nvlist_t *nvl, int, ...);
+
+/* processing nvpair */
+nvpair_t *nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *);
+char *nvpair_name(nvpair_t *);
+data_type_t nvpair_type(nvpair_t *);
+int nvpair_value_boolean_value(nvpair_t *, boolean_t *);
+int nvpair_value_byte(nvpair_t *, uchar_t *);
+int nvpair_value_int8(nvpair_t *, int8_t *);
+int nvpair_value_uint8(nvpair_t *, uint8_t *);
+int nvpair_value_int16(nvpair_t *, int16_t *);
+int nvpair_value_uint16(nvpair_t *, uint16_t *);
+int nvpair_value_int32(nvpair_t *, int32_t *);
+int nvpair_value_uint32(nvpair_t *, uint32_t *);
+int nvpair_value_int64(nvpair_t *, int64_t *);
+int nvpair_value_uint64(nvpair_t *, uint64_t *);
+int nvpair_value_string(nvpair_t *, char **);
+int nvpair_value_nvlist(nvpair_t *, nvlist_t **);
+int nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *);
+int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *);
+int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *);
+int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *);
+int nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *);
+int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *);
+int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *);
+int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *);
+int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *);
+int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *);
+int nvpair_value_string_array(nvpair_t *, char ***, uint_t *);
+int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *);
+int nvpair_value_hrtime(nvpair_t *, hrtime_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_NVPAIR_H */
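The nvlist interfaces above are the general-purpose name-value list used throughout ZFS for configuration, properties and pool labels. A minimal usage sketch, not part of the import (the pair names are hypothetical and error handling is omitted):

    #include <sys/nvpair.h>

    static void
    nvlist_example(void)
    {
            nvlist_t *nvl;
            nvpair_t *nvp;
            uint64_t guid;

            /* unique names; third argument is the kmem flag (0 in userland) */
            (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);

            (void) nvlist_add_uint64(nvl, "guid", 0x1234ULL);
            (void) nvlist_add_string(nvl, "name", "tank");

            if (nvlist_lookup_uint64(nvl, "guid", &guid) == 0)
                    guid++;                 /* guid held 0x1234 */

            /* iterate over every pair in the list */
            for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL;
                nvp = nvlist_next_nvpair(nvl, nvp))
                    (void) nvpair_name(nvp);

            nvlist_free(nvl);
    }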
diff --git a/sys/contrib/opensolaris/uts/common/sys/nvpair_impl.h b/sys/contrib/opensolaris/uts/common/sys/nvpair_impl.h
new file mode 100644
index 000000000000..f12dbbfe6ef5
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/nvpair_impl.h
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _NVPAIR_IMPL_H
+#define _NVPAIR_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/nvpair.h>
+
+/*
+ * The structures here provided for information and debugging purposes only
+ * may be changed in the future.
+ */
+
+/*
+ * implementation linked list for pre-packed data
+ */
+typedef struct i_nvp i_nvp_t;
+
+struct i_nvp {
+ union {
+ uint64_t _nvi_align; /* ensure alignment */
+ struct {
+ i_nvp_t *_nvi_next; /* pointer to next nvpair */
+ i_nvp_t *_nvi_prev; /* pointer to prev nvpair */
+ } _nvi;
+ } _nvi_un;
+ nvpair_t nvi_nvp; /* nvpair */
+};
+#define nvi_next _nvi_un._nvi._nvi_next
+#define nvi_prev _nvi_un._nvi._nvi_prev
+
+typedef struct {
+ i_nvp_t *nvp_list; /* linked list of nvpairs */
+ i_nvp_t *nvp_last; /* last nvpair */
+ i_nvp_t *nvp_curr; /* current walker nvpair */
+ nv_alloc_t *nvp_nva; /* pluggable allocator */
+ uint32_t nvp_stat; /* internal state */
+} nvpriv_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NVPAIR_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/processor.h b/sys/contrib/opensolaris/uts/common/sys/processor.h
new file mode 100644
index 000000000000..063f7dacb27f
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/processor.h
@@ -0,0 +1,146 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
+ * All Rights Reserved
+ *
+ */
+
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_PROCESSOR_H
+#define _SYS_PROCESSOR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/procset.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Definitions for p_online, processor_info & lgrp system calls.
+ */
+
+/*
+ * Type for an lgrpid
+ */
+typedef uint16_t lgrpid_t;
+
+/*
+ * Type for processor name (CPU number).
+ */
+typedef int processorid_t;
+typedef int chipid_t;
+
+/*
+ * Flags and return values for p_online(2), and pi_state for processor_info(2).
+ * These flags are *not* for in-kernel examination of CPU states.
+ * See <sys/cpuvar.h> for appropriate informational functions.
+ */
+#define P_OFFLINE 0x0001 /* processor is offline, as quiet as possible */
+#define P_ONLINE 0x0002 /* processor is online */
+#define P_STATUS 0x0003 /* value passed to p_online to request status */
+#define P_FAULTED 0x0004 /* processor is offline, in faulted state */
+#define P_POWEROFF 0x0005 /* processor is powered off */
+#define P_NOINTR 0x0006 /* processor is online, but no I/O interrupts */
+#define P_SPARE 0x0007 /* processor is offline, can be reactivated */
+#define P_BAD P_FAULTED /* unused but defined by USL */
+#define P_FORCED 0x10000000 /* force processor offline */
+
+/*
+ * String names for processor states defined above.
+ */
+#define PS_OFFLINE "off-line"
+#define PS_ONLINE "on-line"
+#define PS_FAULTED "faulted"
+#define PS_POWEROFF "powered-off"
+#define PS_NOINTR "no-intr"
+#define PS_SPARE "spare"
+
+/*
+ * Structure filled in by processor_info(2).
+ *
+ * The string fields are guaranteed to be null-terminated.
+ *
+ * The pi_fputypes field contains a (possibly empty) comma-separated
+ * list of floating point identifier strings.
+ */
+#define PI_TYPELEN 16 /* max size of CPU type string */
+#define PI_FPUTYPE 32 /* max size of FPU types string */
+
+typedef struct {
+ int pi_state; /* processor state, see above */
+ char pi_processor_type[PI_TYPELEN]; /* ASCII CPU type */
+ char pi_fputypes[PI_FPUTYPE]; /* ASCII FPU types */
+ int pi_clock; /* CPU clock freq in MHz */
+} processor_info_t;
+
+/*
+ * Binding values for processor_bind(2)
+ */
+#define PBIND_NONE -1 /* LWP/thread is not bound */
+#define PBIND_QUERY -2 /* don't set, just return the binding */
+
+/*
+ * User-level system call interface prototypes
+ */
+#ifndef _KERNEL
+#ifdef __STDC__
+
+extern int p_online(processorid_t processorid, int flag);
+extern int processor_info(processorid_t processorid,
+ processor_info_t *infop);
+extern int processor_bind(idtype_t idtype, id_t id,
+ processorid_t processorid, processorid_t *obind);
+extern processorid_t getcpuid(void);
+extern lgrpid_t gethomelgroup(void);
+
+#else
+
+extern int p_online();
+extern int processor_info();
+extern int processor_bind();
+extern processorid_t getcpuid();
+extern lgrpid_t gethomelgroup();
+
+#endif /* __STDC__ */
+
+#else /* _KERNEL */
+
+/*
+ * Internal interface prototypes
+ */
+extern int p_online_internal(processorid_t, int, int *);
+
+#endif /* !_KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_PROCESSOR_H */
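A minimal userland sketch, not part of the import, of the p_online(2) and processor_info(2) interfaces declared above (the CPU id is hypothetical and error handling is omitted):

    #include <sys/processor.h>
    #include <stdio.h>

    int
    main(void)
    {
            processorid_t cpu = 0;          /* hypothetical CPU id */
            processor_info_t pi;
            int state;

            state = p_online(cpu, P_STATUS);        /* query only, no change */
            if (state == P_ONLINE && processor_info(cpu, &pi) == 0)
                    (void) printf("cpu%d: %s @ %d MHz\n", cpu,
                        pi.pi_processor_type, pi.pi_clock);
            return (0);
    }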
diff --git a/sys/contrib/opensolaris/uts/common/sys/procset.h b/sys/contrib/opensolaris/uts/common/sys/procset.h
new file mode 100644
index 000000000000..c3b58675746e
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/procset.h
@@ -0,0 +1,160 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#ifndef _SYS_PROCSET_H
+#define _SYS_PROCSET_H
+
+#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.6 */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/feature_tests.h>
+#include <sys/types.h>
+#include <sys/signal.h>
+
+/*
+ * This file defines the data needed to specify a set of
+ * processes. These types are used by the sigsend, sigsendset,
+ * priocntl, priocntlset, waitid, evexit, and evexitset system
+ * calls.
+ */
+#define P_INITPID 1
+#define P_INITUID 0
+#define P_INITPGID 0
+
+
+/*
+ * The following defines the values for an identifier type. It
+ * specifies the interpretation of an id value. An idtype and
+ * id together define a simple set of processes.
+ */
+typedef enum
+#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
+ idtype /* pollutes XPG4.2 namespace */
+#endif
+ {
+ P_PID, /* A process identifier. */
+ P_PPID, /* A parent process identifier. */
+ P_PGID, /* A process group (job control group) */
+ /* identifier. */
+ P_SID, /* A session identifier. */
+ P_CID, /* A scheduling class identifier. */
+ P_UID, /* A user identifier. */
+ P_GID, /* A group identifier. */
+ P_ALL, /* All processes. */
+ P_LWPID, /* An LWP identifier. */
+ P_TASKID, /* A task identifier. */
+ P_PROJID, /* A project identifier. */
+ P_POOLID, /* A pool identifier. */
+ P_ZONEID, /* A zone identifier. */
+ P_CTID, /* A (process) contract identifier. */
+ P_CPUID, /* CPU identifier. */
+ P_PSETID /* Processor set identifier */
+} idtype_t;
+
+
+/*
+ * The following defines the operations which can be performed to
+ * combine two simple sets of processes to form another set of
+ * processes.
+ */
+#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
+typedef enum idop {
+ POP_DIFF, /* Set difference. The processes which */
+ /* are in the left operand set and not */
+ /* in the right operand set. */
+	POP_AND,	/* Set intersection. The processes */
+			/* which are in both the left and right */
+			/* operand sets. */
+	POP_OR,		/* Set union. The processes */
+			/* which are in either the left or the */
+			/* right operand sets (or both). */
+ POP_XOR /* Set exclusive or. The processes */
+ /* which are in either the left or */
+ /* right operand sets but not in both. */
+} idop_t;
+
+
+/*
+ * The following structure is used to define a set of processes.
+ * The set is defined in terms of two simple sets of processes
+ * and an operator which operates on these two operand sets.
+ */
+typedef struct procset {
+	idop_t		p_op;		/* The operator connecting the */
+					/* following two operands, each */
+ /* of which is a simple set of */
+ /* processes. */
+
+ idtype_t p_lidtype;
+ /* The type of the left operand */
+ /* simple set. */
+ id_t p_lid; /* The id of the left operand. */
+
+ idtype_t p_ridtype;
+ /* The type of the right */
+ /* operand simple set. */
+ id_t p_rid; /* The id of the right operand. */
+} procset_t;
+
+/*
+ * The following macro can be used to initialize a procset_t
+ * structure.
+ */
+#define setprocset(psp, op, ltype, lid, rtype, rid) \
+ (psp)->p_op = (op); \
+ (psp)->p_lidtype = (ltype); \
+ (psp)->p_lid = (lid); \
+ (psp)->p_ridtype = (rtype); \
+ (psp)->p_rid = (rid);
+
+#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
+
+#ifdef _KERNEL
+
+struct proc;
+
+extern int dotoprocs(procset_t *, int (*)(), char *);
+extern int dotolwp(procset_t *, int (*)(), char *);
+extern int procinset(struct proc *, procset_t *);
+extern int sigsendproc(struct proc *, sigsend_t *);
+extern int sigsendset(procset_t *, sigsend_t *);
+extern boolean_t cur_inset_only(procset_t *);
+extern id_t getmyid(idtype_t);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_PROCSET_H */
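The setprocset() macro above simply fills in the five fields of a procset_t. A minimal sketch, not part of the import, building the common "exactly the process with this pid" set by intersecting a P_PID operand with P_ALL (the pid value is hypothetical):

    #include <sys/procset.h>

    static void
    procset_example(void)
    {
            procset_t ps;

            /* left operand: pid 1234; right operand: all processes */
            setprocset(&ps, POP_AND, P_PID, (id_t)1234, P_ALL, (id_t)0);
            /* ps can now be handed to interfaces such as priocntlset(2) */
    }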
diff --git a/sys/contrib/opensolaris/uts/common/sys/sdt.h b/sys/contrib/opensolaris/uts/common/sys/sdt.h
new file mode 100644
index 000000000000..da695c9203c7
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/sdt.h
@@ -0,0 +1,176 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SDT_H
+#define _SYS_SDT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _KERNEL
+
+#define DTRACE_PROBE(provider, name) { \
+ extern void __dtrace_##provider##___##name(void); \
+ __dtrace_##provider##___##name(); \
+}
+
+#define DTRACE_PROBE1(provider, name, arg1) { \
+ extern void __dtrace_##provider##___##name(unsigned long); \
+ __dtrace_##provider##___##name((unsigned long)arg1); \
+}
+
+#define DTRACE_PROBE2(provider, name, arg1, arg2) { \
+ extern void __dtrace_##provider##___##name(unsigned long, \
+ unsigned long); \
+ __dtrace_##provider##___##name((unsigned long)arg1, \
+ (unsigned long)arg2); \
+}
+
+#define DTRACE_PROBE3(provider, name, arg1, arg2, arg3) { \
+ extern void __dtrace_##provider##___##name(unsigned long, \
+ unsigned long, unsigned long); \
+ __dtrace_##provider##___##name((unsigned long)arg1, \
+ (unsigned long)arg2, (unsigned long)arg3); \
+}
+
+#define DTRACE_PROBE4(provider, name, arg1, arg2, arg3, arg4) { \
+ extern void __dtrace_##provider##___##name(unsigned long, \
+ unsigned long, unsigned long, unsigned long); \
+ __dtrace_##provider##___##name((unsigned long)arg1, \
+ (unsigned long)arg2, (unsigned long)arg3, \
+ (unsigned long)arg4); \
+}
+
+#define DTRACE_PROBE5(provider, name, arg1, arg2, arg3, arg4, arg5) { \
+ extern void __dtrace_##provider##___##name(unsigned long, \
+ unsigned long, unsigned long, unsigned long, unsigned long);\
+ __dtrace_##provider##___##name((unsigned long)arg1, \
+ (unsigned long)arg2, (unsigned long)arg3, \
+ (unsigned long)arg4, (unsigned long)arg5); \
+}
+
+#else /* _KERNEL */
+
+#define DTRACE_PROBE(name) { \
+ extern void __dtrace_probe_##name(void); \
+ __dtrace_probe_##name(); \
+}
+
+#define DTRACE_PROBE1(name, type1, arg1) { \
+ extern void __dtrace_probe_##name(uintptr_t); \
+ __dtrace_probe_##name((uintptr_t)(arg1)); \
+}
+
+#define DTRACE_PROBE2(name, type1, arg1, type2, arg2) { \
+ extern void __dtrace_probe_##name(uintptr_t, uintptr_t); \
+ __dtrace_probe_##name((uintptr_t)(arg1), (uintptr_t)(arg2)); \
+}
+
+#define DTRACE_PROBE3(name, type1, arg1, type2, arg2, type3, arg3) { \
+ extern void __dtrace_probe_##name(uintptr_t, uintptr_t, uintptr_t); \
+ __dtrace_probe_##name((uintptr_t)(arg1), (uintptr_t)(arg2), \
+ (uintptr_t)(arg3)); \
+}
+
+#define DTRACE_PROBE4(name, type1, arg1, type2, arg2, \
+ type3, arg3, type4, arg4) { \
+ extern void __dtrace_probe_##name(uintptr_t, uintptr_t, \
+ uintptr_t, uintptr_t); \
+ __dtrace_probe_##name((uintptr_t)(arg1), (uintptr_t)(arg2), \
+ (uintptr_t)(arg3), (uintptr_t)(arg4)); \
+}
+
+#define DTRACE_SCHED(name) \
+ DTRACE_PROBE(__sched_##name);
+
+#define DTRACE_SCHED1(name, type1, arg1) \
+ DTRACE_PROBE1(__sched_##name, type1, arg1);
+
+#define DTRACE_SCHED2(name, type1, arg1, type2, arg2) \
+ DTRACE_PROBE2(__sched_##name, type1, arg1, type2, arg2);
+
+#define DTRACE_SCHED3(name, type1, arg1, type2, arg2, type3, arg3) \
+ DTRACE_PROBE3(__sched_##name, type1, arg1, type2, arg2, type3, arg3);
+
+#define DTRACE_SCHED4(name, type1, arg1, type2, arg2, \
+ type3, arg3, type4, arg4) \
+ DTRACE_PROBE4(__sched_##name, type1, arg1, type2, arg2, \
+ type3, arg3, type4, arg4);
+
+#define DTRACE_PROC(name) \
+ DTRACE_PROBE(__proc_##name);
+
+#define DTRACE_PROC1(name, type1, arg1) \
+ DTRACE_PROBE1(__proc_##name, type1, arg1);
+
+#define DTRACE_PROC2(name, type1, arg1, type2, arg2) \
+ DTRACE_PROBE2(__proc_##name, type1, arg1, type2, arg2);
+
+#define DTRACE_PROC3(name, type1, arg1, type2, arg2, type3, arg3) \
+ DTRACE_PROBE3(__proc_##name, type1, arg1, type2, arg2, type3, arg3);
+
+#define DTRACE_PROC4(name, type1, arg1, type2, arg2, \
+ type3, arg3, type4, arg4) \
+ DTRACE_PROBE4(__proc_##name, type1, arg1, type2, arg2, \
+ type3, arg3, type4, arg4);
+
+#define DTRACE_IO(name) \
+ DTRACE_PROBE(__io_##name);
+
+#define DTRACE_IO1(name, type1, arg1) \
+ DTRACE_PROBE1(__io_##name, type1, arg1);
+
+#define DTRACE_IO2(name, type1, arg1, type2, arg2) \
+ DTRACE_PROBE2(__io_##name, type1, arg1, type2, arg2);
+
+#define DTRACE_IO3(name, type1, arg1, type2, arg2, type3, arg3) \
+ DTRACE_PROBE3(__io_##name, type1, arg1, type2, arg2, type3, arg3);
+
+#define DTRACE_IO4(name, type1, arg1, type2, arg2, \
+ type3, arg3, type4, arg4) \
+ DTRACE_PROBE4(__io_##name, type1, arg1, type2, arg2, \
+ type3, arg3, type4, arg4);
+
+#define DTRACE_SYSEVENT2(name, type1, arg1, type2, arg2) \
+ DTRACE_PROBE2(__sysevent_##name, type1, arg1, type2, arg2);
+
+#endif /* _KERNEL */
+
+extern const char *sdt_prefix;
+
+typedef struct sdt_probedesc {
+ char *sdpd_name; /* name of this probe */
+ unsigned long sdpd_offset; /* offset of call in text */
+ struct sdt_probedesc *sdpd_next; /* next static probe */
+} sdt_probedesc_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SDT_H */
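In the kernel, the DTRACE_PROBE* macros above expand to a call to an undefined function named __dtrace_probe_<name>, which the SDT machinery later resolves into a real probe site; when nothing is tracing, the call is essentially free. A minimal kernel-side sketch, not part of the import (the probe name and arguments are hypothetical; double underscores in the name conventionally map to dashes in the final probe name):

    #include <sys/sdt.h>

    static void
    foo_state_change(void *fp, int newstate)
    {
            /* emits an SDT probe carrying two arguments */
            DTRACE_PROBE2(foo__state__change, void *, fp, int, newstate);
    }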
diff --git a/sys/contrib/opensolaris/uts/common/sys/synch.h b/sys/contrib/opensolaris/uts/common/sys/synch.h
new file mode 100644
index 000000000000..8f52d720cb45
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/synch.h
@@ -0,0 +1,161 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SYNCH_H
+#define _SYS_SYNCH_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifndef _ASM
+#include <sys/types.h>
+#include <sys/int_types.h>
+#endif /* _ASM */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _ASM
+/*
+ * Thread and LWP mutexes have the same type
+ * definitions.
+ *
+ * NOTE:
+ *
+ * POSIX requires that <pthread.h> define the structures pthread_mutex_t
+ * and pthread_cond_t. Although these structures are identical to mutex_t
+ * (lwp_mutex_t) and cond_t (lwp_cond_t), defined here, a typedef of these
+ * types would require including <synch.h> in <pthread.h>, pulling in
+ * non-posix symbols/constants, violating POSIX namespace restrictions. Hence,
+ * pthread_mutex_t/pthread_cond_t have been redefined (in <sys/types.h>).
+ * Any modifications done to mutex_t/lwp_mutex_t or cond_t/lwp_cond_t must
+ * also be done to pthread_mutex_t/pthread_cond_t.
+ */
+typedef struct _lwp_mutex {
+ struct {
+ uint16_t flag1;
+ uint8_t flag2;
+ uint8_t ceiling;
+ union {
+ uint16_t bcptype;
+ struct {
+ uint8_t count_type1;
+ uint8_t count_type2;
+ } mtype_rcount;
+ } mbcp_type_un;
+ uint16_t magic;
+ } flags;
+ union {
+ struct {
+ uint8_t pad[8];
+ } lock64;
+ struct {
+ uint32_t ownerpid;
+ uint32_t lockword;
+ } lock32;
+ upad64_t owner64;
+ } lock;
+ upad64_t data;
+} lwp_mutex_t;
+
+/*
+ * Thread and LWP condition variables have the same
+ * type definition.
+ * NOTE:
+ * The layout of the following structure should be kept in sync with the
+ * layout of pthread_cond_t in sys/types.h. See NOTE above for lwp_mutex_t.
+ */
+typedef struct _lwp_cond {
+ struct {
+ uint8_t flag[4];
+ uint16_t type;
+ uint16_t magic;
+ } flags;
+ upad64_t data;
+} lwp_cond_t;
+
+/*
+ * LWP semaphores
+ */
+typedef struct _lwp_sema {
+ uint32_t count; /* semaphore count */
+ uint16_t type;
+ uint16_t magic;
+ uint8_t flags[8]; /* last byte reserved for waiters */
+ upad64_t data; /* optional data */
+} lwp_sema_t;
+
+/*
+ * Thread and LWP rwlocks have the same type definition.
+ * NOTE: The layout of this structure should be kept in sync with the layout
+ * of the corresponding structure of pthread_rwlock_t in sys/types.h.
+ * Also, because we have to deal with C++, there is an identical structure
+ * for rwlock_t in head/synch.h that we cannot change.
+ */
+typedef struct _lwp_rwlock {
+ int32_t readers; /* -1 == writer else # of readers */
+ uint16_t type;
+ uint16_t magic;
+ lwp_mutex_t mutex; /* used to indicate ownership */
+ lwp_cond_t readercv; /* unused */
+ lwp_cond_t writercv; /* unused */
+} lwp_rwlock_t;
+
+#endif /* _ASM */
+/*
+ * Definitions of synchronization types.
+ */
+#define USYNC_THREAD 0x00 /* private to a process */
+#define USYNC_PROCESS 0x01 /* shared by processes */
+
+/* Keep the following 3 fields in sync with pthread.h */
+#define LOCK_NORMAL 0x00 /* same as USYNC_THREAD */
+#define LOCK_ERRORCHECK 0x02 /* error check lock */
+#define LOCK_RECURSIVE 0x04 /* recursive lock */
+
+#define USYNC_PROCESS_ROBUST 0x08 /* shared by processes robustly */
+
+/* Keep the following 5 fields in sync with pthread.h */
+
+#define LOCK_PRIO_NONE 0x00
+#define LOCK_PRIO_INHERIT 0x10
+#define LOCK_PRIO_PROTECT 0x20
+#define LOCK_STALL_NP 0x00
+#define LOCK_ROBUST_NP 0x40
+
+/*
+ * lwp_mutex_t flags
+ */
+#define LOCK_OWNERDEAD 0x1
+#define LOCK_NOTRECOVERABLE 0x2
+#define LOCK_INITED 0x4
+#define LOCK_UNMAPPED 0x8
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYNCH_H */
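The notes above require lwp_mutex_t and lwp_cond_t to stay layout-identical to pthread_mutex_t and pthread_cond_t as redefined in <sys/types.h>. A minimal sketch, not part of the import, of a compile-time size check that would catch an accidental divergence (the typedef names are hypothetical; the array size becomes negative, and compilation fails, if the sizes ever differ):

    #include <sys/types.h>
    #include <sys/synch.h>

    typedef char check_lwp_mutex_size
            [sizeof (lwp_mutex_t) == sizeof (pthread_mutex_t) ? 1 : -1];
    typedef char check_lwp_cond_size
            [sizeof (lwp_cond_t) == sizeof (pthread_cond_t) ? 1 : -1];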
diff --git a/sys/contrib/opensolaris/uts/common/sys/sysevent.h b/sys/contrib/opensolaris/uts/common/sys/sysevent.h
new file mode 100644
index 000000000000..0a61e41de849
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/sysevent.h
@@ -0,0 +1,227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SYSEVENT_H
+#define _SYS_SYSEVENT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/nvpair.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef NULL
+#if defined(_LP64) && !defined(__cplusplus)
+#define NULL 0L
+#else
+#define NULL 0
+#endif
+#endif
+
+/* Internal registration class and subclass */
+#define EC_ALL "register_all_classes"
+#define EC_SUB_ALL "register_all_subclasses"
+
+/*
+ * Event allocation/enqueuing sleep/nosleep flags
+ */
+#define SE_SLEEP 0
+#define SE_NOSLEEP 1
+
+/* Framework error codes */
+#define SE_EINVAL 1 /* Invalid argument */
+#define SE_ENOMEM 2 /* Unable to allocate memory */
+#define SE_EQSIZE 3 /* Maximum event q size exceeded */
+#define SE_EFAULT 4 /* Copy fault */
+#define SE_NOTFOUND 5 /* Attribute not found */
+#define SE_NO_TRANSPORT 6 /* sysevent transport down */
+
+/* Internal data types */
+
+#define SE_DATA_TYPE_BYTE DATA_TYPE_BYTE
+#define SE_DATA_TYPE_INT16 DATA_TYPE_INT16
+#define SE_DATA_TYPE_UINT16 DATA_TYPE_UINT16
+#define SE_DATA_TYPE_INT32 DATA_TYPE_INT32
+#define SE_DATA_TYPE_UINT32 DATA_TYPE_UINT32
+#define SE_DATA_TYPE_INT64 DATA_TYPE_INT64
+#define SE_DATA_TYPE_UINT64 DATA_TYPE_UINT64
+#define SE_DATA_TYPE_STRING DATA_TYPE_STRING
+#define SE_DATA_TYPE_BYTES DATA_TYPE_BYTE_ARRAY
+#define SE_DATA_TYPE_TIME DATA_TYPE_HRTIME
+
+#define SE_KERN_PID 0
+
+#define SUNW_VENDOR "SUNW"
+#define SE_USR_PUB "usr:"
+#define SE_KERN_PUB "kern:"
+#define SUNW_KERN_PUB SUNW_VENDOR":"SE_KERN_PUB
+#define SUNW_USR_PUB SUNW_VENDOR":"SE_USR_PUB
+
+/*
+ * Event header and attribute value limits
+ */
+#define MAX_ATTR_NAME 1024
+#define MAX_STRING_SZ 1024
+#define MAX_BYTE_ARRAY 1024
+
+#define MAX_CLASS_LEN 64
+#define MAX_SUBCLASS_LEN 64
+#define MAX_PUB_LEN 128
+#define MAX_CHNAME_LEN 128
+#define MAX_SUBID_LEN 16
+
+/*
+ * Limit for the event payload size
+ */
+#define MAX_EV_SIZE_LEN (SHRT_MAX/4)
+
+/* Opaque sysevent_t data type */
+typedef void *sysevent_t;
+
+/* Opaque channel bind data type */
+typedef void evchan_t;
+
+/* sysevent attribute list */
+typedef nvlist_t sysevent_attr_list_t;
+
+/* sysevent attribute name-value pair */
+typedef nvpair_t sysevent_attr_t;
+
+/* Unique event identifier */
+typedef struct sysevent_id {
+ uint64_t eid_seq;
+ hrtime_t eid_ts;
+} sysevent_id_t;
+
+/* Event attribute value structures */
+typedef struct sysevent_bytes {
+ int32_t size;
+ uchar_t *data;
+} sysevent_bytes_t;
+
+typedef struct sysevent_value {
+ int32_t value_type; /* data type */
+ union {
+ uchar_t sv_byte;
+ int16_t sv_int16;
+ uint16_t sv_uint16;
+ int32_t sv_int32;
+ uint32_t sv_uint32;
+ int64_t sv_int64;
+ uint64_t sv_uint64;
+ hrtime_t sv_time;
+ char *sv_string;
+ sysevent_bytes_t sv_bytes;
+ } value;
+} sysevent_value_t;
+
+/*
+ * The following flags determine the memory allocation semantics to use for
+ * kernel event buffer allocation by userland and kernel versions of
+ * sysevent_evc_publish().
+ *
+ * EVCH_SLEEP and EVCH_NOSLEEP respectively map to KM_SLEEP and KM_NOSLEEP.
+ * EVCH_TRYHARD is a kernel-only publish flag that allows event allocation
+ * routines to use alternate kmem caches in situations where free memory
+ * may be low. Kernel callers of sysevent_evc_publish() must set flags to
+ * one of EVCH_SLEEP, EVCH_NOSLEEP or EVCH_TRYHARD. Userland callers of
+ * sysevent_evc_publish() must set flags to one of EVCH_SLEEP or EVCH_NOSLEEP.
+ *
+ * EVCH_QWAIT determines whether or not we should wait for slots in the event
+ * queue at publication time. EVCH_QWAIT may be used by kernel and userland
+ * publishers and must be used in conjunction with any one of EVCH_SLEEP,
+ * EVCH_NOSLEEP or EVCH_TRYHARD (kernel-only).
+ */
+
+#define EVCH_NOSLEEP 0x0001 /* No sleep on kmem_alloc() */
+#define EVCH_SLEEP 0x0002 /* Sleep on kmem_alloc() */
+#define EVCH_TRYHARD 0x0004 /* May use alternate kmem cache for alloc */
+#define EVCH_QWAIT 0x0008 /* Wait for slot in event queue */
+
+/*
+ * Meaning of flags for subscribe/unsubscribe. Bits 0 to 7 are dedicated to
+ * the consolidation private interface.
+ */
+#define EVCH_SUB_KEEP 0x0001
+#define EVCH_ALLSUB "all_subs"
+
+/*
+ * Meaning of flags parameter of channel bind function
+ */
+#define EVCH_CREAT 0x0001 /* Create a channel if not present */
+#define EVCH_HOLD_PEND 0x0002
+#define EVCH_B_FLAGS 0x0003 /* All valid bits */
+
+/*
+ * Meaning of commands of evc_control function
+ */
+#define EVCH_GET_CHAN_LEN_MAX 1 /* Get event queue length limit */
+#define EVCH_GET_CHAN_LEN 2 /* Get event queue length */
+#define EVCH_SET_CHAN_LEN 3 /* Set event queue length */
+#define EVCH_CMD_LAST EVCH_SET_CHAN_LEN /* Last command */
+
+/*
+ * Event channel interface definitions
+ */
+int sysevent_evc_bind(const char *, evchan_t **, uint32_t);
+void sysevent_evc_unbind(evchan_t *);
+int sysevent_evc_subscribe(evchan_t *, const char *, const char *,
+ int (*)(sysevent_t *, void *), void *, uint32_t);
+void sysevent_evc_unsubscribe(evchan_t *, const char *);
+int sysevent_evc_publish(evchan_t *, const char *, const char *,
+ const char *, const char *, nvlist_t *, uint32_t);
+int sysevent_evc_control(evchan_t *, int, ...);
+
+#ifdef _KERNEL
+
+/*
+ * Kernel log_event interfaces.
+ */
+int log_sysevent(sysevent_t *, int, sysevent_id_t *);
+
+sysevent_t *sysevent_alloc(char *, char *, char *, int);
+void sysevent_free(sysevent_t *);
+int sysevent_add_attr(sysevent_attr_list_t **, char *, sysevent_value_t *, int);
+void sysevent_free_attr(sysevent_attr_list_t *);
+int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *);
+void sysevent_detach_attributes(sysevent_t *);
+char *sysevent_get_class_name(sysevent_t *);
+char *sysevent_get_subclass_name(sysevent_t *);
+uint64_t sysevent_get_seq(sysevent_t *);
+void sysevent_get_time(sysevent_t *, hrtime_t *);
+size_t sysevent_get_size(sysevent_t *);
+char *sysevent_get_pub(sysevent_t *);
+int sysevent_get_attr_list(sysevent_t *, nvlist_t **);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYSEVENT_H */
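A minimal kernel-side sketch, not part of the import, of the event-channel interfaces declared above: bind (creating the channel if needed), publish one event carrying an nvlist payload, and unbind. The channel, class, subclass and publisher names are hypothetical; error handling and payload-ownership details are omitted:

    #include <sys/sysevent.h>
    #include <sys/nvpair.h>

    static void
    sysevent_example(void)
    {
            evchan_t *ch;
            nvlist_t *attr;

            if (sysevent_evc_bind("com.example:channel", &ch, EVCH_CREAT) != 0)
                    return;

            (void) nvlist_alloc(&attr, NV_UNIQUE_NAME, 0);
            (void) nvlist_add_uint32(attr, "reason", 1);

            /* class, subclass, vendor, publisher, attributes, alloc flags */
            (void) sysevent_evc_publish(ch, "EC_example", "ESC_example_change",
                SUNW_VENDOR, "example", attr, EVCH_SLEEP);

            nvlist_free(attr);
            sysevent_evc_unbind(ch);
    }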
diff --git a/sys/contrib/opensolaris/uts/common/sys/sysmacros.h b/sys/contrib/opensolaris/uts/common/sys/sysmacros.h
new file mode 100644
index 000000000000..d77208488cea
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/sysmacros.h
@@ -0,0 +1,287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SYSMACROS_H
+#define _SYS_SYSMACROS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Some macros for units conversion
+ */
+/*
+ * Disk blocks (sectors) and bytes.
+ */
+#define dtob(DD) ((DD) << DEV_BSHIFT)
+#define btod(BB) (((BB) + DEV_BSIZE - 1) >> DEV_BSHIFT)
+#define btodt(BB) ((BB) >> DEV_BSHIFT)
+#define lbtod(BB) (((offset_t)(BB) + DEV_BSIZE - 1) >> DEV_BSHIFT)
+
+/* common macros */
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef MAX
+#define MAX(a, b) ((a) < (b) ? (b) : (a))
+#endif
+#ifndef ABS
+#define ABS(a) ((a) < 0 ? -(a) : (a))
+#endif
+
+#ifdef _KERNEL
+
+/*
+ * Convert a single byte to/from binary-coded decimal (BCD).
+ */
+extern unsigned char byte_to_bcd[256];
+extern unsigned char bcd_to_byte[256];
+
+#define BYTE_TO_BCD(x) byte_to_bcd[(x) & 0xff]
+#define BCD_TO_BYTE(x) bcd_to_byte[(x) & 0xff]
+
+#endif /* _KERNEL */
+
+/*
+ * WARNING: The device number macros defined here should not be used by device
+ * drivers or user software. Device drivers should use the device functions
+ * defined in the DDI/DKI interface (see also ddi.h). Application software
+ * should make use of the library routines available in makedev(3). A set of
+ * new device macros is provided to operate on the expanded device number
+ * format supported in SVR4. Macro versions of the DDI device functions are
+ * provided for use by kernel proper routines only. Macro routines bmajor(),
+ * major(), minor(), emajor(), eminor(), and makedev() will be removed or
+ * their definitions changed at the next major release following SVR4.
+ */
+
+#define O_BITSMAJOR 7 /* # of SVR3 major device bits */
+#define O_BITSMINOR 8 /* # of SVR3 minor device bits */
+#define O_MAXMAJ 0x7f /* SVR3 max major value */
+#define O_MAXMIN 0xff /* SVR3 max minor value */
+
+
+#define L_BITSMAJOR32 14 /* # of SVR4 major device bits */
+#define L_BITSMINOR32 18 /* # of SVR4 minor device bits */
+#define L_MAXMAJ32 0x3fff /* SVR4 max major value */
+#define L_MAXMIN32 0x3ffff /* MAX minor for 3b2 software drivers. */
+ /* For 3b2 hardware devices the minor is */
+ /* restricted to 256 (0-255) */
+
+#ifdef _LP64
+#define L_BITSMAJOR 32 /* # of major device bits in 64-bit Solaris */
+#define L_BITSMINOR 32 /* # of minor device bits in 64-bit Solaris */
+#define L_MAXMAJ 0xfffffffful /* max major value */
+#define L_MAXMIN 0xfffffffful /* max minor value */
+#else
+#define L_BITSMAJOR L_BITSMAJOR32
+#define L_BITSMINOR L_BITSMINOR32
+#define L_MAXMAJ L_MAXMAJ32
+#define L_MAXMIN L_MAXMIN32
+#endif
+
+#ifdef _KERNEL
+
+/* major part of a device internal to the kernel */
+
+#define major(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ)
+#define bmajor(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ)
+
+/* get internal major part of expanded device number */
+
+#define getmajor(x) (major_t)((((dev_t)(x)) >> L_BITSMINOR) & L_MAXMAJ)
+
+/* minor part of a device internal to the kernel */
+
+#define minor(x) (minor_t)((x) & O_MAXMIN)
+
+/* get internal minor part of expanded device number */
+
+#define getminor(x) (minor_t)((x) & L_MAXMIN)
+
+#else
+
+/* major part of a device external from the kernel (same as emajor below) */
+
+#define major(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ)
+
+/* minor part of a device external from the kernel (same as eminor below) */
+
+#define minor(x) (minor_t)((x) & O_MAXMIN)
+
+#endif /* _KERNEL */
+
+/* create old device number */
+
+#define makedev(x, y) (unsigned short)(((x) << O_BITSMINOR) | ((y) & O_MAXMIN))
+
+/* make a new device number */
+
+#define makedevice(x, y) (dev_t)(((dev_t)(x) << L_BITSMINOR) | ((y) & L_MAXMIN))
+
+
+/*
+ * emajor() allows kernel/driver code to print external major numbers
+ * eminor() allows kernel/driver code to print external minor numbers
+ */
+
+#define emajor(x) \
+ (major_t)(((unsigned int)(x) >> O_BITSMINOR) > O_MAXMAJ) ? \
+ NODEV : (((unsigned int)(x) >> O_BITSMINOR) & O_MAXMAJ)
+
+#define eminor(x) \
+ (minor_t)((x) & O_MAXMIN)
+
+/*
+ * get external major and minor device
+ * components from expanded device number
+ */
+#define getemajor(x) (major_t)((((dev_t)(x) >> L_BITSMINOR) > L_MAXMAJ) ? \
+ NODEV : (((dev_t)(x) >> L_BITSMINOR) & L_MAXMAJ))
+#define geteminor(x) (minor_t)((x) & L_MAXMIN)
+
+/*
+ * These are versions of the kernel routines for compressing and
+ * expanding long device numbers that don't return errors.
+ */
+#if (L_BITSMAJOR32 == L_BITSMAJOR) && (L_BITSMINOR32 == L_BITSMINOR)
+
+#define DEVCMPL(x) (x)
+#define DEVEXPL(x) (x)
+
+#else
+
+#define DEVCMPL(x) \
+ (dev32_t)((((x) >> L_BITSMINOR) > L_MAXMAJ32 || \
+ ((x) & L_MAXMIN) > L_MAXMIN32) ? NODEV32 : \
+ ((((x) >> L_BITSMINOR) << L_BITSMINOR32) | ((x) & L_MAXMIN32)))
+
+#define DEVEXPL(x) \
+ (((x) == NODEV32) ? NODEV : \
+ makedevice(((x) >> L_BITSMINOR32) & L_MAXMAJ32, (x) & L_MAXMIN32))
+
+#endif /* L_BITSMAJOR32 ... */
+
+/* convert to old (SVR3.2) dev format */
+
+#define cmpdev(x) \
+ (o_dev_t)((((x) >> L_BITSMINOR) > O_MAXMAJ || \
+ ((x) & L_MAXMIN) > O_MAXMIN) ? NODEV : \
+ ((((x) >> L_BITSMINOR) << O_BITSMINOR) | ((x) & O_MAXMIN)))
+
+/* convert to new (SVR4) dev format */
+
+#define expdev(x) \
+ (dev_t)(((dev_t)(((x) >> O_BITSMINOR) & O_MAXMAJ) << L_BITSMINOR) | \
+ ((x) & O_MAXMIN))
+
+/*
+ * Macro for checking power of 2 address alignment.
+ */
+#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
+
+/*
+ * Macros for counting and rounding.
+ */
+#define howmany(x, y) (((x)+((y)-1))/(y))
+#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
+
+/*
+ * Macro to determine if value is a power of 2
+ */
+#define ISP2(x) (((x) & ((x) - 1)) == 0)
+
+/*
+ * Macros for various sorts of alignment and rounding when the alignment
+ * is known to be a power of 2.
+ */
+#define P2ALIGN(x, align) ((x) & -(align))
+#define P2PHASE(x, align) ((x) & ((align) - 1))
+#define P2NPHASE(x, align) (-(x) & ((align) - 1))
+#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
+#define P2END(x, align) (-(~(x) & -(align)))
+#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align)))
+#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1)
+/*
+ * Determine whether two numbers have the same high-order bit.
+ */
+#define P2SAMEHIGHBIT(x, y) (((x) ^ (y)) < ((x) & (y)))
+
+/*
+ * Typed version of the P2* macros. These macros should be used to ensure
+ * that the result is correctly calculated based on the data type of (x),
+ * which is passed in as the last argument, regardless of the data
+ * type of the alignment. For example, if (x) is of type uint64_t,
+ * and we want to round it up to a page boundary using "PAGESIZE" as
+ * the alignment, we can do either
+ * P2ROUNDUP(x, (uint64_t)PAGESIZE)
+ * or
+ * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t)
+ */
+#define P2ALIGN_TYPED(x, align, type) \
+ ((type)(x) & -(type)(align))
+#define P2PHASE_TYPED(x, align, type) \
+ ((type)(x) & ((type)(align) - 1))
+#define P2NPHASE_TYPED(x, align, type) \
+ (-(type)(x) & ((type)(align) - 1))
+#define P2ROUNDUP_TYPED(x, align, type) \
+ (-(-(type)(x) & -(type)(align)))
+#define P2END_TYPED(x, align, type) \
+ (-(~(type)(x) & -(type)(align)))
+#define P2PHASEUP_TYPED(x, align, phase, type) \
+ ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align)))
+#define P2CROSS_TYPED(x, y, align, type) \
+ (((type)(x) ^ (type)(y)) > (type)(align) - 1)
+#define P2SAMEHIGHBIT_TYPED(x, y, type) \
+ (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y)))
+
+/*
+ * Macros to atomically increment/decrement a variable. mutex and var
+ * must be pointers.
+ */
+#define INCR_COUNT(var, mutex) mutex_enter(mutex), (*(var))++, mutex_exit(mutex)
+#define DECR_COUNT(var, mutex) mutex_enter(mutex), (*(var))--, mutex_exit(mutex)
+
+#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof)
+
+/* avoid any possibility of clashing with <stddef.h> version */
+
+#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYSMACROS_H */
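A short worked example, not part of the import, of the power-of-2 macros above, taking x = 0x1234 and an alignment of 0x1000:

    P2ALIGN(0x1234, 0x1000)   == 0x1000     /* round down to the boundary */
    P2PHASE(0x1234, 0x1000)   == 0x0234     /* offset within the unit */
    P2NPHASE(0x1234, 0x1000)  == 0x0dcc     /* bytes up to the next boundary */
    P2ROUNDUP(0x1234, 0x1000) == 0x2000     /* round up to the boundary */
    ISP2(0x1000)              == 1          /* 0x1000 is a power of 2 */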
diff --git a/sys/contrib/opensolaris/uts/common/sys/vfs.h b/sys/contrib/opensolaris/uts/common/sys/vfs.h
new file mode 100644
index 000000000000..0834cf1f8453
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/vfs.h
@@ -0,0 +1,569 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#ifndef _SYS_VFS_H
+#define _SYS_VFS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/cred.h>
+#include <sys/vnode.h>
+#include <sys/statvfs.h>
+#include <sys/refstr.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Data associated with mounted file systems.
+ */
+
+/*
+ * Operations vector.  This is used internally by the kernel; file systems
+ * supply their list of operations via vfs_setfsops().
+ */
+
+typedef struct vfsops vfsops_t;
+
+/*
+ * File system identifier. Should be unique (at least per machine).
+ */
+typedef struct {
+ int val[2]; /* file system id type */
+} fsid_t;
+
+/*
+ * File identifier. Should be unique per filesystem on a single
+ * machine.  This is typically used by a stateless file server
+ * to generate "file handles".
+ *
+ * Do not change the definition of struct fid ... fid_t without
+ * letting the CacheFS group know about it! They will have to do at
+ * least two things, in the same change that changes this structure:
+ * 1. change CFSVERSION in usr/src/uts/common/sys/fs/cachefs_fs.h
+ * 2. put the old version # in the canupgrade array
+ * in cachfs_upgrade() in usr/src/cmd/fs.d/cachefs/fsck/fsck.c
+ * This is necessary because CacheFS stores FIDs on disk.
+ *
+ * Many underlying file systems cast a struct fid into other
+ * file system dependent structures which may require 4 byte alignment.
+ * Because a fid starts with a short it may not be 4-byte aligned;
+ * fid_pad forces the alignment.
+ */
+#define MAXFIDSZ 64
+#define OLD_MAXFIDSZ 16
+
+typedef struct fid {
+ union {
+ long fid_pad;
+ struct {
+ ushort_t len; /* length of data in bytes */
+ char data[MAXFIDSZ]; /* data (variable len) */
+ } _fid;
+ } un;
+} fid_t;
+
+#ifdef _SYSCALL32
+/*
+ * Solaris 64 - use old-style cache format with 32-bit aligned fid for on-disk
+ * struct compatibility.
+ */
+typedef struct fid32 {
+ union {
+ int32_t fid_pad;
+ struct {
+ uint16_t len; /* length of data in bytes */
+ char data[MAXFIDSZ]; /* data (variable len) */
+ } _fid;
+ } un;
+} fid32_t;
+#else /* not _SYSCALL32 */
+#define fid32 fid
+typedef fid_t fid32_t;
+#endif /* _SYSCALL32 */
+
+#define fid_len un._fid.len
+#define fid_data un._fid.data
+
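/*
 * A sketch of how a file system might pack a 64-bit object number into a
 * fid_t so a "file handle" can later be turned back into a vnode via
 * VFS_VGET().  example_fill_fid and its object argument are hypothetical,
 * and bcopy() is assumed to be the kernel copy routine from <sys/systm.h>.
 */
static void
example_fill_fid(uint64_t object, fid_t *fidp)
{
	fidp->fid_len = sizeof (object);	/* well under MAXFIDSZ */
	bcopy(&object, fidp->fid_data, sizeof (object));
}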
+/*
+ * Structure defining a mount option for a filesystem.
+ * option names are found in mntent.h
+ */
+typedef struct mntopt {
+ char *mo_name; /* option name */
+ char **mo_cancel; /* list of options cancelled by this one */
+ char *mo_arg; /* argument string for this option */
+ int mo_flags; /* flags for this mount option */
+ void *mo_data; /* filesystem specific data */
+} mntopt_t;
+
+/*
+ * Flags that apply to mount options
+ */
+
+#define MO_SET 0x01 /* option is set */
+#define MO_NODISPLAY 0x02 /* option not listed in mnttab */
+#define MO_HASVALUE 0x04 /* option takes a value */
+#define MO_IGNORE 0x08 /* option ignored by parser */
+#define MO_DEFAULT MO_SET /* option is on by default */
+#define MO_TAG 0x10 /* flags a tag set by user program */
+#define MO_EMPTY 0x20 /* empty space in option table */
+
+#define VFS_NOFORCEOPT 0x01 /* honor MO_IGNORE (don't set option) */
+#define VFS_DISPLAY 0x02 /* Turn off MO_NODISPLAY bit for opt */
+#define VFS_NODISPLAY 0x04 /* Turn on MO_NODISPLAY bit for opt */
+#define VFS_CREATEOPT 0x08 /* Create the opt if it's not there */
+
+/*
+ * Structure holding mount option strings for the mounted file system.
+ */
+typedef struct mntopts {
+ uint_t mo_count; /* number of entries in table */
+ mntopt_t *mo_list; /* list of mount options */
+} mntopts_t;
+
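/*
 * A sketch of a mount option table built from the structures above; all of
 * the "example" names are hypothetical.  Here "ro" and "rw" cancel each
 * other, and "rw" is on by default.
 */
static char *example_ro_cancel[] = { "rw", NULL };
static char *example_rw_cancel[] = { "ro", NULL };

static mntopt_t example_mntopts[] = {
	/* mo_name	mo_cancel		mo_arg	mo_flags	mo_data */
	{ "ro",		example_ro_cancel,	NULL,	0,		NULL },
	{ "rw",		example_rw_cancel,	NULL,	MO_DEFAULT,	NULL },
};

static mntopts_t example_opttbl = {
	sizeof (example_mntopts) / sizeof (mntopt_t),	/* mo_count */
	example_mntopts					/* mo_list */
};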
+/*
+ * The kstat structures associated with the vopstats are kept in an
+ * AVL tree. This is to avoid the case where a file system does not
+ * use a unique fsid_t for each vfs (e.g., namefs). In order to do
+ * this, we need a structure that the AVL tree can use that also
+ * references the kstat.
+ * Note that the vks_fsid is generated from the value reported by
+ * VFS_STATVFS().
+ */
+typedef struct vskstat_anchor {
+ avl_node_t vsk_node; /* Required for use by AVL routines */
+ kstat_t *vsk_ksp; /* kstat structure for vopstats */
+ ulong_t vsk_fsid; /* fsid associated w/this FS */
+} vsk_anchor_t;
+
+extern avl_tree_t vskstat_tree;
+extern kmutex_t vskstat_tree_lock;
+
+/*
+ * Structure per mounted file system. Each mounted file system has
+ * an array of operations and an instance record.
+ *
+ * The file systems are kept on a doubly linked circular list headed by
+ * "rootvfs".
+ * File system implementations should not access this list;
+ * it's intended for use only in the kernel's vfs layer.
+ *
+ * Each zone also has its own list of mounts, containing filesystems mounted
+ * somewhere within the filesystem tree rooted at the zone's rootpath. The
+ * list is doubly linked to match the global list.
+ *
+ * mnttab locking: the in-kernel mnttab uses the vfs_mntpt, vfs_resource and
+ * vfs_mntopts fields in the vfs_t. mntpt and resource are refstr_ts that
+ * are set at mount time and can only be modified during a remount.
+ * It is safe to read these fields if you can prevent a remount on the vfs,
+ * or through the convenience funcs vfs_getmntpoint() and vfs_getresource().
+ * The mntopts field may only be accessed through the provided convenience
+ * functions, as it is protected by the vfs list lock. Modifying a mount
+ * option requires grabbing the vfs list write lock, which can be a very
+ * high latency lock.
+ */
+struct zone; /* from zone.h */
+struct fem_head; /* from fem.h */
+
+/*
+ * Private vfs data, NOT to be used by a file system implementation.
+ */
+typedef struct vfs_impl {
+ struct fem_head *vi_femhead; /* fs monitoring */
+ /*
+ * Support for statistics on the vnode operations
+ */
+ vsk_anchor_t *vi_vskap; /* anchor for vopstats' kstat */
+ vopstats_t *vi_fstypevsp; /* ptr to per-fstype vopstats */
+ vopstats_t vi_vopstats; /* per-mount vnode op stats */
+} vfs_impl_t;
+
+typedef struct vfs {
+ struct vfs *vfs_next; /* next VFS in VFS list */
+ struct vfs *vfs_prev; /* prev VFS in VFS list */
+
+/* vfs_op should not be used directly. Accessor functions are provided */
+ vfsops_t *vfs_op; /* operations on VFS */
+
+ struct vnode *vfs_vnodecovered; /* vnode mounted on */
+ uint_t vfs_flag; /* flags */
+ uint_t vfs_bsize; /* native block size */
+ int vfs_fstype; /* file system type index */
+ fsid_t vfs_fsid; /* file system id */
+ void *vfs_data; /* private data */
+ dev_t vfs_dev; /* device of mounted VFS */
+ ulong_t vfs_bcount; /* I/O count (accounting) */
+ struct vfs *vfs_list; /* sync list pointer */
+ struct vfs *vfs_hash; /* hash list pointer */
+ ksema_t vfs_reflock; /* mount/unmount/sync lock */
+ uint_t vfs_count; /* vfs reference count */
+ mntopts_t vfs_mntopts; /* options mounted with */
+ refstr_t *vfs_resource; /* mounted resource name */
+ refstr_t *vfs_mntpt; /* mount point name */
+ time_t vfs_mtime; /* time we were mounted */
+ vfs_impl_t *vfs_implp; /* impl specific data */
+ /*
+ * Zones support. Note that the zone that "owns" the mount isn't
+	 * necessarily the same as the zone in which the mount is visible.
+ * That is, vfs_zone and (vfs_zone_next|vfs_zone_prev) may refer to
+ * different zones.
+ */
+ struct zone *vfs_zone; /* zone that owns the mount */
+ struct vfs *vfs_zone_next; /* next VFS visible in zone */
+ struct vfs *vfs_zone_prev; /* prev VFS visible in zone */
+} vfs_t;
+
+#define vfs_femhead vfs_implp->vi_femhead
+#define vfs_vskap vfs_implp->vi_vskap
+#define vfs_fstypevsp vfs_implp->vi_fstypevsp
+#define vfs_vopstats vfs_implp->vi_vopstats
+
+/*
+ * VFS flags.
+ */
+#define VFS_RDONLY 0x01 /* read-only vfs */
+#define VFS_NOMNTTAB 0x02 /* vfs not seen in mnttab */
+#define VFS_NOSETUID 0x08 /* setuid disallowed */
+#define VFS_REMOUNT 0x10 /* modify mount options only */
+#define VFS_NOTRUNC 0x20 /* does not truncate long file names */
+#define VFS_UNLINKABLE 0x40 /* unlink(2) can be applied to root */
+#define VFS_PXFS 0x80 /* clustering: global fs proxy vfs */
+#define VFS_UNMOUNTED 0x100 /* file system has been unmounted */
+#define VFS_NBMAND 0x200 /* allow non-blocking mandatory locks */
+#define VFS_XATTR 0x400 /* fs supports extended attributes */
+#define VFS_NODEVICES 0x800 /* device-special files disallowed */
+#define VFS_NOEXEC 0x1000 /* executables disallowed */
+#define VFS_STATS 0x2000 /* file system can collect stats */
+
+#define VFS_NORESOURCE "unspecified_resource"
+#define VFS_NOMNTPT "unspecified_mountpoint"
+
+/*
+ * Argument structure for mount(2).
+ *
+ * Flags are defined in <sys/mount.h>.
+ *
+ * Note that if the MS_SYSSPACE bit is set in flags, the pointer fields in
+ * this structure are to be interpreted as kernel addresses. File systems
+ * should be prepared for this possibility.
+ */
+struct mounta {
+ char *spec;
+ char *dir;
+ int flags;
+ char *fstype;
+ char *dataptr;
+ int datalen;
+ char *optptr;
+ int optlen;
+};
+
+/*
+ * Reasons for calling the vfs_mountroot() operation.
+ */
+enum whymountroot { ROOT_INIT, ROOT_REMOUNT, ROOT_UNMOUNT};
+typedef enum whymountroot whymountroot_t;
+
+/*
+ * Reasons for calling the VFS_VNSTATE():
+ */
+enum vntrans {
+ VNTRANS_EXISTS,
+ VNTRANS_IDLED,
+ VNTRANS_RECLAIMED,
+ VNTRANS_DESTROYED
+};
+typedef enum vntrans vntrans_t;
+
+/*
+ * VFS_OPS defines all the vfs operations. It is used to define
+ * the vfsops structure (below) and the fs_func_p union (vfs_opreg.h).
+ */
+#define VFS_OPS \
+ int (*vfs_mount)(vfs_t *, vnode_t *, struct mounta *, cred_t *); \
+ int (*vfs_unmount)(vfs_t *, int, cred_t *); \
+ int (*vfs_root)(vfs_t *, vnode_t **); \
+ int (*vfs_statvfs)(vfs_t *, statvfs64_t *); \
+ int (*vfs_sync)(vfs_t *, short, cred_t *); \
+ int (*vfs_vget)(vfs_t *, vnode_t **, fid_t *); \
+ int (*vfs_mountroot)(vfs_t *, enum whymountroot); \
+ void (*vfs_freevfs)(vfs_t *); \
+ int (*vfs_vnstate)(vfs_t *, vnode_t *, vntrans_t) /* NB: No ";" */
+
+/*
+ * Operations supported on virtual file system.
+ */
+struct vfsops {
+ VFS_OPS; /* Signature of all vfs operations (vfsops) */
+};
+
+extern int fsop_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
+extern int fsop_unmount(vfs_t *, int, cred_t *);
+extern int fsop_root(vfs_t *, vnode_t **);
+extern int fsop_statfs(vfs_t *, statvfs64_t *);
+extern int fsop_sync(vfs_t *, short, cred_t *);
+extern int fsop_vget(vfs_t *, vnode_t **, fid_t *);
+extern int fsop_mountroot(vfs_t *, enum whymountroot);
+extern void fsop_freefs(vfs_t *);
+extern int fsop_sync_by_kind(int, short, cred_t *);
+extern int fsop_vnstate(vfs_t *, vnode_t *, vntrans_t);
+
+#define VFS_MOUNT(vfsp, mvp, uap, cr) fsop_mount(vfsp, mvp, uap, cr)
+#define VFS_UNMOUNT(vfsp, flag, cr) fsop_unmount(vfsp, flag, cr)
+#define VFS_ROOT(vfsp, vpp) fsop_root(vfsp, vpp)
+#define VFS_STATVFS(vfsp, sp) fsop_statfs(vfsp, sp)
+#define VFS_SYNC(vfsp, flag, cr) fsop_sync(vfsp, flag, cr)
+#define VFS_VGET(vfsp, vpp, fidp) fsop_vget(vfsp, vpp, fidp)
+#define VFS_MOUNTROOT(vfsp, init) fsop_mountroot(vfsp, init)
+#define VFS_FREEVFS(vfsp) fsop_freefs(vfsp)
+#define VFS_VNSTATE(vfsp, vn, ns) fsop_vnstate(vfsp, vn, ns)
+
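/*
 * A sketch of how a kernel caller dispatches through the operations vector
 * using the wrapper macros above.  vfsp is assumed to be a held vfs_t, and
 * VN_RELE() is assumed to come from <sys/vnode.h>.
 */
static int
example_get_root(vfs_t *vfsp)
{
	vnode_t *rootvp;
	int error;

	error = VFS_ROOT(vfsp, &rootvp);
	if (error == 0) {
		/* ... use rootvp ... */
		VN_RELE(rootvp);
	}
	return (error);
}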
+#define VFSNAME_MOUNT "mount"
+#define VFSNAME_UNMOUNT "unmount"
+#define VFSNAME_ROOT "root"
+#define VFSNAME_STATVFS "statvfs"
+#define VFSNAME_SYNC "sync"
+#define VFSNAME_VGET "vget"
+#define VFSNAME_MOUNTROOT "mountroot"
+#define VFSNAME_FREEVFS "freevfs"
+#define VFSNAME_VNSTATE "vnstate"
+/*
+ * Filesystem type switch table.
+ */
+
+typedef struct vfssw {
+ char *vsw_name; /* type name -- max len _ST_FSTYPSZ */
+ int (*vsw_init) (int, char *);
+ /* init routine (for non-loadable fs only) */
+ int vsw_flag; /* flags */
+ mntopts_t vsw_optproto; /* mount options table prototype */
+ uint_t vsw_count; /* count of references */
+ kmutex_t vsw_lock; /* lock to protect vsw_count */
+ vfsops_t vsw_vfsops; /* filesystem operations vector */
+} vfssw_t;
+
+/*
+ * Filesystem type definition record. All file systems must export a record
+ * of this type through their modlfs structure.
+ */
+
+typedef struct vfsdef_v3 {
+ int def_version; /* structure version, must be first */
+ char *name; /* filesystem type name */
+ int (*init) (int, char *); /* init routine */
+ int flags; /* filesystem flags */
+ mntopts_t *optproto; /* mount options table prototype */
+} vfsdef_v3;
+
+typedef struct vfsdef_v3 vfsdef_t;
+
+enum {
+ VFSDEF_VERSION = 3
+};
+
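/*
 * A sketch of the type definition record a loadable file system might export
 * through its modlfs.  The "examplefs" names are hypothetical; VSW_HASPROTO
 * is one of the vsw/vfsdef flags defined just below.
 */
static int example_init(int, char *);
static mntopts_t example_opttbl;	/* mount option prototype */

static vfsdef_t example_vfsdef = {
	VFSDEF_VERSION,		/* def_version, must be first */
	"examplefs",		/* name */
	example_init,		/* init */
	VSW_HASPROTO,		/* flags */
	&example_opttbl		/* optproto */
};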
+/*
+ * flags for vfssw and vfsdef
+ */
+#define VSW_HASPROTO 0x01 /* struct has a mount options prototype */
+#define VSW_CANRWRO 0x02 /* file system can transition from rw to ro */
+#define VSW_CANREMOUNT 0x04 /* file system supports remounts */
+#define VSW_NOTZONESAFE 0x08 /* zone_enter(2) should fail for these files */
+#define VSW_VOLATILEDEV 0x10 /* vfs_dev can change each time fs is mounted */
+#define VSW_STATS 0x20 /* file system can collect stats */
+
+#define VSW_INSTALLED 0x8000 /* this vsw is associated with a file system */
+
+#if defined(_KERNEL)
+/*
+ * Public operations.
+ */
+struct umounta;
+struct statvfsa;
+struct fstatvfsa;
+
+void vfs_freevfsops(vfsops_t *);
+int vfs_freevfsops_by_type(int);
+void vfs_setops(vfs_t *, vfsops_t *);
+vfsops_t *vfs_getops(vfs_t *vfsp);
+int vfs_matchops(vfs_t *, vfsops_t *);
+int vfs_can_sync(vfs_t *vfsp);
+void vfs_init(vfs_t *vfsp, vfsops_t *, void *);
+void vfsimpl_setup(vfs_t *vfsp);
+void vfsimpl_teardown(vfs_t *vfsp);
+void vn_exists(vnode_t *);
+void vn_idle(vnode_t *);
+void vn_reclaim(vnode_t *);
+void vn_invalid(vnode_t *);
+
+int rootconf(void);
+int svm_rootconf(void);
+int domount(char *, struct mounta *, vnode_t *, struct cred *,
+ struct vfs **);
+int dounmount(struct vfs *, int, cred_t *);
+int vfs_lock(struct vfs *);
+int vfs_rlock(struct vfs *);
+void vfs_lock_wait(struct vfs *);
+void vfs_rlock_wait(struct vfs *);
+void vfs_unlock(struct vfs *);
+int vfs_lock_held(struct vfs *);
+struct _kthread *vfs_lock_owner(struct vfs *);
+void sync(void);
+void vfs_sync(int);
+void vfs_mountroot(void);
+void vfs_add(vnode_t *, struct vfs *, int);
+void vfs_remove(struct vfs *);
+
+/* The following functions are not for general use by filesystems */
+
+void vfs_createopttbl(mntopts_t *, const char *);
+void vfs_copyopttbl(const mntopts_t *, mntopts_t *);
+void vfs_mergeopttbl(const mntopts_t *, const mntopts_t *, mntopts_t *);
+void vfs_freeopttbl(mntopts_t *);
+void vfs_parsemntopts(mntopts_t *, char *, int);
+int vfs_buildoptionstr(const mntopts_t *, char *, int);
+struct mntopt *vfs_hasopt(const mntopts_t *, const char *);
+void vfs_mnttab_modtimeupd(void);
+
+void vfs_clearmntopt(struct vfs *, const char *);
+void vfs_setmntopt(struct vfs *, const char *, const char *, int);
+void vfs_setresource(struct vfs *, const char *);
+void vfs_setmntpoint(struct vfs *, const char *);
+refstr_t *vfs_getresource(const struct vfs *);
+refstr_t *vfs_getmntpoint(const struct vfs *);
+int vfs_optionisset(const struct vfs *, const char *, char **);
+int vfs_settag(uint_t, uint_t, const char *, const char *, cred_t *);
+int vfs_clrtag(uint_t, uint_t, const char *, const char *, cred_t *);
+void vfs_syncall(void);
+void vfs_syncprogress(void);
+void vfsinit(void);
+void vfs_unmountall(void);
+void vfs_make_fsid(fsid_t *, dev_t, int);
+void vfs_addmip(dev_t, struct vfs *);
+void vfs_delmip(struct vfs *);
+int vfs_devismounted(dev_t);
+int vfs_devmounting(dev_t, struct vfs *);
+int vfs_opsinuse(vfsops_t *);
+struct vfs *getvfs(fsid_t *);
+struct vfs *vfs_dev2vfsp(dev_t);
+struct vfs *vfs_mntpoint2vfsp(const char *);
+struct vfssw *allocate_vfssw(char *);
+struct vfssw *vfs_getvfssw(char *);
+struct vfssw *vfs_getvfsswbyname(char *);
+struct vfssw *vfs_getvfsswbyvfsops(vfsops_t *);
+void vfs_refvfssw(struct vfssw *);
+void vfs_unrefvfssw(struct vfssw *);
+uint_t vf_to_stf(uint_t);
+void vfs_mnttab_modtime(timespec_t *);
+void vfs_mnttab_poll(timespec_t *, struct pollhead **);
+
+void vfs_list_lock(void);
+void vfs_list_read_lock(void);
+void vfs_list_unlock(void);
+void vfs_list_add(struct vfs *);
+void vfs_list_remove(struct vfs *);
+void vfs_hold(vfs_t *vfsp);
+void vfs_rele(vfs_t *vfsp);
+void fs_freevfs(vfs_t *);
+void vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype);
+
+int vfs_zone_change_safe(vfs_t *);
+
+#define VFSHASH(maj, min) (((int)((maj)+(min))) & (vfshsz - 1))
+#define VFS_ON_LIST(vfsp) \
+ ((vfsp)->vfs_next != (vfsp) && (vfsp)->vfs_next != NULL)
+
+/*
+ * Globals.
+ */
+
+extern struct vfssw vfssw[]; /* table of filesystem types */
+extern krwlock_t vfssw_lock;
+extern char rootfstype[]; /* name of root fstype */
+extern const int nfstype; /* # of elements in vfssw array */
+extern vfsops_t *EIO_vfsops; /* operations for vfs being torn-down */
+
+/*
+ * The following variables are private to the kernel's vfs layer.  File
+ * system implementations should not access them.
+ */
+extern struct vfs *rootvfs; /* ptr to root vfs structure */
+typedef struct {
+ struct vfs *rvfs_head; /* head vfs in chain */
+ kmutex_t rvfs_lock; /* mutex protecting this chain */
+ uint32_t rvfs_len; /* length of this chain */
+} rvfs_t;
+extern rvfs_t *rvfs_list;
+extern int vfshsz; /* # of elements in rvfs_head array */
+extern const mntopts_t vfs_mntopts; /* globally recognized options */
+
+#endif /* defined(_KERNEL) */
+
+#define VFS_HOLD(vfsp) { \
+ vfs_hold(vfsp); \
+}
+
+#define VFS_RELE(vfsp) { \
+ vfs_rele(vfsp); \
+}
+
+#define VFS_INIT(vfsp, op, data) { \
+ vfs_init((vfsp), (op), (data)); \
+ vfsimpl_setup((vfsp)); \
+}
+
+
+#define VFS_INSTALLED(vfsswp) (((vfsswp)->vsw_flag & VSW_INSTALLED) != 0)
+#define ALLOCATED_VFSSW(vswp) ((vswp)->vsw_name[0] != '\0')
+#define RLOCK_VFSSW() (rw_enter(&vfssw_lock, RW_READER))
+#define RUNLOCK_VFSSW() (rw_exit(&vfssw_lock))
+#define WLOCK_VFSSW() (rw_enter(&vfssw_lock, RW_WRITER))
+#define WUNLOCK_VFSSW() (rw_exit(&vfssw_lock))
+#define VFSSW_LOCKED() (RW_LOCK_HELD(&vfssw_lock))
+#define VFSSW_WRITE_LOCKED() (RW_WRITE_HELD(&vfssw_lock))
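/*
 * A sketch of walking the filesystem switch table under the reader lock.
 * In practice vfs_getvfsswbyname() does this lookup (and takes the
 * reference); the loop below only illustrates the locking macros.  The
 * "examplefs" name is hypothetical, and entry 0 is skipped in this sketch.
 */
static struct vfssw *
example_find_fs(void)
{
	struct vfssw *vswp = NULL;
	int i;

	RLOCK_VFSSW();
	for (i = 1; i < nfstype; i++) {
		if (ALLOCATED_VFSSW(&vfssw[i]) &&
		    strcmp(vfssw[i].vsw_name, "examplefs") == 0) {
			vswp = &vfssw[i];
			vfs_refvfssw(vswp);	/* hold before dropping lock */
			break;
		}
	}
	RUNLOCK_VFSSW();
	return (vswp);
}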
+/*
+ * VFS_SYNC flags.
+ */
+#define SYNC_ATTR 0x01 /* sync attributes only */
+#define SYNC_CLOSE 0x02 /* close open file */
+#define SYNC_ALL 0x04 /* force to sync all fs */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VFS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/vmem.h b/sys/contrib/opensolaris/uts/common/sys/vmem.h
new file mode 100644
index 000000000000..1cd2f30e9ba3
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/vmem.h
@@ -0,0 +1,142 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VMEM_H
+#define _SYS_VMEM_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * Per-allocation flags
+ */
+#define VM_SLEEP 0x00000000 /* same as KM_SLEEP */
+#define VM_NOSLEEP 0x00000001 /* same as KM_NOSLEEP */
+#define VM_PANIC 0x00000002 /* same as KM_PANIC */
+#define VM_PUSHPAGE 0x00000004 /* same as KM_PUSHPAGE */
+#define VM_KMFLAGS 0x000000ff /* flags that must match KM_* flags */
+
+#define VM_BESTFIT 0x00000100
+#define VM_FIRSTFIT 0x00000200
+#define VM_NEXTFIT 0x00000400
+
+/*
+ * The following flags are restricted for use only within the kernel.
+ * VM_MEMLOAD is for use by the HAT to avoid infinite recursion.
+ * VM_NORELOC is used by the kernel when static VA->PA mappings are required.
+ */
+#define VM_MEMLOAD 0x00000800
+#define VM_NORELOC 0x00001000
+/*
+ * VM_ABORT requests that vmem_alloc() *ignore* the VM_SLEEP/VM_NOSLEEP flags
+ * and forgo reaping if the allocation or attempted import fails.  This
+ * flag is a segkmem-specific flag, and should not be used by anyone else.
+ */
+#define VM_ABORT 0x00002000
+
+#define VM_FLAGS 0x0000FFFF
+
+/*
+ * Arena creation flags
+ */
+#define VMC_POPULATOR 0x00010000
+#define VMC_NO_QCACHE 0x00020000 /* cannot use quantum caches */
+#define VMC_IDENTIFIER 0x00040000 /* not backed by memory */
+/*
+ * Internal use only; the import function uses the vmem_ximport_t interface
+ * and may increase the request size if it so desires.
+ */
+#define VMC_XALLOC 0x00080000
+#define VMC_FLAGS 0xFFFF0000
+
+/*
+ * Public segment types
+ */
+#define VMEM_ALLOC 0x01
+#define VMEM_FREE 0x02
+
+/*
+ * Implementation-private segment types
+ */
+#define VMEM_SPAN 0x10
+#define VMEM_ROTOR 0x20
+#define VMEM_WALKER 0x40
+
+/*
+ * VMEM_REENTRANT indicates to vmem_walk() that the callback routine may
+ * call back into the arena being walked, so vmem_walk() must drop the
+ * arena lock before each callback. The caveat is that since the arena
+ * isn't locked, its state can change. Therefore it is up to the callback
+ * routine to handle cases where the segment isn't of the expected type.
+ * For example, we use this to walk heap_arena when generating a crash dump;
+ * see segkmem_dump() for sample usage.
+ */
+#define VMEM_REENTRANT 0x80000000
+
+typedef struct vmem vmem_t;
+typedef void *(vmem_alloc_t)(vmem_t *, size_t, int);
+typedef void (vmem_free_t)(vmem_t *, void *, size_t);
+
+/*
+ * Alternate import style; the requested size is passed in a pointer,
+ * which can be increased by the import function if desired.
+ */
+typedef void *(vmem_ximport_t)(vmem_t *, size_t *, int);
+
+#ifdef _KERNEL
+extern vmem_t *vmem_init(const char *, void *, size_t, size_t,
+ vmem_alloc_t *, vmem_free_t *);
+extern void vmem_update(void *);
+extern int vmem_is_populator();
+extern size_t vmem_seg_size;
+#endif
+
+extern vmem_t *vmem_create(const char *, void *, size_t, size_t,
+ vmem_alloc_t *, vmem_free_t *, vmem_t *, size_t, int);
+extern vmem_t *vmem_xcreate(const char *, void *, size_t, size_t,
+ vmem_ximport_t *, vmem_free_t *, vmem_t *, size_t, int);
+extern void vmem_destroy(vmem_t *);
+extern void *vmem_alloc(vmem_t *, size_t, int);
+extern void *vmem_xalloc(vmem_t *, size_t, size_t, size_t, size_t,
+ void *, void *, int);
+extern void vmem_free(vmem_t *, void *, size_t);
+extern void vmem_xfree(vmem_t *, void *, size_t);
+extern void *vmem_add(vmem_t *, void *, size_t, int);
+extern int vmem_contains(vmem_t *, void *, size_t);
+extern void vmem_walk(vmem_t *, int, void (*)(void *, void *, size_t), void *);
+extern size_t vmem_size(vmem_t *, int);
+
+#ifdef __cplusplus
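/*
 * A sketch of an identifier arena built on the interfaces above: the arena
 * hands out small integers from the range [1, 1024] rather than backing
 * memory, so VMC_IDENTIFIER is set at creation.  All "example" names are
 * hypothetical.
 */
static vmem_t *example_arena;

static uintptr_t
example_id_alloc(void)
{
	if (example_arena == NULL)
		example_arena = vmem_create("example_ids", (void *)1, 1024, 1,
		    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
	return ((uintptr_t)vmem_alloc(example_arena, 1, VM_SLEEP));
}

static void
example_id_free(uintptr_t id)
{
	vmem_free(example_arena, (void *)id, 1);
}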
+}
+#endif
+
+#endif /* _SYS_VMEM_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/zmod.h b/sys/contrib/opensolaris/uts/common/sys/zmod.h
new file mode 100644
index 000000000000..ba0267203ce3
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/sys/zmod.h
@@ -0,0 +1,68 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZMOD_H
+#define _ZMOD_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * zmod - RFC-1950-compatible decompression routines
+ *
+ * This file provides the public interfaces to zmod, an in-kernel RFC 1950
+ * decompression library. More information about the implementation of these
+ * interfaces can be found in the usr/src/uts/common/zmod/ directory.
+ */
+
+#define Z_OK 0
+#define Z_STREAM_END 1
+#define Z_NEED_DICT 2
+#define Z_ERRNO (-1)
+#define Z_STREAM_ERROR (-2)
+#define Z_DATA_ERROR (-3)
+#define Z_MEM_ERROR (-4)
+#define Z_BUF_ERROR (-5)
+#define Z_VERSION_ERROR (-6)
+
+#define Z_NO_COMPRESSION 0
+#define Z_BEST_SPEED 1
+#define Z_BEST_COMPRESSION 9
+#define Z_DEFAULT_COMPRESSION (-1)
+
+extern int z_uncompress(void *, size_t *, const void *, size_t);
+extern int z_compress(void *, size_t *, const void *, size_t);
+extern int z_compress_level(void *, size_t *, const void *, size_t, int);
+extern const char *z_strerror(int);
+
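/*
 * A sketch of how a kernel consumer might call the interfaces above; the
 * example_inflate name is hypothetical and the destination buffer is assumed
 * to be large enough for the expanded data.
 */
static int
example_inflate(void *dst, size_t dstlen, const void *src, size_t srclen)
{
	size_t outlen = dstlen;
	int err;

	err = z_uncompress(dst, &outlen, src, srclen);
	if (err != Z_OK)
		return (err);	/* z_strerror(err) gives a printable reason */
	/* on success, outlen holds the number of bytes produced */
	return (0);
}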
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZMOD_H */
diff --git a/sys/contrib/opensolaris/uts/common/zmod/adler32.c b/sys/contrib/opensolaris/uts/common/zmod/adler32.c
new file mode 100644
index 000000000000..59d84632ed5d
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/adler32.c
@@ -0,0 +1,149 @@
+/* adler32.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2004 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#define ZLIB_INTERNAL
+#include "zlib.h"
+
+#define BASE 65521UL /* largest prime smaller than 65536 */
+#define NMAX 5552
+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+
+#define DO1(buf,i) {adler += (buf)[i]; sum2 += adler;}
+#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1);
+#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2);
+#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
+#define DO16(buf) DO8(buf,0); DO8(buf,8);
+
+/* use NO_DIVIDE if your processor does not do division in hardware */
+#ifdef NO_DIVIDE
+# define MOD(a) \
+ do { \
+ if (a >= (BASE << 16)) a -= (BASE << 16); \
+ if (a >= (BASE << 15)) a -= (BASE << 15); \
+ if (a >= (BASE << 14)) a -= (BASE << 14); \
+ if (a >= (BASE << 13)) a -= (BASE << 13); \
+ if (a >= (BASE << 12)) a -= (BASE << 12); \
+ if (a >= (BASE << 11)) a -= (BASE << 11); \
+ if (a >= (BASE << 10)) a -= (BASE << 10); \
+ if (a >= (BASE << 9)) a -= (BASE << 9); \
+ if (a >= (BASE << 8)) a -= (BASE << 8); \
+ if (a >= (BASE << 7)) a -= (BASE << 7); \
+ if (a >= (BASE << 6)) a -= (BASE << 6); \
+ if (a >= (BASE << 5)) a -= (BASE << 5); \
+ if (a >= (BASE << 4)) a -= (BASE << 4); \
+ if (a >= (BASE << 3)) a -= (BASE << 3); \
+ if (a >= (BASE << 2)) a -= (BASE << 2); \
+ if (a >= (BASE << 1)) a -= (BASE << 1); \
+ if (a >= BASE) a -= BASE; \
+ } while (0)
+# define MOD4(a) \
+ do { \
+ if (a >= (BASE << 4)) a -= (BASE << 4); \
+ if (a >= (BASE << 3)) a -= (BASE << 3); \
+ if (a >= (BASE << 2)) a -= (BASE << 2); \
+ if (a >= (BASE << 1)) a -= (BASE << 1); \
+ if (a >= BASE) a -= BASE; \
+ } while (0)
+#else
+# define MOD(a) a %= BASE
+# define MOD4(a) a %= BASE
+#endif
+
+/* ========================================================================= */
+uLong ZEXPORT adler32(adler, buf, len)
+ uLong adler;
+ const Bytef *buf;
+ uInt len;
+{
+ unsigned long sum2;
+ unsigned n;
+
+ /* split Adler-32 into component sums */
+ sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (len == 1) {
+ adler += buf[0];
+ if (adler >= BASE)
+ adler -= BASE;
+ sum2 += adler;
+ if (sum2 >= BASE)
+ sum2 -= BASE;
+ return adler | (sum2 << 16);
+ }
+
+ /* initial Adler-32 value (deferred check for len == 1 speed) */
+ if (buf == Z_NULL)
+ return 1L;
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (len < 16) {
+ while (len--) {
+ adler += *buf++;
+ sum2 += adler;
+ }
+ if (adler >= BASE)
+ adler -= BASE;
+ MOD4(sum2); /* only added so many BASE's */
+ return adler | (sum2 << 16);
+ }
+
+ /* do length NMAX blocks -- requires just one modulo operation */
+ while (len >= NMAX) {
+ len -= NMAX;
+ n = NMAX / 16; /* NMAX is divisible by 16 */
+ do {
+ DO16(buf); /* 16 sums unrolled */
+ buf += 16;
+ } while (--n);
+ MOD(adler);
+ MOD(sum2);
+ }
+
+ /* do remaining bytes (less than NMAX, still just one modulo) */
+ if (len) { /* avoid modulos if none remaining */
+ while (len >= 16) {
+ len -= 16;
+ DO16(buf);
+ buf += 16;
+ }
+ while (len--) {
+ adler += *buf++;
+ sum2 += adler;
+ }
+ MOD(adler);
+ MOD(sum2);
+ }
+
+ /* return recombined sums */
+ return adler | (sum2 << 16);
+}
+
+/* ========================================================================= */
+uLong ZEXPORT adler32_combine(adler1, adler2, len2)
+ uLong adler1;
+ uLong adler2;
+ z_off_t len2;
+{
+ unsigned long sum1;
+ unsigned long sum2;
+ unsigned rem;
+
+ /* the derivation of this formula is left as an exercise for the reader */
+ rem = (unsigned)(len2 % BASE);
+ sum1 = adler1 & 0xffff;
+ sum2 = rem * sum1;
+ MOD(sum2);
+ sum1 += (adler2 & 0xffff) + BASE - 1;
+ sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
+ if (sum1 > BASE) sum1 -= BASE;
+ if (sum1 > BASE) sum1 -= BASE;
+ if (sum2 > (BASE << 1)) sum2 -= (BASE << 1);
+ if (sum2 > BASE) sum2 -= BASE;
+ return sum1 | (sum2 << 16);
+}
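/*
 * A usage sketch: checksumming a buffer in one pass versus two halves joined
 * with adler32_combine().  buf and len are assumed to describe an arbitrary
 * byte buffer; the function returns nonzero when the two results agree.
 */
static int
example_adler32_split_check(const Bytef *buf, uInt len)
{
	uLong whole, first, second;

	whole  = adler32(adler32(0L, Z_NULL, 0), buf, len);
	first  = adler32(adler32(0L, Z_NULL, 0), buf, len / 2);
	second = adler32(adler32(0L, Z_NULL, 0), buf + len / 2, len - len / 2);

	return (whole == adler32_combine(first, second,
	    (z_off_t)(len - len / 2)));
}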
diff --git a/sys/contrib/opensolaris/uts/common/zmod/crc32.c b/sys/contrib/opensolaris/uts/common/zmod/crc32.c
new file mode 100644
index 000000000000..61ad581ef562
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/crc32.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors. This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
+ protection on the static variables used to control the first-use generation
+ of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
+ first call get_crc_table() to initialize the tables before allowing more than
+ one thread to use crc32().
+ */
+
+#ifdef MAKECRCH
+# include <stdio.h>
+# ifndef DYNAMIC_CRC_TABLE
+# define DYNAMIC_CRC_TABLE
+# endif /* !DYNAMIC_CRC_TABLE */
+#endif /* MAKECRCH */
+
+#include "zutil.h" /* for STDC and FAR definitions */
+
+#define local static
+
+/* Find a four-byte integer type for crc32_little() and crc32_big(). */
+#ifndef NOBYFOUR
+# ifdef STDC /* need ANSI C limits.h to determine sizes */
+# include <limits.h>
+# define BYFOUR
+# if (UINT_MAX == 0xffffffffUL)
+ typedef unsigned int u4;
+# else
+# if (ULONG_MAX == 0xffffffffUL)
+ typedef unsigned long u4;
+# else
+# if (USHRT_MAX == 0xffffffffUL)
+ typedef unsigned short u4;
+# else
+# undef BYFOUR /* can't find a four-byte integer type! */
+# endif
+# endif
+# endif
+# endif /* STDC */
+#endif /* !NOBYFOUR */
+
+/* Definitions for doing the crc four data bytes at a time. */
+#ifdef BYFOUR
+# define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \
+ (((w)&0xff00)<<8)+(((w)&0xff)<<24))
+ local unsigned long crc32_little OF((unsigned long,
+ const unsigned char FAR *, unsigned));
+ local unsigned long crc32_big OF((unsigned long,
+ const unsigned char FAR *, unsigned));
+# define TBLS 8
+#else
+# define TBLS 1
+#endif /* BYFOUR */
+
+/* Local functions for crc concatenation */
+local unsigned long gf2_matrix_times OF((unsigned long *mat,
+ unsigned long vec));
+local void gf2_matrix_square OF((unsigned long *square, unsigned long *mat));
+
+#ifdef DYNAMIC_CRC_TABLE
+
+local volatile int crc_table_empty = 1;
+local unsigned long FAR crc_table[TBLS][256];
+local void make_crc_table OF((void));
+#ifdef MAKECRCH
+ local void write_table OF((FILE *, const unsigned long FAR *));
+#endif /* MAKECRCH */
+/*
+ Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
+ x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
+
+ Polynomials over GF(2) are represented in binary, one bit per coefficient,
+ with the lowest powers in the most significant bit. Then adding polynomials
+ is just exclusive-or, and multiplying a polynomial by x is a right shift by
+ one. If we call the above polynomial p, and represent a byte as the
+ polynomial q, also with the lowest power in the most significant bit (so the
+ byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
+ where a mod b means the remainder after dividing a by b.
+
+ This calculation is done using the shift-register method of multiplying and
+ taking the remainder. The register is initialized to zero, and for each
+ incoming bit, x^32 is added mod p to the register if the bit is a one (where
+ x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
+ x (which is shifting right by one and adding x^32 mod p if the bit shifted
+ out is a one). We start with the highest power (least significant bit) of
+ q and repeat for all eight bits of q.
+
+ The first table is simply the CRC of all possible eight bit values. This is
+ all the information needed to generate CRCs on data a byte at a time for all
+ combinations of CRC register values and incoming bytes. The remaining tables
+ allow for word-at-a-time CRC calculation for both big-endian and little-
+ endian machines, where a word is four bytes.
+*/
+local void make_crc_table()
+{
+ unsigned long c;
+ int n, k;
+ unsigned long poly; /* polynomial exclusive-or pattern */
+ /* terms of polynomial defining this crc (except x^32): */
+ static volatile int first = 1; /* flag to limit concurrent making */
+ static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
+
+ /* See if another task is already doing this (not thread-safe, but better
+ than nothing -- significantly reduces duration of vulnerability in
+ case the advice about DYNAMIC_CRC_TABLE is ignored) */
+ if (first) {
+ first = 0;
+
+ /* make exclusive-or pattern from polynomial (0xedb88320UL) */
+ poly = 0UL;
+ for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++)
+ poly |= 1UL << (31 - p[n]);
+
+ /* generate a crc for every 8-bit value */
+ for (n = 0; n < 256; n++) {
+ c = (unsigned long)n;
+ for (k = 0; k < 8; k++)
+ c = c & 1 ? poly ^ (c >> 1) : c >> 1;
+ crc_table[0][n] = c;
+ }
+
+#ifdef BYFOUR
+ /* generate crc for each value followed by one, two, and three zeros,
+ and then the byte reversal of those as well as the first table */
+ for (n = 0; n < 256; n++) {
+ c = crc_table[0][n];
+ crc_table[4][n] = REV(c);
+ for (k = 1; k < 4; k++) {
+ c = crc_table[0][c & 0xff] ^ (c >> 8);
+ crc_table[k][n] = c;
+ crc_table[k + 4][n] = REV(c);
+ }
+ }
+#endif /* BYFOUR */
+
+ crc_table_empty = 0;
+ }
+ else { /* not first */
+ /* wait for the other guy to finish (not efficient, but rare) */
+ while (crc_table_empty)
+ ;
+ }
+
+#ifdef MAKECRCH
+ /* write out CRC tables to crc32.h */
+ {
+ FILE *out;
+
+ out = fopen("crc32.h", "w");
+ if (out == NULL) return;
+ fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
+ fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
+ fprintf(out, "local const unsigned long FAR ");
+ fprintf(out, "crc_table[TBLS][256] =\n{\n {\n");
+ write_table(out, crc_table[0]);
+# ifdef BYFOUR
+ fprintf(out, "#ifdef BYFOUR\n");
+ for (k = 1; k < 8; k++) {
+ fprintf(out, " },\n {\n");
+ write_table(out, crc_table[k]);
+ }
+ fprintf(out, "#endif\n");
+# endif /* BYFOUR */
+ fprintf(out, " }\n};\n");
+ fclose(out);
+ }
+#endif /* MAKECRCH */
+}
+
+#ifdef MAKECRCH
+local void write_table(out, table)
+ FILE *out;
+ const unsigned long FAR *table;
+{
+ int n;
+
+ for (n = 0; n < 256; n++)
+ fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ", table[n],
+ n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", "));
+}
+#endif /* MAKECRCH */
+
+#else /* !DYNAMIC_CRC_TABLE */
+/* ========================================================================
+ * Tables of CRC-32s of all single-byte values, made by make_crc_table().
+ */
+#include "crc32.h"
+#endif /* DYNAMIC_CRC_TABLE */
+
+/* =========================================================================
+ * This function can be used by asm versions of crc32()
+ */
+const unsigned long FAR * ZEXPORT get_crc_table()
+{
+#ifdef DYNAMIC_CRC_TABLE
+ if (crc_table_empty)
+ make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+ return (const unsigned long FAR *)crc_table;
+}
+
+/* ========================================================================= */
+#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
+#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+
+/* ========================================================================= */
+unsigned long ZEXPORT crc32(crc, buf, len)
+ unsigned long crc;
+ const unsigned char FAR *buf;
+ unsigned len;
+{
+ if (buf == Z_NULL) return 0UL;
+
+#ifdef DYNAMIC_CRC_TABLE
+ if (crc_table_empty)
+ make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+
+#ifdef BYFOUR
+ if (sizeof(void *) == sizeof(ptrdiff_t)) {
+ u4 endian;
+
+ endian = 1;
+ if (*((unsigned char *)(&endian)))
+ return crc32_little(crc, buf, len);
+ else
+ return crc32_big(crc, buf, len);
+ }
+#endif /* BYFOUR */
+ crc = crc ^ 0xffffffffUL;
+ while (len >= 8) {
+ DO8;
+ len -= 8;
+ }
+ if (len) do {
+ DO1;
+ } while (--len);
+ return crc ^ 0xffffffffUL;
+}
+
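/*
 * A usage sketch: computing a CRC-32 incrementally over two hypothetical
 * buffers, seeding with crc32(0L, Z_NULL, 0).
 */
static unsigned long
example_crc_of_two(const unsigned char FAR *b1, unsigned l1,
    const unsigned char FAR *b2, unsigned l2)
{
	unsigned long crc;

	crc = crc32(0L, Z_NULL, 0);	/* initial value */
	crc = crc32(crc, b1, l1);	/* feed data in pieces */
	crc = crc32(crc, b2, l2);
	return crc;
}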
+#ifdef BYFOUR
+
+/* ========================================================================= */
+#define DOLIT4 c ^= *buf4++; \
+ c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+ crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+
+/* ========================================================================= */
+local unsigned long crc32_little(crc, buf, len)
+ unsigned long crc;
+ const unsigned char FAR *buf;
+ unsigned len;
+{
+ register u4 c;
+ register const u4 FAR *buf4;
+
+ c = (u4)crc;
+ c = ~c;
+ while (len && ((ptrdiff_t)buf & 3)) {
+ c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+ len--;
+ }
+
+ buf4 = (const u4 FAR *)(const void FAR *)buf;
+ while (len >= 32) {
+ DOLIT32;
+ len -= 32;
+ }
+ while (len >= 4) {
+ DOLIT4;
+ len -= 4;
+ }
+ buf = (const unsigned char FAR *)buf4;
+
+ if (len) do {
+ c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+ } while (--len);
+ c = ~c;
+ return (unsigned long)c;
+}
+
+/* ========================================================================= */
+#define DOBIG4 c ^= *++buf4; \
+ c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+ crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+
+/* ========================================================================= */
+local unsigned long crc32_big(crc, buf, len)
+ unsigned long crc;
+ const unsigned char FAR *buf;
+ unsigned len;
+{
+ register u4 c;
+ register const u4 FAR *buf4;
+
+ c = REV((u4)crc);
+ c = ~c;
+ while (len && ((ptrdiff_t)buf & 3)) {
+ c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+ len--;
+ }
+
+ buf4 = (const u4 FAR *)(const void FAR *)buf;
+ buf4--;
+ while (len >= 32) {
+ DOBIG32;
+ len -= 32;
+ }
+ while (len >= 4) {
+ DOBIG4;
+ len -= 4;
+ }
+ buf4++;
+ buf = (const unsigned char FAR *)buf4;
+
+ if (len) do {
+ c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+ } while (--len);
+ c = ~c;
+ return (unsigned long)(REV(c));
+}
+
+#endif /* BYFOUR */
+
+#define GF2_DIM 32 /* dimension of GF(2) vectors (length of CRC) */
+
+/* ========================================================================= */
+local unsigned long gf2_matrix_times(mat, vec)
+ unsigned long *mat;
+ unsigned long vec;
+{
+ unsigned long sum;
+
+ sum = 0;
+ while (vec) {
+ if (vec & 1)
+ sum ^= *mat;
+ vec >>= 1;
+ mat++;
+ }
+ return sum;
+}
+
+/* ========================================================================= */
+local void gf2_matrix_square(square, mat)
+ unsigned long *square;
+ unsigned long *mat;
+{
+ int n;
+
+ for (n = 0; n < GF2_DIM; n++)
+ square[n] = gf2_matrix_times(mat, mat[n]);
+}
+
+/* ========================================================================= */
+uLong ZEXPORT crc32_combine(crc1, crc2, len2)
+ uLong crc1;
+ uLong crc2;
+ z_off_t len2;
+{
+ int n;
+ unsigned long row;
+ unsigned long even[GF2_DIM]; /* even-power-of-two zeros operator */
+ unsigned long odd[GF2_DIM]; /* odd-power-of-two zeros operator */
+
+ /* degenerate case */
+ if (len2 == 0)
+ return crc1;
+
+ /* put operator for one zero bit in odd */
+ odd[0] = 0xedb88320UL; /* CRC-32 polynomial */
+ row = 1;
+ for (n = 1; n < GF2_DIM; n++) {
+ odd[n] = row;
+ row <<= 1;
+ }
+
+ /* put operator for two zero bits in even */
+ gf2_matrix_square(even, odd);
+
+ /* put operator for four zero bits in odd */
+ gf2_matrix_square(odd, even);
+
+ /* apply len2 zeros to crc1 (first square will put the operator for one
+ zero byte, eight zero bits, in even) */
+ do {
+ /* apply zeros operator for this bit of len2 */
+ gf2_matrix_square(even, odd);
+ if (len2 & 1)
+ crc1 = gf2_matrix_times(even, crc1);
+ len2 >>= 1;
+
+ /* if no more bits set, then done */
+ if (len2 == 0)
+ break;
+
+ /* another iteration of the loop with odd and even swapped */
+ gf2_matrix_square(odd, even);
+ if (len2 & 1)
+ crc1 = gf2_matrix_times(odd, crc1);
+ len2 >>= 1;
+
+ /* if no more bits set, then done */
+ } while (len2 != 0);
+
+ /* return combined crc */
+ crc1 ^= crc2;
+ return crc1;
+}
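/*
 * A usage sketch: given the CRCs of two hypothetical buffers, derive the CRC
 * of their concatenation without re-reading either buffer.
 */
static uLong
example_crc_concat(const unsigned char FAR *p1, unsigned l1,
    const unsigned char FAR *p2, unsigned l2)
{
	uLong crc_a, crc_b;

	crc_a = crc32(crc32(0L, Z_NULL, 0), p1, l1);
	crc_b = crc32(crc32(0L, Z_NULL, 0), p2, l2);
	return crc32_combine(crc_a, crc_b, (z_off_t)l2);
}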
diff --git a/sys/contrib/opensolaris/uts/common/zmod/crc32.h b/sys/contrib/opensolaris/uts/common/zmod/crc32.h
new file mode 100644
index 000000000000..495c83e03179
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/crc32.h
@@ -0,0 +1,443 @@
+/* crc32.h -- tables for rapid CRC calculation
+ * Generated automatically by crc32.c
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+local const unsigned long FAR crc_table[TBLS][256] =
+{
+ {
+ 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
+ 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
+ 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
+ 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
+ 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
+ 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
+ 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
+ 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
+ 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
+ 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
+ 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
+ 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
+ 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
+ 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
+ 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
+ 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
+ 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
+ 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
+ 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
+ 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
+ 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
+ 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
+ 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
+ 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
+ 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
+ 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
+ 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
+ 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
+ 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
+ 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
+ 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
+ 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
+ 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
+ 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
+ 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
+ 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
+ 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
+ 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
+ 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
+ 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
+ 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
+ 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
+ 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
+ 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
+ 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
+ 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
+ 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
+ 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
+ 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
+ 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
+ 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
+ 0x2d02ef8dUL
+#ifdef BYFOUR
+ },
+ {
+ 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL,
+ 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL,
+ 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL,
+ 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL,
+ 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL,
+ 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL,
+ 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL,
+ 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL,
+ 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL,
+ 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL,
+ 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL,
+ 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL,
+ 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL,
+ 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL,
+ 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL,
+ 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL,
+ 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL,
+ 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL,
+ 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL,
+ 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL,
+ 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL,
+ 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL,
+ 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL,
+ 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL,
+ 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL,
+ 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL,
+ 0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL,
+ 0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL,
+ 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL,
+ 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL,
+ 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL,
+ 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL,
+ 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL,
+ 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL,
+ 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL,
+ 0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL,
+ 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL,
+ 0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL,
+ 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL,
+ 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL,
+ 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL,
+ 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL,
+ 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL,
+ 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL,
+ 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL,
+ 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL,
+ 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL,
+ 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL,
+ 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL,
+ 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL,
+ 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL,
+ 0x9324fd72UL
+ },
+ {
+ 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL,
+ 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL,
+ 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL,
+ 0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL,
+ 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL,
+ 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL,
+ 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL,
+ 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL,
+ 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL,
+ 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL,
+ 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL,
+ 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL,
+ 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL,
+ 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL,
+ 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL,
+ 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL,
+ 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL,
+ 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL,
+ 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL,
+ 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL,
+ 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL,
+ 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL,
+ 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL,
+ 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL,
+ 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL,
+ 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL,
+ 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL,
+ 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL,
+ 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL,
+ 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL,
+ 0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL,
+ 0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL,
+ 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL,
+ 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL,
+ 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL,
+ 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL,
+ 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL,
+ 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL,
+ 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL,
+ 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL,
+ 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL,
+ 0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL,
+ 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL,
+ 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL,
+ 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL,
+ 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL,
+ 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL,
+ 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL,
+ 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL,
+ 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL,
+ 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL,
+ 0xbe9834edUL
+ },
+ {
+ 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL,
+ 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL,
+ 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL,
+ 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL,
+ 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL,
+ 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL,
+ 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL,
+ 0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL,
+ 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL,
+ 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL,
+ 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL,
+ 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL,
+ 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL,
+ 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL,
+ 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL,
+ 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL,
+ 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL,
+ 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL,
+ 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL,
+ 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL,
+ 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL,
+ 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL,
+ 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL,
+ 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL,
+ 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL,
+ 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL,
+ 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL,
+ 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL,
+ 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL,
+ 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL,
+ 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL,
+ 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL,
+ 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL,
+ 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL,
+ 0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL,
+ 0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL,
+ 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL,
+ 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL,
+ 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL,
+ 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL,
+ 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL,
+ 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL,
+ 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL,
+ 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL,
+ 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL,
+ 0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL,
+ 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL,
+ 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL,
+ 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL,
+ 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL,
+ 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL,
+ 0xde0506f1UL
+ },
+ {
+ 0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL,
+ 0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL,
+ 0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL,
+ 0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL,
+ 0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL,
+ 0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL,
+ 0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL,
+ 0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL,
+ 0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL,
+ 0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL,
+ 0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL,
+ 0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL,
+ 0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL,
+ 0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL,
+ 0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL,
+ 0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL,
+ 0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL,
+ 0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL,
+ 0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL,
+ 0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL,
+ 0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL,
+ 0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL,
+ 0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL,
+ 0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL,
+ 0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL,
+ 0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL,
+ 0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL,
+ 0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL,
+ 0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL,
+ 0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL,
+ 0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL,
+ 0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL,
+ 0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL,
+ 0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL,
+ 0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL,
+ 0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL,
+ 0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL,
+ 0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL,
+ 0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL,
+ 0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL,
+ 0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL,
+ 0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL,
+ 0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL,
+ 0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL,
+ 0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL,
+ 0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL,
+ 0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL,
+ 0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL,
+ 0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL,
+ 0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL,
+ 0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL,
+ 0x8def022dUL
+ },
+ {
+ 0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL,
+ 0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL,
+ 0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL,
+ 0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL,
+ 0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL,
+ 0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL,
+ 0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL,
+ 0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL,
+ 0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL,
+ 0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL,
+ 0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL,
+ 0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL,
+ 0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL,
+ 0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL,
+ 0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL,
+ 0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL,
+ 0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL,
+ 0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL,
+ 0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL,
+ 0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL,
+ 0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL,
+ 0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL,
+ 0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL,
+ 0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL,
+ 0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL,
+ 0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL,
+ 0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL,
+ 0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL,
+ 0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL,
+ 0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL,
+ 0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL,
+ 0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL,
+ 0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL,
+ 0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL,
+ 0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL,
+ 0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL,
+ 0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL,
+ 0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL,
+ 0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL,
+ 0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL,
+ 0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL,
+ 0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL,
+ 0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL,
+ 0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL,
+ 0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL,
+ 0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL,
+ 0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL,
+ 0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL,
+ 0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL,
+ 0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL,
+ 0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL,
+ 0x72fd2493UL
+ },
+ {
+ 0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL,
+ 0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL,
+ 0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL,
+ 0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL,
+ 0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL,
+ 0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL,
+ 0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL,
+ 0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL,
+ 0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL,
+ 0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL,
+ 0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL,
+ 0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL,
+ 0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL,
+ 0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL,
+ 0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL,
+ 0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL,
+ 0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL,
+ 0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL,
+ 0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL,
+ 0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL,
+ 0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL,
+ 0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL,
+ 0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL,
+ 0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL,
+ 0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL,
+ 0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL,
+ 0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL,
+ 0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL,
+ 0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL,
+ 0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL,
+ 0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL,
+ 0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL,
+ 0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL,
+ 0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL,
+ 0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL,
+ 0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL,
+ 0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL,
+ 0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL,
+ 0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL,
+ 0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL,
+ 0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL,
+ 0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL,
+ 0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL,
+ 0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL,
+ 0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL,
+ 0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL,
+ 0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL,
+ 0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL,
+ 0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL,
+ 0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL,
+ 0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL,
+ 0xed3498beUL
+ },
+ {
+ 0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL,
+ 0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL,
+ 0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL,
+ 0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL,
+ 0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL,
+ 0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL,
+ 0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL,
+ 0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL,
+ 0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL,
+ 0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL,
+ 0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL,
+ 0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL,
+ 0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL,
+ 0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL,
+ 0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL,
+ 0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL,
+ 0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL,
+ 0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL,
+ 0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL,
+ 0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL,
+ 0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL,
+ 0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL,
+ 0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL,
+ 0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL,
+ 0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL,
+ 0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL,
+ 0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL,
+ 0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL,
+ 0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL,
+ 0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL,
+ 0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL,
+ 0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL,
+ 0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL,
+ 0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL,
+ 0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL,
+ 0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL,
+ 0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL,
+ 0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL,
+ 0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL,
+ 0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL,
+ 0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL,
+ 0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL,
+ 0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL,
+ 0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL,
+ 0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL,
+ 0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL,
+ 0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL,
+ 0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL,
+ 0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL,
+ 0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL,
+ 0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL,
+ 0xf10605deUL
+#endif
+ }
+};
diff --git a/sys/contrib/opensolaris/uts/common/zmod/deflate.c b/sys/contrib/opensolaris/uts/common/zmod/deflate.c
new file mode 100644
index 000000000000..7847e40ba327
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/deflate.c
@@ -0,0 +1,1742 @@
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* deflate.c -- compress data using the deflation algorithm
+ * Copyright (C) 1995-2005 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ALGORITHM
+ *
+ * The "deflation" process depends on being able to identify portions
+ * of the input text which are identical to earlier input (within a
+ * sliding window trailing behind the input currently being processed).
+ *
+ * The most straightforward technique turns out to be the fastest for
+ * most input files: try all possible matches and select the longest.
+ * The key feature of this algorithm is that insertions into the string
+ * dictionary are very simple and thus fast, and deletions are avoided
+ * completely. Insertions are performed at each input character, whereas
+ * string matches are performed only when the previous match ends. So it
+ * is preferable to spend more time in matches to allow very fast string
+ * insertions and avoid deletions. The matching algorithm for small
+ * strings is inspired by that of Rabin & Karp. A brute force approach
+ * is used to find longer strings when a small match has been found.
+ * A similar algorithm is used in comic (by Jan-Mark Wams) and freeze
+ * (by Leonid Broukhis).
+ * A previous version of this file used a more sophisticated algorithm
+ * (by Fiala and Greene) which is guaranteed to run in linear amortized
+ * time, but has a larger average cost, uses more memory and is patented.
+ * However the F&G algorithm may be faster for some highly redundant
+ * files if the parameter max_chain_length (described below) is too large.
+ *
+ * ACKNOWLEDGEMENTS
+ *
+ * The idea of lazy evaluation of matches is due to Jan-Mark Wams, and
+ * I found it in 'freeze' written by Leonid Broukhis.
+ * Thanks to many people for bug reports and testing.
+ *
+ * REFERENCES
+ *
+ * Deutsch, L.P., "DEFLATE Compressed Data Format Specification".
+ * Available at http://www.ietf.org/rfc/rfc1951.txt
+ *
+ * A description of the Rabin and Karp algorithm is given in the book
+ * "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
+ *
+ * Fiala, E.R., and Greene, D.H.
+ * Data Compression with Finite Windows, Comm. ACM, 32, 4 (1989) 490-505
+ *
+ */
+
+#include "deflate.h"
+
+static const char deflate_copyright[] =
+ " deflate 1.2.3 Copyright 1995-2005 Jean-loup Gailly ";
+/*
+ If you use the zlib library in a product, an acknowledgment is welcome
+ in the documentation of your product. If for some reason you cannot
+ include such an acknowledgment, I would appreciate that you keep this
+ copyright string in the executable of your product.
+ */
+
+/* ===========================================================================
+ * Function prototypes.
+ */
+typedef enum {
+ need_more, /* block not completed, need more input or more output */
+ block_done, /* block flush performed */
+ finish_started, /* finish started, need only more output at next deflate */
+ finish_done /* finish done, accept no more input or output */
+} block_state;
+
+typedef block_state (*compress_func) OF((deflate_state *s, int flush));
+/* Compression function. Returns the block state after the call. */
+
+local void fill_window OF((deflate_state *s));
+local block_state deflate_stored OF((deflate_state *s, int flush));
+local block_state deflate_fast OF((deflate_state *s, int flush));
+#ifndef FASTEST
+local block_state deflate_slow OF((deflate_state *s, int flush));
+#endif
+local void lm_init OF((deflate_state *s));
+local void putShortMSB OF((deflate_state *s, uInt b));
+local void flush_pending OF((z_streamp strm));
+local int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
+#ifndef FASTEST
+#ifdef ASMV
+ void match_init OF((void)); /* asm code initialization */
+ uInt longest_match OF((deflate_state *s, IPos cur_match));
+#else
+local uInt longest_match OF((deflate_state *s, IPos cur_match));
+#endif
+#endif
+local uInt longest_match_fast OF((deflate_state *s, IPos cur_match));
+
+#ifdef DEBUG
+local void check_match OF((deflate_state *s, IPos start, IPos match,
+ int length));
+#endif
+
+/* ===========================================================================
+ * Local data
+ */
+
+#define NIL 0
+/* Tail of hash chains */
+
+#ifndef TOO_FAR
+# define TOO_FAR 4096
+#endif
+/* Matches of length 3 are discarded if their distance exceeds TOO_FAR */
+
+#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
+/* Minimum amount of lookahead, except at the end of the input file.
+ * See deflate.c for comments about the MIN_MATCH+1.
+ */
+
+/* Values for max_lazy_match, good_match and max_chain_length, depending on
+ * the desired pack level (0..9). The values given below have been tuned to
+ * exclude worst case performance for pathological files. Better values may be
+ * found for specific files.
+ */
+typedef struct config_s {
+ ush good_length; /* reduce lazy search above this match length */
+ ush max_lazy; /* do not perform lazy search above this match length */
+ ush nice_length; /* quit search above this match length */
+ ush max_chain;
+ compress_func func;
+} config;
+
+#ifdef FASTEST
+local const config configuration_table[2] = {
+/* good lazy nice chain */
+/* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */
+/* 1 */ {4, 4, 8, 4, deflate_fast}}; /* max speed, no lazy matches */
+#else
+local const config configuration_table[10] = {
+/* good lazy nice chain */
+/* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */
+/* 1 */ {4, 4, 8, 4, deflate_fast}, /* max speed, no lazy matches */
+/* 2 */ {4, 5, 16, 8, deflate_fast},
+/* 3 */ {4, 6, 32, 32, deflate_fast},
+
+/* 4 */ {4, 4, 16, 16, deflate_slow}, /* lazy matches */
+/* 5 */ {8, 16, 32, 32, deflate_slow},
+/* 6 */ {8, 16, 128, 128, deflate_slow},
+/* 7 */ {8, 32, 128, 256, deflate_slow},
+/* 8 */ {32, 128, 258, 1024, deflate_slow},
+/* 9 */ {32, 258, 258, 4096, deflate_slow}}; /* max compression */
+#endif
+
+/* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4
+ * For deflate_fast() (levels <= 3) good is ignored and lazy has a different
+ * meaning.
+ */
+
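+/*
+ * For example, at the default compression level 6 the table above selects
+ * good_length = 8, max_lazy = 16, nice_length = 128, max_chain = 128 and
+ * func = deflate_slow: lazy matching is used, at most 128 hash chain links
+ * are followed per search, the search stops early once a 128-byte match is
+ * found, and longest_match() quarters the chain limit whenever the previous
+ * match was already good_length (8) bytes or longer.
+ */
+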
+#define EQUAL 0
+/* result of memcmp for equal strings */
+
+#ifndef NO_DUMMY_DECL
+struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
+#endif
+
+/* ===========================================================================
+ * Update a hash value with the given input byte
+ * IN assertion: all calls to UPDATE_HASH are made with consecutive
+ * input characters, so that a running hash key can be computed from the
+ * previous key instead of complete recalculation each time.
+ */
+#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
+
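+/*
+ * For example, assuming the usual MIN_MATCH of 3 and the default memLevel
+ * of 8, hash_bits is 15 and hash_shift is (15+MIN_MATCH-1)/MIN_MATCH = 5,
+ * so after hashing the three bytes w[i], w[i+1], w[i+2] the key is
+ *
+ *     h = ((w[i] << 10) ^ (w[i+1] << 5) ^ w[i+2]) & hash_mask
+ *
+ * and the byte inserted three calls earlier, w[i-1], would only contribute
+ * bits at position 15 and above, which hash_mask removes.  That is why the
+ * running key is equivalent to rehashing the last MIN_MATCH bytes from
+ * scratch.
+ */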
+
+/* ===========================================================================
+ * Insert string str in the dictionary and set match_head to the previous head
+ * of the hash chain (the most recent string with same hash key). Return
+ * the previous length of the hash chain.
+ * If this file is compiled with -DFASTEST, the compression level is forced
+ * to 1, and no hash chains are maintained.
+ * IN assertion: all calls to INSERT_STRING are made with consecutive
+ * input characters and the first MIN_MATCH bytes of str are valid
+ * (except for the last MIN_MATCH-1 bytes of the input file).
+ */
+#ifdef FASTEST
+#define INSERT_STRING(s, str, match_head) \
+ (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+ match_head = s->head[s->ins_h], \
+ s->head[s->ins_h] = (Pos)(str))
+#else
+#define INSERT_STRING(s, str, match_head) \
+ (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+ match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
+ s->head[s->ins_h] = (Pos)(str))
+#endif
+
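+/*
+ * In the normal (non-FASTEST) case this links every inserted position onto
+ * its hash bucket: head[ins_h] always holds the most recent position with
+ * that hash, and prev[str & w_mask] records the position that was at the
+ * head just before.  Starting from head[ins_h] and repeatedly following
+ * prev[] therefore visits candidate match positions from newest to oldest,
+ * which is how longest_match() below walks the chain.
+ */
+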
+/* ===========================================================================
+ * Initialize the hash table (avoiding 64K overflow for 16 bit systems).
+ * prev[] will be initialized on the fly.
+ */
+#define CLEAR_HASH(s) \
+ s->head[s->hash_size-1] = NIL; \
+ (void) zmemzero((Bytef *)s->head, \
+ (unsigned)(s->hash_size-1)*sizeof(*s->head));
+
+/* ========================================================================= */
+int ZEXPORT deflateInit_(strm, level, version, stream_size)
+ z_streamp strm;
+ int level;
+ const char *version;
+ int stream_size;
+{
+ return deflateInit2_(strm, level, Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL,
+ Z_DEFAULT_STRATEGY, version, stream_size);
+ /* To do: ignore strm->next_in if we use it as window */
+}
+
+/* ========================================================================= */
+int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
+ version, stream_size)
+ z_streamp strm;
+ int level;
+ int method;
+ int windowBits;
+ int memLevel;
+ int strategy;
+ const char *version;
+ int stream_size;
+{
+ deflate_state *s;
+ int wrap = 1;
+ static const char my_version[] = ZLIB_VERSION;
+
+ ushf *overlay;
+ /* We overlay pending_buf and d_buf+l_buf. This works since the average
+ * output size for (length,distance) codes is <= 24 bits.
+ */
+
+ if (version == Z_NULL || version[0] != my_version[0] ||
+ stream_size != sizeof(z_stream)) {
+ return Z_VERSION_ERROR;
+ }
+ if (strm == Z_NULL) return Z_STREAM_ERROR;
+
+ strm->msg = Z_NULL;
+ if (strm->zalloc == (alloc_func)0) {
+ strm->zalloc = zcalloc;
+ strm->opaque = (voidpf)0;
+ }
+ if (strm->zfree == (free_func)0) strm->zfree = zcfree;
+
+#ifdef FASTEST
+ if (level != 0) level = 1;
+#else
+ if (level == Z_DEFAULT_COMPRESSION) level = 6;
+#endif
+
+ if (windowBits < 0) { /* suppress zlib wrapper */
+ wrap = 0;
+ windowBits = -windowBits;
+ }
+#ifdef GZIP
+ else if (windowBits > 15) {
+ wrap = 2; /* write gzip wrapper instead */
+ windowBits -= 16;
+ }
+#endif
+ if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED ||
+ windowBits < 8 || windowBits > 15 || level < 0 || level > 9 ||
+ strategy < 0 || strategy > Z_FIXED) {
+ return Z_STREAM_ERROR;
+ }
+ if (windowBits == 8) windowBits = 9; /* until 256-byte window bug fixed */
+ s = (deflate_state *) ZALLOC(strm, 1, sizeof(deflate_state));
+ if (s == Z_NULL) return Z_MEM_ERROR;
+ strm->state = (struct internal_state FAR *)s;
+ s->strm = strm;
+
+ s->wrap = wrap;
+ s->gzhead = Z_NULL;
+ s->w_bits = windowBits;
+ s->w_size = 1 << s->w_bits;
+ s->w_mask = s->w_size - 1;
+
+ s->hash_bits = memLevel + 7;
+ s->hash_size = 1 << s->hash_bits;
+ s->hash_mask = s->hash_size - 1;
+ s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
+
+ s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte));
+ s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos));
+ s->head = (Posf *) ZALLOC(strm, s->hash_size, sizeof(Pos));
+
+ s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */
+
+ overlay = (ushf *) ZALLOC(strm, s->lit_bufsize, sizeof(ush)+2);
+ s->pending_buf = (uchf *) overlay;
+ s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L);
+
+ if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL ||
+ s->pending_buf == Z_NULL) {
+ s->status = FINISH_STATE;
+ strm->msg = (char*)ERR_MSG(Z_MEM_ERROR);
+ (void) deflateEnd (strm);
+ return Z_MEM_ERROR;
+ }
+ s->d_buf = overlay + s->lit_bufsize/sizeof(ush);
+ s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize;
+
+ s->level = level;
+ s->strategy = strategy;
+ s->method = (Byte)method;
+
+ return deflateReset(strm);
+}
+
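+/*
+ * A minimal calling sketch (illustrative only; error handling is left to
+ * the caller):
+ *
+ *     z_stream strm;
+ *
+ *     strm.zalloc = (alloc_func)0;    (defaults to zcalloc, see above)
+ *     strm.zfree = (free_func)0;
+ *     strm.opaque = (voidpf)0;
+ *     if (deflateInit(&strm, Z_DEFAULT_COMPRESSION) != Z_OK)
+ *         handle the error;
+ *
+ * Callers that need a raw deflate stream pass negative windowBits to
+ * deflateInit2(), and callers that want a gzip wrapper pass windowBits
+ * larger than 15, as handled above.
+ */
+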
+/* ========================================================================= */
+int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
+ z_streamp strm;
+ const Bytef *dictionary;
+ uInt dictLength;
+{
+ deflate_state *s;
+ uInt length = dictLength;
+ uInt n;
+ IPos hash_head = 0;
+
+ if (strm == Z_NULL || strm->state == Z_NULL || dictionary == Z_NULL ||
+ strm->state->wrap == 2 ||
+ (strm->state->wrap == 1 && strm->state->status != INIT_STATE))
+ return Z_STREAM_ERROR;
+
+ s = strm->state;
+ if (s->wrap)
+ strm->adler = adler32(strm->adler, dictionary, dictLength);
+
+ if (length < MIN_MATCH) return Z_OK;
+ if (length > MAX_DIST(s)) {
+ length = MAX_DIST(s);
+ dictionary += dictLength - length; /* use the tail of the dictionary */
+ }
+ (void) zmemcpy(s->window, dictionary, length);
+ s->strstart = length;
+ s->block_start = (long)length;
+
+ /* Insert all strings in the hash table (except for the last two bytes).
+ * s->lookahead stays null, so s->ins_h will be recomputed at the next
+ * call of fill_window.
+ */
+ s->ins_h = s->window[0];
+ UPDATE_HASH(s, s->ins_h, s->window[1]);
+ for (n = 0; n <= length - MIN_MATCH; n++) {
+ INSERT_STRING(s, n, hash_head);
+ }
+ if (hash_head) hash_head = 0; /* to make compiler happy */
+ return Z_OK;
+}
+
+/* ========================================================================= */
+int ZEXPORT deflateReset (strm)
+ z_streamp strm;
+{
+ deflate_state *s;
+
+ if (strm == Z_NULL || strm->state == Z_NULL ||
+ strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0) {
+ return Z_STREAM_ERROR;
+ }
+
+ strm->total_in = strm->total_out = 0;
+ strm->msg = Z_NULL; /* use zfree if we ever allocate msg dynamically */
+ strm->data_type = Z_UNKNOWN;
+
+ s = (deflate_state *)strm->state;
+ s->pending = 0;
+ s->pending_out = s->pending_buf;
+
+ if (s->wrap < 0) {
+ s->wrap = -s->wrap; /* was made negative by deflate(..., Z_FINISH); */
+ }
+ s->status = s->wrap ? INIT_STATE : BUSY_STATE;
+ strm->adler =
+#ifdef GZIP
+ s->wrap == 2 ? crc32(0L, Z_NULL, 0) :
+#endif
+ adler32(0L, Z_NULL, 0);
+ s->last_flush = Z_NO_FLUSH;
+
+ _tr_init(s);
+ lm_init(s);
+
+ return Z_OK;
+}
+
+/* ========================================================================= */
+int ZEXPORT deflateSetHeader (strm, head)
+ z_streamp strm;
+ gz_headerp head;
+{
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ if (strm->state->wrap != 2) return Z_STREAM_ERROR;
+ strm->state->gzhead = head;
+ return Z_OK;
+}
+
+/* ========================================================================= */
+int ZEXPORT deflatePrime (strm, bits, value)
+ z_streamp strm;
+ int bits;
+ int value;
+{
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ strm->state->bi_valid = bits;
+ strm->state->bi_buf = (ush)(value & ((1 << bits) - 1));
+ return Z_OK;
+}
+
+/* ========================================================================= */
+int ZEXPORT deflateParams(strm, level, strategy)
+ z_streamp strm;
+ int level;
+ int strategy;
+{
+ deflate_state *s;
+ compress_func func;
+ int err = Z_OK;
+
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ s = strm->state;
+
+#ifdef FASTEST
+ if (level != 0) level = 1;
+#else
+ if (level == Z_DEFAULT_COMPRESSION) level = 6;
+#endif
+ if (level < 0 || level > 9 || strategy < 0 || strategy > Z_FIXED) {
+ return Z_STREAM_ERROR;
+ }
+ func = configuration_table[s->level].func;
+
+ if (func != configuration_table[level].func && strm->total_in != 0) {
+ /* Flush the last buffer: */
+ err = deflate(strm, Z_PARTIAL_FLUSH);
+ }
+ if (s->level != level) {
+ s->level = level;
+ s->max_lazy_match = configuration_table[level].max_lazy;
+ s->good_match = configuration_table[level].good_length;
+ s->nice_match = configuration_table[level].nice_length;
+ s->max_chain_length = configuration_table[level].max_chain;
+ }
+ s->strategy = strategy;
+ return err;
+}
+
+/* ========================================================================= */
+int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain)
+ z_streamp strm;
+ int good_length;
+ int max_lazy;
+ int nice_length;
+ int max_chain;
+{
+ deflate_state *s;
+
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ s = strm->state;
+ s->good_match = good_length;
+ s->max_lazy_match = max_lazy;
+ s->nice_match = nice_length;
+ s->max_chain_length = max_chain;
+ return Z_OK;
+}
+
+/* =========================================================================
+ * For the default windowBits of 15 and memLevel of 8, this function returns
+ * a close to exact, as well as small, upper bound on the compressed size.
+ * They are coded as constants here for a reason--if the #define's are
+ * changed, then this function needs to be changed as well. The return
+ * value for 15 and 8 only works for those exact settings.
+ *
+ * For any setting other than those defaults for windowBits and memLevel,
+ * the value returned is a conservative worst case for the maximum expansion
+ * resulting from using fixed blocks instead of stored blocks, which deflate
+ * can emit on compressed data for some combinations of the parameters.
+ *
+ * This function could be more sophisticated to provide closer upper bounds
+ * for every combination of windowBits and memLevel, as well as wrap.
+ * But even the conservative upper bound of about 14% expansion does not
+ * seem onerous for output buffer allocation.
+ */
+uLong ZEXPORT deflateBound(strm, sourceLen)
+ z_streamp strm;
+ uLong sourceLen;
+{
+ deflate_state *s;
+ uLong destLen;
+
+ /* conservative upper bound */
+ destLen = sourceLen +
+ ((sourceLen + 7) >> 3) + ((sourceLen + 63) >> 6) + 11;
+
+ /* if can't get parameters, return conservative bound */
+ if (strm == Z_NULL || strm->state == Z_NULL)
+ return destLen;
+
+ /* if not default parameters, return conservative bound */
+ s = strm->state;
+ if (s->w_bits != 15 || s->hash_bits != 8 + 7)
+ return destLen;
+
+ /* default settings: return tight bound for that case */
+ return compressBound(sourceLen);
+}
+
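+/*
+ * For example, for sourceLen = 65536 the conservative bound computed above
+ * is
+ *
+ *     65536 + (65543 >> 3) + (65599 >> 6) + 11
+ *         = 65536 + 8192 + 1024 + 11 = 74763,
+ *
+ * roughly 14% larger than the input, matching the expansion figure quoted
+ * in the comment above.
+ */
+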
+/* =========================================================================
+ * Put a short in the pending buffer. The 16-bit value is put in MSB order.
+ * IN assertion: the stream state is correct and there is enough room in
+ * pending_buf.
+ */
+local void putShortMSB (s, b)
+ deflate_state *s;
+ uInt b;
+{
+ put_byte(s, (Byte)(b >> 8));
+ put_byte(s, (Byte)(b & 0xff));
+}
+
+/* =========================================================================
+ * Flush as much pending output as possible. All deflate() output goes
+ * through this function so some applications may wish to modify it
+ * to avoid allocating a large strm->next_out buffer and copying into it.
+ * (See also read_buf()).
+ */
+local void flush_pending(strm)
+ z_streamp strm;
+{
+ unsigned len = strm->state->pending;
+
+ if (len > strm->avail_out) len = strm->avail_out;
+ if (len == 0) return;
+
+ zmemcpy(strm->next_out, strm->state->pending_out, len);
+ strm->next_out += len;
+ strm->state->pending_out += len;
+ strm->total_out += len;
+ strm->avail_out -= len;
+ strm->state->pending -= len;
+ if (strm->state->pending == 0) {
+ strm->state->pending_out = strm->state->pending_buf;
+ }
+}
+
+/* ========================================================================= */
+int ZEXPORT deflate (strm, flush)
+ z_streamp strm;
+ int flush;
+{
+ int old_flush; /* value of flush param for previous deflate call */
+ deflate_state *s;
+
+ if (strm == Z_NULL || strm->state == Z_NULL ||
+ flush > Z_FINISH || flush < 0) {
+ return Z_STREAM_ERROR;
+ }
+ s = strm->state;
+
+ if (strm->next_out == Z_NULL ||
+ (strm->next_in == Z_NULL && strm->avail_in != 0) ||
+ (s->status == FINISH_STATE && flush != Z_FINISH)) {
+ ERR_RETURN(strm, Z_STREAM_ERROR);
+ }
+ if (strm->avail_out == 0) ERR_RETURN(strm, Z_BUF_ERROR);
+
+ s->strm = strm; /* just in case */
+ old_flush = s->last_flush;
+ s->last_flush = flush;
+
+ /* Write the header */
+ if (s->status == INIT_STATE) {
+#ifdef GZIP
+ if (s->wrap == 2) {
+ strm->adler = crc32(0L, Z_NULL, 0);
+ put_byte(s, 31);
+ put_byte(s, 139);
+ put_byte(s, 8);
+ if (s->gzhead == NULL) {
+ put_byte(s, 0);
+ put_byte(s, 0);
+ put_byte(s, 0);
+ put_byte(s, 0);
+ put_byte(s, 0);
+ put_byte(s, s->level == 9 ? 2 :
+ (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ?
+ 4 : 0));
+ put_byte(s, OS_CODE);
+ s->status = BUSY_STATE;
+ }
+ else {
+ put_byte(s, (s->gzhead->text ? 1 : 0) +
+ (s->gzhead->hcrc ? 2 : 0) +
+ (s->gzhead->extra == Z_NULL ? 0 : 4) +
+ (s->gzhead->name == Z_NULL ? 0 : 8) +
+ (s->gzhead->comment == Z_NULL ? 0 : 16)
+ );
+ put_byte(s, (Byte)(s->gzhead->time & 0xff));
+ put_byte(s, (Byte)((s->gzhead->time >> 8) & 0xff));
+ put_byte(s, (Byte)((s->gzhead->time >> 16) & 0xff));
+ put_byte(s, (Byte)((s->gzhead->time >> 24) & 0xff));
+ put_byte(s, s->level == 9 ? 2 :
+ (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ?
+ 4 : 0));
+ put_byte(s, s->gzhead->os & 0xff);
+ if (s->gzhead->extra != NULL) {
+ put_byte(s, s->gzhead->extra_len & 0xff);
+ put_byte(s, (s->gzhead->extra_len >> 8) & 0xff);
+ }
+ if (s->gzhead->hcrc)
+ strm->adler = crc32(strm->adler, s->pending_buf,
+ s->pending);
+ s->gzindex = 0;
+ s->status = EXTRA_STATE;
+ }
+ }
+ else
+#endif
+ {
+ uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8;
+ uInt level_flags;
+
+ if (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2)
+ level_flags = 0;
+ else if (s->level < 6)
+ level_flags = 1;
+ else if (s->level == 6)
+ level_flags = 2;
+ else
+ level_flags = 3;
+ header |= (level_flags << 6);
+ if (s->strstart != 0) header |= PRESET_DICT;
+ header += 31 - (header % 31);
+
+ s->status = BUSY_STATE;
+ putShortMSB(s, header);
+
+ /* Save the adler32 of the preset dictionary: */
+ if (s->strstart != 0) {
+ putShortMSB(s, (uInt)(strm->adler >> 16));
+ putShortMSB(s, (uInt)(strm->adler & 0xffff));
+ }
+ strm->adler = adler32(0L, Z_NULL, 0);
+ }
+ }
+#ifdef GZIP
+ if (s->status == EXTRA_STATE) {
+ if (s->gzhead->extra != NULL) {
+ uInt beg = s->pending; /* start of bytes to update crc */
+
+ while (s->gzindex < (s->gzhead->extra_len & 0xffff)) {
+ if (s->pending == s->pending_buf_size) {
+ if (s->gzhead->hcrc && s->pending > beg)
+ strm->adler = crc32(strm->adler, s->pending_buf + beg,
+ s->pending - beg);
+ flush_pending(strm);
+ beg = s->pending;
+ if (s->pending == s->pending_buf_size)
+ break;
+ }
+ put_byte(s, s->gzhead->extra[s->gzindex]);
+ s->gzindex++;
+ }
+ if (s->gzhead->hcrc && s->pending > beg)
+ strm->adler = crc32(strm->adler, s->pending_buf + beg,
+ s->pending - beg);
+ if (s->gzindex == s->gzhead->extra_len) {
+ s->gzindex = 0;
+ s->status = NAME_STATE;
+ }
+ }
+ else
+ s->status = NAME_STATE;
+ }
+ if (s->status == NAME_STATE) {
+ if (s->gzhead->name != NULL) {
+ uInt beg = s->pending; /* start of bytes to update crc */
+ int val;
+
+ do {
+ if (s->pending == s->pending_buf_size) {
+ if (s->gzhead->hcrc && s->pending > beg)
+ strm->adler = crc32(strm->adler, s->pending_buf + beg,
+ s->pending - beg);
+ flush_pending(strm);
+ beg = s->pending;
+ if (s->pending == s->pending_buf_size) {
+ val = 1;
+ break;
+ }
+ }
+ val = s->gzhead->name[s->gzindex++];
+ put_byte(s, val);
+ } while (val != 0);
+ if (s->gzhead->hcrc && s->pending > beg)
+ strm->adler = crc32(strm->adler, s->pending_buf + beg,
+ s->pending - beg);
+ if (val == 0) {
+ s->gzindex = 0;
+ s->status = COMMENT_STATE;
+ }
+ }
+ else
+ s->status = COMMENT_STATE;
+ }
+ if (s->status == COMMENT_STATE) {
+ if (s->gzhead->comment != NULL) {
+ uInt beg = s->pending; /* start of bytes to update crc */
+ int val;
+
+ do {
+ if (s->pending == s->pending_buf_size) {
+ if (s->gzhead->hcrc && s->pending > beg)
+ strm->adler = crc32(strm->adler, s->pending_buf + beg,
+ s->pending - beg);
+ flush_pending(strm);
+ beg = s->pending;
+ if (s->pending == s->pending_buf_size) {
+ val = 1;
+ break;
+ }
+ }
+ val = s->gzhead->comment[s->gzindex++];
+ put_byte(s, val);
+ } while (val != 0);
+ if (s->gzhead->hcrc && s->pending > beg)
+ strm->adler = crc32(strm->adler, s->pending_buf + beg,
+ s->pending - beg);
+ if (val == 0)
+ s->status = HCRC_STATE;
+ }
+ else
+ s->status = HCRC_STATE;
+ }
+ if (s->status == HCRC_STATE) {
+ if (s->gzhead->hcrc) {
+ if (s->pending + 2 > s->pending_buf_size)
+ flush_pending(strm);
+ if (s->pending + 2 <= s->pending_buf_size) {
+ put_byte(s, (Byte)(strm->adler & 0xff));
+ put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
+ strm->adler = crc32(0L, Z_NULL, 0);
+ s->status = BUSY_STATE;
+ }
+ }
+ else
+ s->status = BUSY_STATE;
+ }
+#endif
+
+ /* Flush as much pending output as possible */
+ if (s->pending != 0) {
+ flush_pending(strm);
+ if (strm->avail_out == 0) {
+ /* Since avail_out is 0, deflate will be called again with
+ * more output space, but possibly with both pending and
+ * avail_in equal to zero. There won't be anything to do,
+ * but this is not an error situation so make sure we
+ * return OK instead of BUF_ERROR at next call of deflate:
+ */
+ s->last_flush = -1;
+ return Z_OK;
+ }
+
+ /* Make sure there is something to do and avoid duplicate consecutive
+ * flushes. For repeated and useless calls with Z_FINISH, we keep
+ * returning Z_STREAM_END instead of Z_BUF_ERROR.
+ */
+ } else if (strm->avail_in == 0 && flush <= old_flush &&
+ flush != Z_FINISH) {
+ ERR_RETURN(strm, Z_BUF_ERROR);
+ }
+
+ /* User must not provide more input after the first FINISH: */
+ if (s->status == FINISH_STATE && strm->avail_in != 0) {
+ ERR_RETURN(strm, Z_BUF_ERROR);
+ }
+
+ /* Start a new block or continue the current one.
+ */
+ if (strm->avail_in != 0 || s->lookahead != 0 ||
+ (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) {
+ block_state bstate;
+
+ bstate = (*(configuration_table[s->level].func))(s, flush);
+
+ if (bstate == finish_started || bstate == finish_done) {
+ s->status = FINISH_STATE;
+ }
+ if (bstate == need_more || bstate == finish_started) {
+ if (strm->avail_out == 0) {
+ s->last_flush = -1; /* avoid BUF_ERROR next call, see above */
+ }
+ return Z_OK;
+ /* If flush != Z_NO_FLUSH && avail_out == 0, the next call
+ * of deflate should use the same flush parameter to make sure
+ * that the flush is complete. So we don't have to output an
+ * empty block here, this will be done at next call. This also
+ * ensures that for a very small output buffer, we emit at most
+ * one empty block.
+ */
+ }
+ if (bstate == block_done) {
+ if (flush == Z_PARTIAL_FLUSH) {
+ _tr_align(s);
+ } else { /* FULL_FLUSH or SYNC_FLUSH */
+ _tr_stored_block(s, (char*)0, 0L, 0);
+ /* For a full flush, this empty block will be recognized
+ * as a special marker by inflate_sync().
+ */
+ if (flush == Z_FULL_FLUSH) {
+ CLEAR_HASH(s); /* forget history */
+ }
+ }
+ flush_pending(strm);
+ if (strm->avail_out == 0) {
+ s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */
+ return Z_OK;
+ }
+ }
+ }
+ Assert(strm->avail_out > 0, "bug2");
+
+ if (flush != Z_FINISH) return Z_OK;
+ if (s->wrap <= 0) return Z_STREAM_END;
+
+ /* Write the trailer */
+#ifdef GZIP
+ if (s->wrap == 2) {
+ put_byte(s, (Byte)(strm->adler & 0xff));
+ put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
+ put_byte(s, (Byte)((strm->adler >> 16) & 0xff));
+ put_byte(s, (Byte)((strm->adler >> 24) & 0xff));
+ put_byte(s, (Byte)(strm->total_in & 0xff));
+ put_byte(s, (Byte)((strm->total_in >> 8) & 0xff));
+ put_byte(s, (Byte)((strm->total_in >> 16) & 0xff));
+ put_byte(s, (Byte)((strm->total_in >> 24) & 0xff));
+ }
+ else
+#endif
+ {
+ putShortMSB(s, (uInt)(strm->adler >> 16));
+ putShortMSB(s, (uInt)(strm->adler & 0xffff));
+ }
+ flush_pending(strm);
+ /* If avail_out is zero, the application will call deflate again
+ * to flush the rest.
+ */
+ if (s->wrap > 0) s->wrap = -s->wrap; /* write the trailer only once! */
+ return s->pending != 0 ? Z_OK : Z_STREAM_END;
+}
+
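+/*
+ * A minimal calling sketch for one-shot compression (illustrative only;
+ * the source/out buffers and write_out() are placeholders):
+ *
+ *     strm.next_in = (Bytef *)source;
+ *     strm.avail_in = source_len;
+ *     do {
+ *         strm.next_out = out;
+ *         strm.avail_out = sizeof(out);
+ *         ret = deflate(&strm, Z_FINISH);
+ *         write_out(out, sizeof(out) - strm.avail_out);
+ *     } while (ret == Z_OK);
+ *
+ * On success the loop ends with ret == Z_STREAM_END; as noted above, no
+ * further input may be supplied after the first Z_FINISH until the stream
+ * is reset or ended.
+ */
+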
+/* ========================================================================= */
+int ZEXPORT deflateEnd (strm)
+ z_streamp strm;
+{
+ int status;
+
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+
+ status = strm->state->status;
+ if (status != INIT_STATE &&
+ status != EXTRA_STATE &&
+ status != NAME_STATE &&
+ status != COMMENT_STATE &&
+ status != HCRC_STATE &&
+ status != BUSY_STATE &&
+ status != FINISH_STATE) {
+ return Z_STREAM_ERROR;
+ }
+
+ /* Deallocate in reverse order of allocations: */
+ TRY_FREE(strm, strm->state->pending_buf);
+ TRY_FREE(strm, strm->state->head);
+ TRY_FREE(strm, strm->state->prev);
+ TRY_FREE(strm, strm->state->window);
+
+ ZFREE(strm, strm->state);
+ strm->state = Z_NULL;
+
+ return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK;
+}
+
+/* =========================================================================
+ * Copy the source state to the destination state.
+ * To simplify the source, this is not supported for 16-bit MSDOS (which
+ * doesn't have enough memory anyway to duplicate compression states).
+ */
+int ZEXPORT deflateCopy (dest, source)
+ z_streamp dest;
+ z_streamp source;
+{
+#ifdef MAXSEG_64K
+ return Z_STREAM_ERROR;
+#else
+ deflate_state *ds;
+ deflate_state *ss;
+ ushf *overlay;
+
+
+ if (source == Z_NULL || dest == Z_NULL || source->state == Z_NULL) {
+ return Z_STREAM_ERROR;
+ }
+
+ ss = source->state;
+
+ zmemcpy(dest, source, sizeof(z_stream));
+
+ ds = (deflate_state *) ZALLOC(dest, 1, sizeof(deflate_state));
+ if (ds == Z_NULL) return Z_MEM_ERROR;
+ dest->state = (struct internal_state FAR *) ds;
+ zmemcpy(ds, ss, sizeof(deflate_state));
+ ds->strm = dest;
+
+ ds->window = (Bytef *) ZALLOC(dest, ds->w_size, 2*sizeof(Byte));
+ ds->prev = (Posf *) ZALLOC(dest, ds->w_size, sizeof(Pos));
+ ds->head = (Posf *) ZALLOC(dest, ds->hash_size, sizeof(Pos));
+ overlay = (ushf *) ZALLOC(dest, ds->lit_bufsize, sizeof(ush)+2);
+ ds->pending_buf = (uchf *) overlay;
+
+ if (ds->window == Z_NULL || ds->prev == Z_NULL || ds->head == Z_NULL ||
+ ds->pending_buf == Z_NULL) {
+ deflateEnd (dest);
+ return Z_MEM_ERROR;
+ }
+ /* The following zmemcpy calls do not work for 16-bit MSDOS: */
+ zmemcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte));
+ zmemcpy(ds->prev, ss->prev, ds->w_size * sizeof(Pos));
+ zmemcpy(ds->head, ss->head, ds->hash_size * sizeof(Pos));
+ zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size);
+
+ ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf);
+ ds->d_buf = overlay + ds->lit_bufsize/sizeof(ush);
+ ds->l_buf = ds->pending_buf + (1+sizeof(ush))*ds->lit_bufsize;
+
+ ds->l_desc.dyn_tree = ds->dyn_ltree;
+ ds->d_desc.dyn_tree = ds->dyn_dtree;
+ ds->bl_desc.dyn_tree = ds->bl_tree;
+
+ return Z_OK;
+#endif /* MAXSEG_64K */
+}
+
+/* ===========================================================================
+ * Read a new buffer from the current input stream, update the adler32
+ * and total number of bytes read. All deflate() input goes through
+ * this function so some applications may wish to modify it to avoid
+ * allocating a large strm->next_in buffer and copying from it.
+ * (See also flush_pending()).
+ */
+local int read_buf(strm, buf, size)
+ z_streamp strm;
+ Bytef *buf;
+ unsigned size;
+{
+ unsigned len = strm->avail_in;
+
+ if (len > size) len = size;
+ if (len == 0) return 0;
+
+ strm->avail_in -= len;
+
+ if (strm->state->wrap == 1) {
+ strm->adler = adler32(strm->adler, strm->next_in, len);
+ }
+#ifdef GZIP
+ else if (strm->state->wrap == 2) {
+ strm->adler = crc32(strm->adler, strm->next_in, len);
+ }
+#endif
+ zmemcpy(buf, strm->next_in, len);
+ strm->next_in += len;
+ strm->total_in += len;
+
+ return (int)len;
+}
+
+/* ===========================================================================
+ * Initialize the "longest match" routines for a new zlib stream
+ */
+local void lm_init (s)
+ deflate_state *s;
+{
+ s->window_size = (ulg)2L*s->w_size;
+
+ CLEAR_HASH(s);
+
+ /* Set the default configuration parameters:
+ */
+ s->max_lazy_match = configuration_table[s->level].max_lazy;
+ s->good_match = configuration_table[s->level].good_length;
+ s->nice_match = configuration_table[s->level].nice_length;
+ s->max_chain_length = configuration_table[s->level].max_chain;
+
+ s->strstart = 0;
+ s->block_start = 0L;
+ s->lookahead = 0;
+ s->match_length = s->prev_length = MIN_MATCH-1;
+ s->match_available = 0;
+ s->ins_h = 0;
+#ifndef FASTEST
+#ifdef ASMV
+ match_init(); /* initialize the asm code */
+#endif
+#endif
+}
+
+#ifndef FASTEST
+/* ===========================================================================
+ * Set match_start to the longest match starting at the given string and
+ * return its length. Matches shorter than or equal to prev_length are discarded,
+ * in which case the result is equal to prev_length and match_start is
+ * garbage.
+ * IN assertions: cur_match is the head of the hash chain for the current
+ * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
+ * OUT assertion: the match length is not greater than s->lookahead.
+ */
+#ifndef ASMV
+/* For 80x86 and 680x0, an optimized version will be provided in match.asm or
+ * match.S. The code will be functionally equivalent.
+ */
+local uInt longest_match(s, cur_match)
+ deflate_state *s;
+ IPos cur_match; /* current match */
+{
+ unsigned chain_length = s->max_chain_length;/* max hash chain length */
+ register Bytef *scan = s->window + s->strstart; /* current string */
+ register Bytef *match; /* matched string */
+ register int len; /* length of current match */
+ int best_len = s->prev_length; /* best match length so far */
+ int nice_match = s->nice_match; /* stop if match long enough */
+ IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
+ s->strstart - (IPos)MAX_DIST(s) : NIL;
+ /* Stop when cur_match becomes <= limit. To simplify the code,
+ * we prevent matches with the string of window index 0.
+ */
+ Posf *prev = s->prev;
+ uInt wmask = s->w_mask;
+
+#ifdef UNALIGNED_OK
+ /* Compare two bytes at a time. Note: this is not always beneficial.
+ * Try with and without -DUNALIGNED_OK to check.
+ */
+ register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
+ register ush scan_start = *(ushf*)scan;
+ register ush scan_end = *(ushf*)(scan+best_len-1);
+#else
+ register Bytef *strend = s->window + s->strstart + MAX_MATCH;
+ register Byte scan_end1 = scan[best_len-1];
+ register Byte scan_end = scan[best_len];
+#endif
+
+ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
+ * It is easy to get rid of this optimization if necessary.
+ */
+ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+ /* Do not waste too much time if we already have a good match: */
+ if (s->prev_length >= s->good_match) {
+ chain_length >>= 2;
+ }
+ /* Do not look for matches beyond the end of the input. This is necessary
+ * to make deflate deterministic.
+ */
+ if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
+
+ Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
+
+ do {
+ Assert(cur_match < s->strstart, "no future");
+ match = s->window + cur_match;
+
+ /* Skip to next match if the match length cannot increase
+ * or if the match length is less than 2. Note that the checks below
+ * for insufficient lookahead only occur occasionally for performance
+ * reasons. Therefore uninitialized memory will be accessed, and
+ * conditional jumps will be made that depend on those values.
+ * However the length of the match is limited to the lookahead, so
+ * the output of deflate is not affected by the uninitialized values.
+ */
+#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
+ /* This code assumes sizeof(unsigned short) == 2. Do not use
+ * UNALIGNED_OK if your compiler uses a different size.
+ */
+ if (*(ushf*)(match+best_len-1) != scan_end ||
+ *(ushf*)match != scan_start) continue;
+
+ /* It is not necessary to compare scan[2] and match[2] since they are
+ * always equal when the other bytes match, given that the hash keys
+ * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
+ * strstart+3, +5, ... up to strstart+257. We check for insufficient
+ * lookahead only every 4th comparison; the 128th check will be made
+ * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
+ * necessary to put more guard bytes at the end of the window, or
+ * to check more often for insufficient lookahead.
+ */
+ Assert(scan[2] == match[2], "scan[2]?");
+ scan++, match++;
+ do {
+ } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+ *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+ *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+ *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+ scan < strend);
+ /* The funny "do {}" generates better code on most compilers */
+
+ /* Here, scan <= window+strstart+257 */
+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+ if (*scan == *match) scan++;
+
+ len = (MAX_MATCH - 1) - (int)(strend-scan);
+ scan = strend - (MAX_MATCH-1);
+
+#else /* UNALIGNED_OK */
+
+ if (match[best_len] != scan_end ||
+ match[best_len-1] != scan_end1 ||
+ *match != *scan ||
+ *++match != scan[1]) continue;
+
+ /* The check at best_len-1 can be removed because it will be made
+ * again later. (This heuristic is not always a win.)
+ * It is not necessary to compare scan[2] and match[2] since they
+ * are always equal when the other bytes match, given that
+ * the hash keys are equal and that HASH_BITS >= 8.
+ */
+ scan += 2, match++;
+ Assert(*scan == *match, "match[2]?");
+
+ /* We check for insufficient lookahead only every 8th comparison;
+ * the 256th check will be made at strstart+258.
+ */
+ do {
+ } while (*++scan == *++match && *++scan == *++match &&
+ *++scan == *++match && *++scan == *++match &&
+ *++scan == *++match && *++scan == *++match &&
+ *++scan == *++match && *++scan == *++match &&
+ scan < strend);
+
+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+ len = MAX_MATCH - (int)(strend - scan);
+ scan = strend - MAX_MATCH;
+
+#endif /* UNALIGNED_OK */
+
+ if (len > best_len) {
+ s->match_start = cur_match;
+ best_len = len;
+ if (len >= nice_match) break;
+#ifdef UNALIGNED_OK
+ scan_end = *(ushf*)(scan+best_len-1);
+#else
+ scan_end1 = scan[best_len-1];
+ scan_end = scan[best_len];
+#endif
+ }
+ } while ((cur_match = prev[cur_match & wmask]) > limit
+ && --chain_length != 0);
+
+ if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
+ return s->lookahead;
+}
+#endif /* ASMV */
+#endif /* FASTEST */
+
+/* ---------------------------------------------------------------------------
+ * Optimized version for level == 1 or strategy == Z_RLE only
+ */
+local uInt longest_match_fast(s, cur_match)
+ deflate_state *s;
+ IPos cur_match; /* current match */
+{
+ register Bytef *scan = s->window + s->strstart; /* current string */
+ register Bytef *match; /* matched string */
+ register int len; /* length of current match */
+ register Bytef *strend = s->window + s->strstart + MAX_MATCH;
+
+ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
+ * It is easy to get rid of this optimization if necessary.
+ */
+ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+ Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
+
+ Assert(cur_match < s->strstart, "no future");
+
+ match = s->window + cur_match;
+
+ /* Return failure if the match length is less than 2:
+ */
+ if (match[0] != scan[0] || match[1] != scan[1]) return MIN_MATCH-1;
+
+ /* The check at best_len-1 can be removed because it will be made
+ * again later. (This heuristic is not always a win.)
+ * It is not necessary to compare scan[2] and match[2] since they
+ * are always equal when the other bytes match, given that
+ * the hash keys are equal and that HASH_BITS >= 8.
+ */
+ scan += 2, match += 2;
+ Assert(*scan == *match, "match[2]?");
+
+ /* We check for insufficient lookahead only every 8th comparison;
+ * the 256th check will be made at strstart+258.
+ */
+ do {
+ } while (*++scan == *++match && *++scan == *++match &&
+ *++scan == *++match && *++scan == *++match &&
+ *++scan == *++match && *++scan == *++match &&
+ *++scan == *++match && *++scan == *++match &&
+ scan < strend);
+
+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+ len = MAX_MATCH - (int)(strend - scan);
+
+ if (len < MIN_MATCH) return MIN_MATCH - 1;
+
+ s->match_start = cur_match;
+ return (uInt)len <= s->lookahead ? (uInt)len : s->lookahead;
+}
+
+#ifdef DEBUG
+/* ===========================================================================
+ * Check that the match at match_start is indeed a match.
+ */
+local void check_match(s, start, match, length)
+ deflate_state *s;
+ IPos start, match;
+ int length;
+{
+ /* check that the match is indeed a match */
+ if (zmemcmp(s->window + match,
+ s->window + start, length) != EQUAL) {
+ fprintf(stderr, " start %u, match %u, length %d\n",
+ start, match, length);
+ do {
+ fprintf(stderr, "%c%c", s->window[match++], s->window[start++]);
+ } while (--length != 0);
+ z_error("invalid match");
+ }
+ if (z_verbose > 1) {
+ fprintf(stderr,"\\[%d,%d]", start-match, length);
+ do { putc(s->window[start++], stderr); } while (--length != 0);
+ }
+}
+#else
+# define check_match(s, start, match, length)
+#endif /* DEBUG */
+
+/* ===========================================================================
+ * Fill the window when the lookahead becomes insufficient.
+ * Updates strstart and lookahead.
+ *
+ * IN assertion: lookahead < MIN_LOOKAHEAD
+ * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
+ * At least one byte has been read, or avail_in == 0; reads are
+ * performed for at least two bytes (required for the zip translate_eol
+ * option -- not supported here).
+ */
+local void fill_window(s)
+ deflate_state *s;
+{
+ register unsigned n, m;
+ register Posf *p;
+ unsigned more; /* Amount of free space at the end of the window. */
+ uInt wsize = s->w_size;
+
+ do {
+ more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
+
+ /* Deal with !@#$% 64K limit: */
+ if (sizeof(int) <= 2) {
+ if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
+ more = wsize;
+
+ } else if (more == (unsigned)(-1)) {
+ /* Very unlikely, but possible on 16 bit machine if
+ * strstart == 0 && lookahead == 1 (input done a byte at time)
+ */
+ more--;
+ }
+ }
+
+ /* If the window is almost full and there is insufficient lookahead,
+ * move the upper half to the lower one to make room in the upper half.
+ */
+ if (s->strstart >= wsize+MAX_DIST(s)) {
+
+ zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
+ s->match_start -= wsize;
+ s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
+ s->block_start -= (long) wsize;
+
+ /* Slide the hash table (could be avoided with 32 bit values
+ at the expense of memory usage). We slide even when level == 0
+ to keep the hash table consistent if we switch back to level > 0
+ later. (Using level 0 permanently is not an optimal usage of
+ zlib, so we don't care about this pathological case.)
+ */
+ /* %%% avoid this when Z_RLE */
+ n = s->hash_size;
+ p = &s->head[n];
+ do {
+ m = *--p;
+ *p = (Pos)(m >= wsize ? m-wsize : NIL);
+ } while (--n);
+
+ n = wsize;
+#ifndef FASTEST
+ p = &s->prev[n];
+ do {
+ m = *--p;
+ *p = (Pos)(m >= wsize ? m-wsize : NIL);
+ /* If n is not on any hash chain, prev[n] is garbage but
+ * its value will never be used.
+ */
+ } while (--n);
+#endif
+ more += wsize;
+ }
+ if (s->strm->avail_in == 0) return;
+
+ /* If there was no sliding:
+ * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
+ * more == window_size - lookahead - strstart
+ * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
+ * => more >= window_size - 2*WSIZE + 2
+ * In the BIG_MEM or MMAP case (not yet supported),
+ * window_size == input_size + MIN_LOOKAHEAD &&
+ * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
+ * Otherwise, window_size == 2*WSIZE so more >= 2.
+ * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
+ */
+ Assert(more >= 2, "more < 2");
+
+ n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
+ s->lookahead += n;
+
+ /* Initialize the hash value now that we have some input: */
+ if (s->lookahead >= MIN_MATCH) {
+ s->ins_h = s->window[s->strstart];
+ UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
+#if MIN_MATCH != 3
+ Call UPDATE_HASH() MIN_MATCH-3 more times
+#endif
+ }
+ /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
+ * but this is not important since only literal bytes will be emitted.
+ */
+
+ } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
+}
+
+/* ===========================================================================
+ * Flush the current block, with given end-of-file flag.
+ * IN assertion: strstart is set to the end of the current match.
+ */
+#define FLUSH_BLOCK_ONLY(s, eof) { \
+ _tr_flush_block(s, (s->block_start >= 0L ? \
+ (charf *)&s->window[(unsigned)s->block_start] : \
+ (charf *)Z_NULL), \
+ (ulg)((long)s->strstart - s->block_start), \
+ (eof)); \
+ s->block_start = s->strstart; \
+ flush_pending(s->strm); \
+ Tracev((stderr,"[FLUSH]")); \
+}
+
+/* Same but force premature exit if necessary. */
+#define FLUSH_BLOCK(s, eof) { \
+ FLUSH_BLOCK_ONLY(s, eof); \
+ if (s->strm->avail_out == 0) return (eof) ? finish_started : need_more; \
+}
+
+/* ===========================================================================
+ * Copy without compression as much as possible from the input stream, return
+ * the current block state.
+ * This function does not insert new strings in the dictionary since
+ * uncompressible data is probably not useful. This function is used
+ * only for the level=0 compression option.
+ * NOTE: this function should be optimized to avoid extra copying from
+ * window to pending_buf.
+ */
+local block_state deflate_stored(s, flush)
+ deflate_state *s;
+ int flush;
+{
+ /* Stored blocks are limited to 0xffff bytes, pending_buf is limited
+ * to pending_buf_size, and each stored block has a 5 byte header:
+ */
+ ulg max_block_size = 0xffff;
+ ulg max_start;
+
+ if (max_block_size > s->pending_buf_size - 5) {
+ max_block_size = s->pending_buf_size - 5;
+ }
+
+ /* Copy as much as possible from input to output: */
+ for (;;) {
+ /* Fill the window as much as possible: */
+ if (s->lookahead <= 1) {
+
+ Assert(s->strstart < s->w_size+MAX_DIST(s) ||
+ s->block_start >= (long)s->w_size, "slide too late");
+
+ fill_window(s);
+ if (s->lookahead == 0 && flush == Z_NO_FLUSH) return need_more;
+
+ if (s->lookahead == 0) break; /* flush the current block */
+ }
+ Assert(s->block_start >= 0L, "block gone");
+
+ s->strstart += s->lookahead;
+ s->lookahead = 0;
+
+ /* Emit a stored block if pending_buf will be full: */
+ max_start = s->block_start + max_block_size;
+ if (s->strstart == 0 || (ulg)s->strstart >= max_start) {
+ /* strstart == 0 is possible when wraparound on 16-bit machine */
+ s->lookahead = (uInt)(s->strstart - max_start);
+ s->strstart = (uInt)max_start;
+ FLUSH_BLOCK(s, 0);
+ }
+ /* Flush if we may have to slide, otherwise block_start may become
+ * negative and the data will be gone:
+ */
+ if (s->strstart - (uInt)s->block_start >= MAX_DIST(s)) {
+ FLUSH_BLOCK(s, 0);
+ }
+ }
+ FLUSH_BLOCK(s, flush == Z_FINISH);
+ return flush == Z_FINISH ? finish_done : block_done;
+}
+
+/* ===========================================================================
+ * Compress as much as possible from the input stream, return the current
+ * block state.
+ * This function does not perform lazy evaluation of matches and inserts
+ * new strings in the dictionary only for unmatched strings or for short
+ * matches. It is used only for the fast compression options.
+ */
+local block_state deflate_fast(s, flush)
+ deflate_state *s;
+ int flush;
+{
+ IPos hash_head = NIL; /* head of the hash chain */
+ int bflush; /* set if current block must be flushed */
+
+ for (;;) {
+ /* Make sure that we always have enough lookahead, except
+ * at the end of the input file. We need MAX_MATCH bytes
+ * for the next match, plus MIN_MATCH bytes to insert the
+ * string following the next match.
+ */
+ if (s->lookahead < MIN_LOOKAHEAD) {
+ fill_window(s);
+ if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
+ return need_more;
+ }
+ if (s->lookahead == 0) break; /* flush the current block */
+ }
+
+ /* Insert the string window[strstart .. strstart+2] in the
+ * dictionary, and set hash_head to the head of the hash chain:
+ */
+ if (s->lookahead >= MIN_MATCH) {
+ INSERT_STRING(s, s->strstart, hash_head);
+ }
+
+ /* Find the longest match, discarding those <= prev_length.
+ * At this point we have always match_length < MIN_MATCH
+ */
+ if (hash_head != NIL && s->strstart - hash_head <= MAX_DIST(s)) {
+ /* To simplify the code, we prevent matches with the string
+ * of window index 0 (in particular we have to avoid a match
+ * of the string with itself at the start of the input file).
+ */
+#ifdef FASTEST
+ if ((s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) ||
+ (s->strategy == Z_RLE && s->strstart - hash_head == 1)) {
+ s->match_length = longest_match_fast (s, hash_head);
+ }
+#else
+ if (s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) {
+ s->match_length = longest_match (s, hash_head);
+ } else if (s->strategy == Z_RLE && s->strstart - hash_head == 1) {
+ s->match_length = longest_match_fast (s, hash_head);
+ }
+#endif
+ /* longest_match() or longest_match_fast() sets match_start */
+ }
+ if (s->match_length >= MIN_MATCH) {
+ check_match(s, s->strstart, s->match_start, s->match_length);
+
+ _tr_tally_dist(s, s->strstart - s->match_start,
+ s->match_length - MIN_MATCH, bflush);
+
+ s->lookahead -= s->match_length;
+
+ /* Insert new strings in the hash table only if the match length
+ * is not too large. This saves time but degrades compression.
+ */
+#ifndef FASTEST
+ if (s->match_length <= s->max_insert_length &&
+ s->lookahead >= MIN_MATCH) {
+ s->match_length--; /* string at strstart already in table */
+ do {
+ s->strstart++;
+ INSERT_STRING(s, s->strstart, hash_head);
+ /* strstart never exceeds WSIZE-MAX_MATCH, so there are
+ * always MIN_MATCH bytes ahead.
+ */
+ } while (--s->match_length != 0);
+ s->strstart++;
+ } else
+#endif
+ {
+ s->strstart += s->match_length;
+ s->match_length = 0;
+ s->ins_h = s->window[s->strstart];
+ UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
+#if MIN_MATCH != 3
+ Call UPDATE_HASH() MIN_MATCH-3 more times
+#endif
+ /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not
+ * matter since it will be recomputed at next deflate call.
+ */
+ }
+ } else {
+ /* No match, output a literal byte */
+ Tracevv((stderr,"%c", s->window[s->strstart]));
+ _tr_tally_lit (s, s->window[s->strstart], bflush);
+ s->lookahead--;
+ s->strstart++;
+ }
+ if (bflush) FLUSH_BLOCK(s, 0);
+ }
+ FLUSH_BLOCK(s, flush == Z_FINISH);
+ return flush == Z_FINISH ? finish_done : block_done;
+}
+
+#ifndef FASTEST
+/* ===========================================================================
+ * Same as above, but achieves better compression. We use a lazy
+ * evaluation for matches: a match is finally adopted only if there is
+ * no better match at the next window position.
+ */
+local block_state deflate_slow(s, flush)
+ deflate_state *s;
+ int flush;
+{
+ IPos hash_head = NIL; /* head of hash chain */
+ int bflush; /* set if current block must be flushed */
+
+ /* Process the input block. */
+ for (;;) {
+ /* Make sure that we always have enough lookahead, except
+ * at the end of the input file. We need MAX_MATCH bytes
+ * for the next match, plus MIN_MATCH bytes to insert the
+ * string following the next match.
+ */
+ if (s->lookahead < MIN_LOOKAHEAD) {
+ fill_window(s);
+ if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
+ return need_more;
+ }
+ if (s->lookahead == 0) break; /* flush the current block */
+ }
+
+ /* Insert the string window[strstart .. strstart+2] in the
+ * dictionary, and set hash_head to the head of the hash chain:
+ */
+ if (s->lookahead >= MIN_MATCH) {
+ INSERT_STRING(s, s->strstart, hash_head);
+ }
+
+ /* Find the longest match, discarding those <= prev_length.
+ */
+ s->prev_length = s->match_length, s->prev_match = s->match_start;
+ s->match_length = MIN_MATCH-1;
+
+ if (hash_head != NIL && s->prev_length < s->max_lazy_match &&
+ s->strstart - hash_head <= MAX_DIST(s)) {
+ /* To simplify the code, we prevent matches with the string
+ * of window index 0 (in particular we have to avoid a match
+ * of the string with itself at the start of the input file).
+ */
+ if (s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) {
+ s->match_length = longest_match (s, hash_head);
+ } else if (s->strategy == Z_RLE && s->strstart - hash_head == 1) {
+ s->match_length = longest_match_fast (s, hash_head);
+ }
+ /* longest_match() or longest_match_fast() sets match_start */
+
+ if (s->match_length <= 5 && (s->strategy == Z_FILTERED
+#if TOO_FAR <= 32767
+ || (s->match_length == MIN_MATCH &&
+ s->strstart - s->match_start > TOO_FAR)
+#endif
+ )) {
+
+ /* If prev_match is also MIN_MATCH, match_start is garbage
+ * but we will ignore the current match anyway.
+ */
+ s->match_length = MIN_MATCH-1;
+ }
+ }
+ /* If there was a match at the previous step and the current
+ * match is not better, output the previous match:
+ */
+ if (s->prev_length >= MIN_MATCH && s->match_length <= s->prev_length) {
+ uInt max_insert = s->strstart + s->lookahead - MIN_MATCH;
+ /* Do not insert strings in hash table beyond this. */
+
+ check_match(s, s->strstart-1, s->prev_match, s->prev_length);
+
+ _tr_tally_dist(s, s->strstart -1 - s->prev_match,
+ s->prev_length - MIN_MATCH, bflush);
+
+ /* Insert in hash table all strings up to the end of the match.
+ * strstart-1 and strstart are already inserted. If there is not
+ * enough lookahead, the last two strings are not inserted in
+ * the hash table.
+ */
+ s->lookahead -= s->prev_length-1;
+ s->prev_length -= 2;
+ do {
+ if (++s->strstart <= max_insert) {
+ INSERT_STRING(s, s->strstart, hash_head);
+ }
+ } while (--s->prev_length != 0);
+ s->match_available = 0;
+ s->match_length = MIN_MATCH-1;
+ s->strstart++;
+
+ if (bflush) FLUSH_BLOCK(s, 0);
+
+ } else if (s->match_available) {
+ /* If there was no match at the previous position, output a
+ * single literal. If there was a match but the current match
+ * is longer, truncate the previous match to a single literal.
+ */
+ Tracevv((stderr,"%c", s->window[s->strstart-1]));
+ _tr_tally_lit(s, s->window[s->strstart-1], bflush);
+ if (bflush) {
+ FLUSH_BLOCK_ONLY(s, 0);
+ }
+ s->strstart++;
+ s->lookahead--;
+ if (s->strm->avail_out == 0) return need_more;
+ } else {
+ /* There is no previous match to compare with, wait for
+ * the next step to decide.
+ */
+ s->match_available = 1;
+ s->strstart++;
+ s->lookahead--;
+ }
+ }
+ Assert (flush != Z_NO_FLUSH, "no flush?");
+ if (s->match_available) {
+ Tracevv((stderr,"%c", s->window[s->strstart-1]));
+ _tr_tally_lit(s, s->window[s->strstart-1], bflush);
+ s->match_available = 0;
+ }
+ FLUSH_BLOCK(s, flush == Z_FINISH);
+ return flush == Z_FINISH ? finish_done : block_done;
+}
+#endif /* FASTEST */
+
+#if 0
+/* ===========================================================================
+ * For Z_RLE, simply look for runs of bytes, generate matches only of distance
+ * one. Do not maintain a hash table. (It will be regenerated if this run of
+ * deflate switches away from Z_RLE.)
+ */
+local block_state deflate_rle(s, flush)
+ deflate_state *s;
+ int flush;
+{
+ int bflush; /* set if current block must be flushed */
+ uInt run; /* length of run */
+ uInt max; /* maximum length of run */
+ uInt prev; /* byte at distance one to match */
+ Bytef *scan; /* scan for end of run */
+
+ for (;;) {
+ /* Make sure that we always have enough lookahead, except
+ * at the end of the input file. We need MAX_MATCH bytes
+ * for the longest encodable run.
+ */
+ if (s->lookahead < MAX_MATCH) {
+ fill_window(s);
+ if (s->lookahead < MAX_MATCH && flush == Z_NO_FLUSH) {
+ return need_more;
+ }
+ if (s->lookahead == 0) break; /* flush the current block */
+ }
+
+ /* See how many times the previous byte repeats */
+ run = 0;
+ if (s->strstart > 0) { /* if there is a previous byte, that is */
+ max = s->lookahead < MAX_MATCH ? s->lookahead : MAX_MATCH;
+ scan = s->window + s->strstart - 1;
+ prev = *scan++;
+ do {
+ if (*scan++ != prev)
+ break;
+ } while (++run < max);
+ }
+
+ /* Emit match if have run of MIN_MATCH or longer, else emit literal */
+ if (run >= MIN_MATCH) {
+ check_match(s, s->strstart, s->strstart - 1, run);
+ _tr_tally_dist(s, 1, run - MIN_MATCH, bflush);
+ s->lookahead -= run;
+ s->strstart += run;
+ } else {
+ /* No match, output a literal byte */
+ Tracevv((stderr,"%c", s->window[s->strstart]));
+ _tr_tally_lit (s, s->window[s->strstart], bflush);
+ s->lookahead--;
+ s->strstart++;
+ }
+ if (bflush) FLUSH_BLOCK(s, 0);
+ }
+ FLUSH_BLOCK(s, flush == Z_FINISH);
+ return flush == Z_FINISH ? finish_done : block_done;
+}
+#endif
diff --git a/sys/contrib/opensolaris/uts/common/zmod/deflate.h b/sys/contrib/opensolaris/uts/common/zmod/deflate.h
new file mode 100644
index 000000000000..d01a3c10e449
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/deflate.h
@@ -0,0 +1,331 @@
+/* deflate.h -- internal compression state
+ * Copyright (C) 1995-2004 Jean-loup Gailly
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+ part of the implementation of the compression library and is
+ subject to change. Applications should only use zlib.h.
+ */
+
+#ifndef _DEFLATE_H
+#define _DEFLATE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "zutil.h"
+
+/* define NO_GZIP when compiling if you want to disable gzip header and
+ trailer creation by deflate(). NO_GZIP would be used to avoid linking in
+ the crc code when it is not needed. For shared libraries, gzip encoding
+ should be left enabled. */
+#ifndef NO_GZIP
+# define GZIP
+#endif
+
+/* ===========================================================================
+ * Internal compression state.
+ */
+
+#define LENGTH_CODES 29
+/* number of length codes, not counting the special END_BLOCK code */
+
+#define LITERALS 256
+/* number of literal bytes 0..255 */
+
+#define L_CODES (LITERALS+1+LENGTH_CODES)
+/* number of Literal or Length codes, including the END_BLOCK code */
+
+#define D_CODES 30
+/* number of distance codes */
+
+#define BL_CODES 19
+/* number of codes used to transfer the bit lengths */
+
+#define HEAP_SIZE (2*L_CODES+1)
+/* maximum heap size */
+
+#define MAX_BITS 15
+/* All codes must not exceed MAX_BITS bits */
+
+#define INIT_STATE 42
+#define EXTRA_STATE 69
+#define NAME_STATE 73
+#define COMMENT_STATE 91
+#define HCRC_STATE 103
+#define BUSY_STATE 113
+#define FINISH_STATE 666
+/* Stream status */
+
+
+/* Data structure describing a single value and its code string. */
+typedef struct ct_data_s {
+ union {
+ ush freq; /* frequency count */
+ ush code; /* bit string */
+ } fc;
+ union {
+ ush dad; /* father node in Huffman tree */
+ ush len; /* length of bit string */
+ } dl;
+} FAR ct_data;
+
+#define Freq fc.freq
+#define Code fc.code
+#define Dad dl.dad
+#define Len dl.len
+
+typedef struct static_tree_desc_s static_tree_desc;
+
+typedef struct tree_desc_s {
+ ct_data *dyn_tree; /* the dynamic tree */
+ int max_code; /* largest code with non zero frequency */
+ static_tree_desc *stat_desc; /* the corresponding static tree */
+} FAR tree_desc;
+
+typedef ush Pos;
+typedef Pos FAR Posf;
+typedef unsigned IPos;
+
+/* A Pos is an index in the character window. We use short instead of int to
+ * save space in the various tables. IPos is used only for parameter passing.
+ */
+
+typedef struct internal_state {
+ z_streamp strm; /* pointer back to this zlib stream */
+ int status; /* as the name implies */
+ Bytef *pending_buf; /* output still pending */
+ ulg pending_buf_size; /* size of pending_buf */
+ Bytef *pending_out; /* next pending byte to output to the stream */
+ uInt pending; /* nb of bytes in the pending buffer */
+ int wrap; /* bit 0 true for zlib, bit 1 true for gzip */
+ gz_headerp gzhead; /* gzip header information to write */
+ uInt gzindex; /* where in extra, name, or comment */
+ Byte method; /* STORED (for zip only) or DEFLATED */
+ int last_flush; /* value of flush param for previous deflate call */
+
+ /* used by deflate.c: */
+
+ uInt w_size; /* LZ77 window size (32K by default) */
+ uInt w_bits; /* log2(w_size) (8..16) */
+ uInt w_mask; /* w_size - 1 */
+
+ Bytef *window;
+ /* Sliding window. Input bytes are read into the second half of the window,
+ * and move to the first half later to keep a dictionary of at least wSize
+ * bytes. With this organization, matches are limited to a distance of
+ * wSize-MAX_MATCH bytes, but this ensures that IO is always
+ * performed with a length multiple of the block size. Also, it limits
+ * the window size to 64K, which is quite useful on MSDOS.
+ * To do: use the user input buffer as sliding window.
+ */
+
+ ulg window_size;
+ /* Actual size of window: 2*wSize, except when the user input buffer
+ * is directly used as sliding window.
+ */
+
+ Posf *prev;
+ /* Link to older string with same hash index. To limit the size of this
+ * array to 64K, this link is maintained only for the last 32K strings.
+ * An index in this array is thus a window index modulo 32K.
+ */
+
+ Posf *head; /* Heads of the hash chains or NIL. */
+
+ uInt ins_h; /* hash index of string to be inserted */
+ uInt hash_size; /* number of elements in hash table */
+ uInt hash_bits; /* log2(hash_size) */
+ uInt hash_mask; /* hash_size-1 */
+
+ uInt hash_shift;
+ /* Number of bits by which ins_h must be shifted at each input
+ * step. It must be such that after MIN_MATCH steps, the oldest
+ * byte no longer takes part in the hash key, that is:
+ * hash_shift * MIN_MATCH >= hash_bits
+ */
+
+ long block_start;
+ /* Window position at the beginning of the current output block. Gets
+ * negative when the window is moved backwards.
+ */
+
+ uInt match_length; /* length of best match */
+ IPos prev_match; /* previous match */
+ int match_available; /* set if previous match exists */
+ uInt strstart; /* start of string to insert */
+ uInt match_start; /* start of matching string */
+ uInt lookahead; /* number of valid bytes ahead in window */
+
+ uInt prev_length;
+ /* Length of the best match at previous step. Matches not greater than this
+ * are discarded. This is used in the lazy match evaluation.
+ */
+
+ uInt max_chain_length;
+ /* To speed up deflation, hash chains are never searched beyond this
+ * length. A higher limit improves compression ratio but degrades the
+ * speed.
+ */
+
+ uInt max_lazy_match;
+ /* Attempt to find a better match only when the current match is strictly
+ * smaller than this value. This mechanism is used only for compression
+ * levels >= 4.
+ */
+# define max_insert_length max_lazy_match
+ /* Insert new strings in the hash table only if the match length is not
+ * greater than this length. This saves time but degrades compression.
+ * max_insert_length is used only for compression levels <= 3.
+ */
+
+ int level; /* compression level (1..9) */
+ int strategy; /* favor or force Huffman coding*/
+
+ uInt good_match;
+ /* Use a faster search when the previous match is longer than this */
+
+ int nice_match; /* Stop searching when current match exceeds this */
+
+ /* used by trees.c: */
+    /* Didn't use ct_data typedef below to suppress compiler warning */
+ struct ct_data_s dyn_ltree[HEAP_SIZE]; /* literal and length tree */
+ struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
+ struct ct_data_s bl_tree[2*BL_CODES+1]; /* Huffman tree for bit lengths */
+
+ struct tree_desc_s l_desc; /* desc. for literal tree */
+ struct tree_desc_s d_desc; /* desc. for distance tree */
+ struct tree_desc_s bl_desc; /* desc. for bit length tree */
+
+ ush bl_count[MAX_BITS+1];
+ /* number of codes at each bit length for an optimal tree */
+
+ int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */
+ int heap_len; /* number of elements in the heap */
+ int heap_max; /* element of largest frequency */
+ /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
+ * The same heap array is used to build all trees.
+ */
+
+ uch depth[2*L_CODES+1];
+ /* Depth of each subtree used as tie breaker for trees of equal frequency
+ */
+
+ uchf *l_buf; /* buffer for literals or lengths */
+
+ uInt lit_bufsize;
+ /* Size of match buffer for literals/lengths. There are 4 reasons for
+ * limiting lit_bufsize to 64K:
+ * - frequencies can be kept in 16 bit counters
+ * - if compression is not successful for the first block, all input
+ * data is still in the window so we can still emit a stored block even
+ * when input comes from standard input. (This can also be done for
+ * all blocks if lit_bufsize is not greater than 32K.)
+ * - if compression is not successful for a file smaller than 64K, we can
+ * even emit a stored file instead of a stored block (saving 5 bytes).
+ * This is applicable only for zip (not gzip or zlib).
+ * - creating new Huffman trees less frequently may not provide fast
+ * adaptation to changes in the input data statistics. (Take for
+ * example a binary file with poorly compressible code followed by
+ * a highly compressible string table.) Smaller buffer sizes give
+ * fast adaptation but have of course the overhead of transmitting
+ * trees more frequently.
+ * - I can't count above 4
+ */
+
+ uInt last_lit; /* running index in l_buf */
+
+ ushf *d_buf;
+ /* Buffer for distances. To simplify the code, d_buf and l_buf have
+ * the same number of elements. To use different lengths, an extra flag
+ * array would be necessary.
+ */
+
+ ulg opt_len; /* bit length of current block with optimal trees */
+ ulg static_len; /* bit length of current block with static trees */
+ uInt matches; /* number of string matches in current block */
+ int last_eob_len; /* bit length of EOB code for last block */
+
+#ifdef DEBUG
+ ulg compressed_len; /* total bit length of compressed file mod 2^32 */
+ ulg bits_sent; /* bit length of compressed data sent mod 2^32 */
+#endif
+
+ ush bi_buf;
+ /* Output buffer. bits are inserted starting at the bottom (least
+ * significant bits).
+ */
+ int bi_valid;
+ /* Number of valid bits in bi_buf. All bits above the last valid bit
+ * are always zero.
+ */
+
+} FAR deflate_state;
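+
+/*
+ * Editorial note, not part of the original source: as a worked example of
+ * the hash_shift constraint documented above, and assuming the common
+ * default memLevel of 8, hash_bits is 15, so hash_shift ends up as 5 (the
+ * smallest value with hash_shift * MIN_MATCH >= hash_bits, since 5 * 3 >= 15).
+ * After MIN_MATCH (3) updates the oldest byte therefore no longer
+ * contributes to the 15-bit hash key.
+ */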
+
+/* Output a byte on the stream.
+ * IN assertion: there is enough room in pending_buf.
+ */
+#define put_byte(s, c) {s->pending_buf[s->pending++] = (c);}
+
+
+#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
+/* Minimum amount of lookahead, except at the end of the input file.
+ * See deflate.c for comments about the MIN_MATCH+1.
+ */
+
+#define MAX_DIST(s) ((s)->w_size-MIN_LOOKAHEAD)
+/* In order to simplify the code, particularly on 16 bit machines, match
+ * distances are limited to MAX_DIST instead of WSIZE.
+ */
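+
+/*
+ * Editorial note, not part of the original source: with MAX_MATCH = 258 and
+ * MIN_MATCH = 3, MIN_LOOKAHEAD works out to 258 + 3 + 1 = 262, so for the
+ * usual 32K window (w_size = 32768) MAX_DIST(s) is 32768 - 262 = 32506.
+ */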
+
+ /* in trees.c */
+void _tr_init OF((deflate_state *s));
+int _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc));
+void _tr_flush_block OF((deflate_state *s, charf *buf, ulg stored_len,
+ int eof));
+void _tr_align OF((deflate_state *s));
+void _tr_stored_block OF((deflate_state *s, charf *buf, ulg stored_len,
+ int eof));
+
+#define d_code(dist) \
+ ((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)])
+/* Mapping from a distance to a distance code. dist is the distance - 1 and
+ * must not have side effects. _dist_code[256] and _dist_code[257] are never
+ * used.
+ */
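+
+/*
+ * Editorial note, not part of the original source: as a worked example, a
+ * match at distance 300 gives dist = 300 - 1 = 299 here; since 299 >= 256,
+ * the lookup index is 256 + (299 >> 7) = 256 + 2 = 258.
+ */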
+
+#ifndef DEBUG
+/* Inline versions of _tr_tally for speed: */
+
+#if defined(GEN_TREES_H) || !defined(STDC)
+ extern uch _length_code[];
+ extern uch _dist_code[];
+#else
+ extern const uch _length_code[];
+ extern const uch _dist_code[];
+#endif
+
+# define _tr_tally_lit(s, c, flush) \
+ { uch cc = (c); \
+ s->d_buf[s->last_lit] = 0; \
+ s->l_buf[s->last_lit++] = cc; \
+ s->dyn_ltree[cc].Freq++; \
+ flush = (s->last_lit == s->lit_bufsize-1); \
+ }
+# define _tr_tally_dist(s, distance, length, flush) \
+ { uch len = (length); \
+ ush dist = (distance); \
+ s->d_buf[s->last_lit] = dist; \
+ s->l_buf[s->last_lit++] = len; \
+ dist--; \
+ s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \
+ s->dyn_dtree[d_code(dist)].Freq++; \
+ flush = (s->last_lit == s->lit_bufsize-1); \
+ }
+#else
+# define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c)
+# define _tr_tally_dist(s, distance, length, flush) \
+ flush = _tr_tally(s, distance, length)
+#endif
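+
+/*
+ * Editorial note, not part of the original source: a usage sketch taken from
+ * deflate.c -- a literal byte is recorded with
+ *     _tr_tally_lit(s, s->window[s->strstart], bflush);
+ * and a length/distance pair with
+ *     _tr_tally_dist(s, s->strstart - s->match_start,
+ *                    s->match_length - MIN_MATCH, bflush);
+ * In both forms, bflush is set when the literal/length buffer is about to
+ * fill up and the current block should be flushed.
+ */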
+
+#endif /* _DEFLATE_H */
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inffast.c b/sys/contrib/opensolaris/uts/common/zmod/inffast.c
new file mode 100644
index 000000000000..a6dcf3f5c8b8
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/inffast.c
@@ -0,0 +1,320 @@
+/* inffast.c -- fast decoding
+ * Copyright (C) 1995-2004 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "inffast.h"
+
+#ifndef ASMINF
+
+/* Allow machine dependent optimization for post-increment or pre-increment.
+ Based on testing to date,
+ Pre-increment preferred for:
+ - PowerPC G3 (Adler)
+ - MIPS R5000 (Randers-Pehrson)
+ Post-increment preferred for:
+ - none
+ No measurable difference:
+ - Pentium III (Anderson)
+ - M68060 (Nikl)
+ */
+#ifdef POSTINC
+# define OFF 0
+# define PUP(a) *(a)++
+#else
+# define OFF 1
+# define PUP(a) *++(a)
+#endif
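+
+/*
+ * Editorial note, not part of the original source: with the default
+ * pre-increment form, OFF is 1 and PUP(a) is *++(a), so the local pointers
+ * below are initialized one byte before their data (e.g.
+ * in = strm->next_in - OFF) and every PUP() advances the pointer before
+ * dereferencing it.  Defining POSTINC selects *(a)++ with OFF 0 instead.
+ */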
+
+/*
+ Decode literal, length, and distance codes and write out the resulting
+ literal and match bytes until either not enough input or output is
+ available, an end-of-block is encountered, or a data error is encountered.
+ When large enough input and output buffers are supplied to inflate(), for
+ example, a 16K input buffer and a 64K output buffer, more than 95% of the
+ inflate execution time is spent in this routine.
+
+ Entry assumptions:
+
+ state->mode == LEN
+ strm->avail_in >= 6
+ strm->avail_out >= 258
+ start >= strm->avail_out
+ state->bits < 8
+
+ On return, state->mode is one of:
+
+ LEN -- ran out of enough output space or enough available input
+ TYPE -- reached end of block code, inflate() to interpret next block
+ BAD -- error in block data
+
+ Notes:
+
+ - The maximum input bits used by a length/distance pair is 15 bits for the
+ length code, 5 bits for the length extra, 15 bits for the distance code,
+ and 13 bits for the distance extra. This totals 48 bits, or six bytes.
+ Therefore if strm->avail_in >= 6, then there is enough input to avoid
+ checking for available input while decoding.
+
+ - The maximum bytes that a single length/distance pair can output is 258
+ bytes, which is the maximum length that can be coded. inflate_fast()
+ requires strm->avail_out >= 258 for each loop to avoid checking for
+ output space.
+ */
+void inflate_fast(strm, start)
+z_streamp strm;
+unsigned start; /* inflate()'s starting value for strm->avail_out */
+{
+ struct inflate_state FAR *state;
+ unsigned char FAR *in; /* local strm->next_in */
+ unsigned char FAR *last; /* while in < last, enough input available */
+ unsigned char FAR *out; /* local strm->next_out */
+ unsigned char FAR *beg; /* inflate()'s initial strm->next_out */
+ unsigned char FAR *end; /* while out < end, enough space available */
+#ifdef INFLATE_STRICT
+ unsigned dmax; /* maximum distance from zlib header */
+#endif
+ unsigned wsize; /* window size or zero if not using window */
+ unsigned whave; /* valid bytes in the window */
+ unsigned write; /* window write index */
+ unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */
+ unsigned long hold; /* local strm->hold */
+ unsigned bits; /* local strm->bits */
+ code const FAR *lcode; /* local strm->lencode */
+ code const FAR *dcode; /* local strm->distcode */
+ unsigned lmask; /* mask for first level of length codes */
+ unsigned dmask; /* mask for first level of distance codes */
+ code this; /* retrieved table entry */
+ unsigned op; /* code bits, operation, extra bits, or */
+ /* window position, window bytes to copy */
+ unsigned len; /* match length, unused bytes */
+ unsigned dist; /* match distance */
+ unsigned char FAR *from; /* where to copy match from */
+
+ /* copy state to local variables */
+ state = (struct inflate_state FAR *)strm->state;
+ in = strm->next_in - OFF;
+ last = in + (strm->avail_in - 5);
+ out = strm->next_out - OFF;
+ beg = out - (start - strm->avail_out);
+ end = out + (strm->avail_out - 257);
+#ifdef INFLATE_STRICT
+ dmax = state->dmax;
+#endif
+ wsize = state->wsize;
+ whave = state->whave;
+ write = state->write;
+ window = state->window;
+ hold = state->hold;
+ bits = state->bits;
+ lcode = state->lencode;
+ dcode = state->distcode;
+ lmask = (1U << state->lenbits) - 1;
+ dmask = (1U << state->distbits) - 1;
+
+ /* decode literals and length/distances until end-of-block or not enough
+ input data or output space */
+ do {
+ if (bits < 15) {
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ }
+ this = lcode[hold & lmask];
+ dolen:
+ op = (unsigned)(this.bits);
+ hold >>= op;
+ bits -= op;
+ op = (unsigned)(this.op);
+ if (op == 0) { /* literal */
+ Tracevv((stderr, this.val >= 0x20 && this.val < 0x7f ?
+ "inflate: literal '%c'\n" :
+ "inflate: literal 0x%02x\n", this.val));
+ PUP(out) = (unsigned char)(this.val);
+ }
+ else if (op & 16) { /* length base */
+ len = (unsigned)(this.val);
+ op &= 15; /* number of extra bits */
+ if (op) {
+ if (bits < op) {
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ }
+ len += (unsigned)hold & ((1U << op) - 1);
+ hold >>= op;
+ bits -= op;
+ }
+ Tracevv((stderr, "inflate: length %u\n", len));
+ if (bits < 15) {
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ }
+ this = dcode[hold & dmask];
+ dodist:
+ op = (unsigned)(this.bits);
+ hold >>= op;
+ bits -= op;
+ op = (unsigned)(this.op);
+ if (op & 16) { /* distance base */
+ dist = (unsigned)(this.val);
+ op &= 15; /* number of extra bits */
+ if (bits < op) {
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ if (bits < op) {
+ hold += (unsigned long)(PUP(in)) << bits;
+ bits += 8;
+ }
+ }
+ dist += (unsigned)hold & ((1U << op) - 1);
+#ifdef INFLATE_STRICT
+ if (dist > dmax) {
+ strm->msg = (char *)"invalid distance too far back";
+ state->mode = BAD;
+ break;
+ }
+#endif
+ hold >>= op;
+ bits -= op;
+ Tracevv((stderr, "inflate: distance %u\n", dist));
+ op = (unsigned)(out - beg); /* max distance in output */
+ if (dist > op) { /* see if copy from window */
+ op = dist - op; /* distance back in window */
+ if (op > whave) {
+ strm->msg = (char *)"invalid distance too far back";
+ state->mode = BAD;
+ break;
+ }
+ from = window - OFF;
+ if (write == 0) { /* very common case */
+ from += wsize - op;
+ if (op < len) { /* some from window */
+ len -= op;
+ do {
+ PUP(out) = PUP(from);
+ } while (--op);
+ from = out - dist; /* rest from output */
+ }
+ }
+ else if (write < op) { /* wrap around window */
+ from += wsize + write - op;
+ op -= write;
+ if (op < len) { /* some from end of window */
+ len -= op;
+ do {
+ PUP(out) = PUP(from);
+ } while (--op);
+ from = window - OFF;
+ if (write < len) { /* some from start of window */
+ op = write;
+ len -= op;
+ do {
+ PUP(out) = PUP(from);
+ } while (--op);
+ from = out - dist; /* rest from output */
+ }
+ }
+ }
+ else { /* contiguous in window */
+ from += write - op;
+ if (op < len) { /* some from window */
+ len -= op;
+ do {
+ PUP(out) = PUP(from);
+ } while (--op);
+ from = out - dist; /* rest from output */
+ }
+ }
+ while (len > 2) {
+ PUP(out) = PUP(from);
+ PUP(out) = PUP(from);
+ PUP(out) = PUP(from);
+ len -= 3;
+ }
+ if (len) {
+ PUP(out) = PUP(from);
+ if (len > 1)
+ PUP(out) = PUP(from);
+ }
+ }
+ else {
+ from = out - dist; /* copy direct from output */
+ do { /* minimum length is three */
+ PUP(out) = PUP(from);
+ PUP(out) = PUP(from);
+ PUP(out) = PUP(from);
+ len -= 3;
+ } while (len > 2);
+ if (len) {
+ PUP(out) = PUP(from);
+ if (len > 1)
+ PUP(out) = PUP(from);
+ }
+ }
+ }
+ else if ((op & 64) == 0) { /* 2nd level distance code */
+ this = dcode[this.val + (hold & ((1U << op) - 1))];
+ goto dodist;
+ }
+ else {
+ strm->msg = (char *)"invalid distance code";
+ state->mode = BAD;
+ break;
+ }
+ }
+ else if ((op & 64) == 0) { /* 2nd level length code */
+ this = lcode[this.val + (hold & ((1U << op) - 1))];
+ goto dolen;
+ }
+ else if (op & 32) { /* end-of-block */
+ Tracevv((stderr, "inflate: end of block\n"));
+ state->mode = TYPE;
+ break;
+ }
+ else {
+ strm->msg = (char *)"invalid literal/length code";
+ state->mode = BAD;
+ break;
+ }
+ } while (in < last && out < end);
+
+ /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
+ len = bits >> 3;
+ in -= len;
+ bits -= len << 3;
+ hold &= (1U << bits) - 1;
+
+ /* update state and return */
+ strm->next_in = in + OFF;
+ strm->next_out = out + OFF;
+ strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
+ strm->avail_out = (unsigned)(out < end ?
+ 257 + (end - out) : 257 - (out - end));
+ state->hold = hold;
+ state->bits = bits;
+ return;
+}
+
+/*
+ inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
+ - Using bit fields for code structure
+ - Different op definition to avoid & for extra bits (do & for table bits)
+ - Three separate decoding do-loops for direct, window, and write == 0
+ - Special case for distance > 1 copies to do overlapped load and store copy
+ - Explicit branch predictions (based on measured branch probabilities)
+ - Deferring match copy and interspersed it with decoding subsequent codes
+ - Swapping literal/length else
+ - Swapping window/direct else
+ - Larger unrolled copy loops (three is about right)
+ - Moving len -= 3 statement into middle of loop
+ */
+
+#endif /* !ASMINF */
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inffast.h b/sys/contrib/opensolaris/uts/common/zmod/inffast.h
new file mode 100644
index 000000000000..2d214efa299a
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/inffast.h
@@ -0,0 +1,13 @@
+/* inffast.h -- header to use inffast.c
+ * Copyright (C) 1995-2003 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/* WARNING: this file should *not* be used by applications. It is
+ part of the implementation of the compression library and is
+ subject to change. Applications should only use zlib.h.
+ */
+
+void inflate_fast OF((z_streamp strm, unsigned start));
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inffixed.h b/sys/contrib/opensolaris/uts/common/zmod/inffixed.h
new file mode 100644
index 000000000000..ed55df899109
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/inffixed.h
@@ -0,0 +1,96 @@
+ /* inffixed.h -- table for decoding fixed codes
+ * Generated automatically by makefixed().
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+ /* WARNING: this file should *not* be used by applications. It
+ is part of the implementation of the compression library and
+ is subject to change. Applications should only use zlib.h.
+ */
+
+ static const code lenfix[512] = {
+ {96,7,0},{0,8,80},{0,8,16},{20,8,115},{18,7,31},{0,8,112},{0,8,48},
+ {0,9,192},{16,7,10},{0,8,96},{0,8,32},{0,9,160},{0,8,0},{0,8,128},
+ {0,8,64},{0,9,224},{16,7,6},{0,8,88},{0,8,24},{0,9,144},{19,7,59},
+ {0,8,120},{0,8,56},{0,9,208},{17,7,17},{0,8,104},{0,8,40},{0,9,176},
+ {0,8,8},{0,8,136},{0,8,72},{0,9,240},{16,7,4},{0,8,84},{0,8,20},
+ {21,8,227},{19,7,43},{0,8,116},{0,8,52},{0,9,200},{17,7,13},{0,8,100},
+ {0,8,36},{0,9,168},{0,8,4},{0,8,132},{0,8,68},{0,9,232},{16,7,8},
+ {0,8,92},{0,8,28},{0,9,152},{20,7,83},{0,8,124},{0,8,60},{0,9,216},
+ {18,7,23},{0,8,108},{0,8,44},{0,9,184},{0,8,12},{0,8,140},{0,8,76},
+ {0,9,248},{16,7,3},{0,8,82},{0,8,18},{21,8,163},{19,7,35},{0,8,114},
+ {0,8,50},{0,9,196},{17,7,11},{0,8,98},{0,8,34},{0,9,164},{0,8,2},
+ {0,8,130},{0,8,66},{0,9,228},{16,7,7},{0,8,90},{0,8,26},{0,9,148},
+ {20,7,67},{0,8,122},{0,8,58},{0,9,212},{18,7,19},{0,8,106},{0,8,42},
+ {0,9,180},{0,8,10},{0,8,138},{0,8,74},{0,9,244},{16,7,5},{0,8,86},
+ {0,8,22},{64,8,0},{19,7,51},{0,8,118},{0,8,54},{0,9,204},{17,7,15},
+ {0,8,102},{0,8,38},{0,9,172},{0,8,6},{0,8,134},{0,8,70},{0,9,236},
+ {16,7,9},{0,8,94},{0,8,30},{0,9,156},{20,7,99},{0,8,126},{0,8,62},
+ {0,9,220},{18,7,27},{0,8,110},{0,8,46},{0,9,188},{0,8,14},{0,8,142},
+ {0,8,78},{0,9,252},{96,7,0},{0,8,81},{0,8,17},{21,8,131},{18,7,31},
+ {0,8,113},{0,8,49},{0,9,194},{16,7,10},{0,8,97},{0,8,33},{0,9,162},
+ {0,8,1},{0,8,129},{0,8,65},{0,9,226},{16,7,6},{0,8,89},{0,8,25},
+ {0,9,146},{19,7,59},{0,8,121},{0,8,57},{0,9,210},{17,7,17},{0,8,105},
+ {0,8,41},{0,9,178},{0,8,9},{0,8,137},{0,8,73},{0,9,242},{16,7,4},
+ {0,8,85},{0,8,21},{16,8,258},{19,7,43},{0,8,117},{0,8,53},{0,9,202},
+ {17,7,13},{0,8,101},{0,8,37},{0,9,170},{0,8,5},{0,8,133},{0,8,69},
+ {0,9,234},{16,7,8},{0,8,93},{0,8,29},{0,9,154},{20,7,83},{0,8,125},
+ {0,8,61},{0,9,218},{18,7,23},{0,8,109},{0,8,45},{0,9,186},{0,8,13},
+ {0,8,141},{0,8,77},{0,9,250},{16,7,3},{0,8,83},{0,8,19},{21,8,195},
+ {19,7,35},{0,8,115},{0,8,51},{0,9,198},{17,7,11},{0,8,99},{0,8,35},
+ {0,9,166},{0,8,3},{0,8,131},{0,8,67},{0,9,230},{16,7,7},{0,8,91},
+ {0,8,27},{0,9,150},{20,7,67},{0,8,123},{0,8,59},{0,9,214},{18,7,19},
+ {0,8,107},{0,8,43},{0,9,182},{0,8,11},{0,8,139},{0,8,75},{0,9,246},
+ {16,7,5},{0,8,87},{0,8,23},{64,8,0},{19,7,51},{0,8,119},{0,8,55},
+ {0,9,206},{17,7,15},{0,8,103},{0,8,39},{0,9,174},{0,8,7},{0,8,135},
+ {0,8,71},{0,9,238},{16,7,9},{0,8,95},{0,8,31},{0,9,158},{20,7,99},
+ {0,8,127},{0,8,63},{0,9,222},{18,7,27},{0,8,111},{0,8,47},{0,9,190},
+ {0,8,15},{0,8,143},{0,8,79},{0,9,254},{96,7,0},{0,8,80},{0,8,16},
+ {20,8,115},{18,7,31},{0,8,112},{0,8,48},{0,9,193},{16,7,10},{0,8,96},
+ {0,8,32},{0,9,161},{0,8,0},{0,8,128},{0,8,64},{0,9,225},{16,7,6},
+ {0,8,88},{0,8,24},{0,9,145},{19,7,59},{0,8,120},{0,8,56},{0,9,209},
+ {17,7,17},{0,8,104},{0,8,40},{0,9,177},{0,8,8},{0,8,136},{0,8,72},
+ {0,9,241},{16,7,4},{0,8,84},{0,8,20},{21,8,227},{19,7,43},{0,8,116},
+ {0,8,52},{0,9,201},{17,7,13},{0,8,100},{0,8,36},{0,9,169},{0,8,4},
+ {0,8,132},{0,8,68},{0,9,233},{16,7,8},{0,8,92},{0,8,28},{0,9,153},
+ {20,7,83},{0,8,124},{0,8,60},{0,9,217},{18,7,23},{0,8,108},{0,8,44},
+ {0,9,185},{0,8,12},{0,8,140},{0,8,76},{0,9,249},{16,7,3},{0,8,82},
+ {0,8,18},{21,8,163},{19,7,35},{0,8,114},{0,8,50},{0,9,197},{17,7,11},
+ {0,8,98},{0,8,34},{0,9,165},{0,8,2},{0,8,130},{0,8,66},{0,9,229},
+ {16,7,7},{0,8,90},{0,8,26},{0,9,149},{20,7,67},{0,8,122},{0,8,58},
+ {0,9,213},{18,7,19},{0,8,106},{0,8,42},{0,9,181},{0,8,10},{0,8,138},
+ {0,8,74},{0,9,245},{16,7,5},{0,8,86},{0,8,22},{64,8,0},{19,7,51},
+ {0,8,118},{0,8,54},{0,9,205},{17,7,15},{0,8,102},{0,8,38},{0,9,173},
+ {0,8,6},{0,8,134},{0,8,70},{0,9,237},{16,7,9},{0,8,94},{0,8,30},
+ {0,9,157},{20,7,99},{0,8,126},{0,8,62},{0,9,221},{18,7,27},{0,8,110},
+ {0,8,46},{0,9,189},{0,8,14},{0,8,142},{0,8,78},{0,9,253},{96,7,0},
+ {0,8,81},{0,8,17},{21,8,131},{18,7,31},{0,8,113},{0,8,49},{0,9,195},
+ {16,7,10},{0,8,97},{0,8,33},{0,9,163},{0,8,1},{0,8,129},{0,8,65},
+ {0,9,227},{16,7,6},{0,8,89},{0,8,25},{0,9,147},{19,7,59},{0,8,121},
+ {0,8,57},{0,9,211},{17,7,17},{0,8,105},{0,8,41},{0,9,179},{0,8,9},
+ {0,8,137},{0,8,73},{0,9,243},{16,7,4},{0,8,85},{0,8,21},{16,8,258},
+ {19,7,43},{0,8,117},{0,8,53},{0,9,203},{17,7,13},{0,8,101},{0,8,37},
+ {0,9,171},{0,8,5},{0,8,133},{0,8,69},{0,9,235},{16,7,8},{0,8,93},
+ {0,8,29},{0,9,155},{20,7,83},{0,8,125},{0,8,61},{0,9,219},{18,7,23},
+ {0,8,109},{0,8,45},{0,9,187},{0,8,13},{0,8,141},{0,8,77},{0,9,251},
+ {16,7,3},{0,8,83},{0,8,19},{21,8,195},{19,7,35},{0,8,115},{0,8,51},
+ {0,9,199},{17,7,11},{0,8,99},{0,8,35},{0,9,167},{0,8,3},{0,8,131},
+ {0,8,67},{0,9,231},{16,7,7},{0,8,91},{0,8,27},{0,9,151},{20,7,67},
+ {0,8,123},{0,8,59},{0,9,215},{18,7,19},{0,8,107},{0,8,43},{0,9,183},
+ {0,8,11},{0,8,139},{0,8,75},{0,9,247},{16,7,5},{0,8,87},{0,8,23},
+ {64,8,0},{19,7,51},{0,8,119},{0,8,55},{0,9,207},{17,7,15},{0,8,103},
+ {0,8,39},{0,9,175},{0,8,7},{0,8,135},{0,8,71},{0,9,239},{16,7,9},
+ {0,8,95},{0,8,31},{0,9,159},{20,7,99},{0,8,127},{0,8,63},{0,9,223},
+ {18,7,27},{0,8,111},{0,8,47},{0,9,191},{0,8,15},{0,8,143},{0,8,79},
+ {0,9,255}
+ };
+
+ static const code distfix[32] = {
+ {16,5,1},{23,5,257},{19,5,17},{27,5,4097},{17,5,5},{25,5,1025},
+ {21,5,65},{29,5,16385},{16,5,3},{24,5,513},{20,5,33},{28,5,8193},
+ {18,5,9},{26,5,2049},{22,5,129},{64,5,0},{16,5,2},{23,5,385},
+ {19,5,25},{27,5,6145},{17,5,7},{25,5,1537},{21,5,97},{29,5,24577},
+ {16,5,4},{24,5,769},{20,5,49},{28,5,12289},{18,5,13},{26,5,3073},
+ {22,5,193},{64,5,0}
+ };
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inflate.c b/sys/contrib/opensolaris/uts/common/zmod/inflate.c
new file mode 100644
index 000000000000..023e7a121ce7
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/inflate.c
@@ -0,0 +1,1395 @@
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* inflate.c -- zlib decompression
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Change history:
+ *
+ * 1.2.beta0 24 Nov 2002
+ * - First version -- complete rewrite of inflate to simplify code, avoid
+ * creation of window when not needed, minimize use of window when it is
+ * needed, make inffast.c even faster, implement gzip decoding, and to
+ * improve code readability and style over the previous zlib inflate code
+ *
+ * 1.2.beta1 25 Nov 2002
+ * - Use pointers for available input and output checking in inffast.c
+ * - Remove input and output counters in inffast.c
+ * - Change inffast.c entry and loop from avail_in >= 7 to >= 6
+ * - Remove unnecessary second byte pull from length extra in inffast.c
+ * - Unroll direct copy to three copies per loop in inffast.c
+ *
+ * 1.2.beta2 4 Dec 2002
+ * - Change external routine names to reduce potential conflicts
+ * - Correct filename to inffixed.h for fixed tables in inflate.c
+ * - Make hbuf[] unsigned char to match parameter type in inflate.c
+ * - Change strm->next_out[-state->offset] to *(strm->next_out - state->offset)
+ * to avoid negation problem on Alphas (64 bit) in inflate.c
+ *
+ * 1.2.beta3 22 Dec 2002
+ * - Add comments on state->bits assertion in inffast.c
+ * - Add comments on op field in inftrees.h
+ * - Fix bug in reuse of allocated window after inflateReset()
+ * - Remove bit fields--back to byte structure for speed
+ * - Remove distance extra == 0 check in inflate_fast()--only helps for lengths
+ * - Change post-increments to pre-increments in inflate_fast(), PPC biased?
+ * - Add compile time option, POSTINC, to use post-increments instead (Intel?)
+ * - Make MATCH copy in inflate() much faster for when inflate_fast() not used
+ * - Use local copies of stream next and avail values, as well as local bit
+ * buffer and bit count in inflate()--for speed when inflate_fast() not used
+ *
+ * 1.2.beta4 1 Jan 2003
+ * - Split ptr - 257 statements in inflate_table() to avoid compiler warnings
+ * - Move a comment on output buffer sizes from inffast.c to inflate.c
+ * - Add comments in inffast.c to introduce the inflate_fast() routine
+ * - Rearrange window copies in inflate_fast() for speed and simplification
+ * - Unroll last copy for window match in inflate_fast()
+ * - Use local copies of window variables in inflate_fast() for speed
+ * - Pull out common write == 0 case for speed in inflate_fast()
+ * - Make op and len in inflate_fast() unsigned for consistency
+ * - Add FAR to lcode and dcode declarations in inflate_fast()
+ * - Simplified bad distance check in inflate_fast()
+ * - Added inflateBackInit(), inflateBack(), and inflateBackEnd() in new
+ * source file infback.c to provide a call-back interface to inflate for
+ * programs like gzip and unzip -- uses window as output buffer to avoid
+ * window copying
+ *
+ * 1.2.beta5 1 Jan 2003
+ * - Improved inflateBack() interface to allow the caller to provide initial
+ * input in strm.
+ * - Fixed stored blocks bug in inflateBack()
+ *
+ * 1.2.beta6 4 Jan 2003
+ * - Added comments in inffast.c on effectiveness of POSTINC
+ * - Typecasting all around to reduce compiler warnings
+ * - Changed loops from while (1) or do {} while (1) to for (;;), again to
+ * make compilers happy
+ * - Changed type of window in inflateBackInit() to unsigned char *
+ *
+ * 1.2.beta7 27 Jan 2003
+ * - Changed many types to unsigned or unsigned short to avoid warnings
+ * - Added inflateCopy() function
+ *
+ * 1.2.0 9 Mar 2003
+ * - Changed inflateBack() interface to provide separate opaque descriptors
+ * for the in() and out() functions
+ * - Changed inflateBack() argument and in_func typedef to swap the length
+ * and buffer address return values for the input function
+ * - Check next_in and next_out for Z_NULL on entry to inflate()
+ *
+ * The history for versions after 1.2.0 are in ChangeLog in zlib distribution.
+ */
+
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "inffast.h"
+
+#ifdef MAKEFIXED
+# ifndef BUILDFIXED
+# define BUILDFIXED
+# endif
+#endif
+
+/* function prototypes */
+local void fixedtables OF((struct inflate_state FAR *state));
+local int updatewindow OF((z_streamp strm, unsigned out));
+#ifdef BUILDFIXED
+ void makefixed OF((void));
+#endif
+local unsigned syncsearch OF((unsigned FAR *have, unsigned char FAR *buf,
+ unsigned len));
+
+int ZEXPORT inflateReset(strm)
+z_streamp strm;
+{
+ struct inflate_state FAR *state;
+
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ state = (struct inflate_state FAR *)strm->state;
+ strm->total_in = strm->total_out = state->total = 0;
+ strm->msg = Z_NULL;
+ strm->adler = 1; /* to support ill-conceived Java test suite */
+ state->mode = HEAD;
+ state->last = 0;
+ state->havedict = 0;
+ state->dmax = 32768U;
+ state->head = Z_NULL;
+ state->wsize = 0;
+ state->whave = 0;
+ state->write = 0;
+ state->hold = 0;
+ state->bits = 0;
+ state->lencode = state->distcode = state->next = state->codes;
+ Tracev((stderr, "inflate: reset\n"));
+ return Z_OK;
+}
+
+int ZEXPORT inflatePrime(strm, bits, value)
+z_streamp strm;
+int bits;
+int value;
+{
+ struct inflate_state FAR *state;
+
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ state = (struct inflate_state FAR *)strm->state;
+ if (bits > 16 || state->bits + bits > 32) return Z_STREAM_ERROR;
+ value &= (1L << bits) - 1;
+ state->hold += value << state->bits;
+ state->bits += bits;
+ return Z_OK;
+}
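+
+/*
+ * Editorial note, not part of the original source: as a usage sketch, a
+ * caller that has already consumed a few bits of the next input byte can
+ * hand them back with, e.g., inflatePrime(strm, 3, value); the low 3 bits
+ * of value are masked off and appended above whatever bits the accumulator
+ * already holds.
+ */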
+
+int ZEXPORT inflateInit2_(strm, windowBits, version, stream_size)
+z_streamp strm;
+int windowBits;
+const char *version;
+int stream_size;
+{
+ struct inflate_state FAR *state;
+
+ if (version == Z_NULL || version[0] != ZLIB_VERSION[0] ||
+ stream_size != (int)(sizeof(z_stream)))
+ return Z_VERSION_ERROR;
+ if (strm == Z_NULL) return Z_STREAM_ERROR;
+ strm->msg = Z_NULL; /* in case we return an error */
+ if (strm->zalloc == (alloc_func)0) {
+ strm->zalloc = zcalloc;
+ strm->opaque = (voidpf)0;
+ }
+ if (strm->zfree == (free_func)0) strm->zfree = zcfree;
+ state = (struct inflate_state FAR *)
+ ZALLOC(strm, 1, sizeof(struct inflate_state));
+ if (state == Z_NULL) return Z_MEM_ERROR;
+ Tracev((stderr, "inflate: allocated\n"));
+ strm->state = (struct internal_state FAR *)state;
+ if (windowBits < 0) {
+ state->wrap = 0;
+ windowBits = -windowBits;
+ }
+ else {
+ state->wrap = (windowBits >> 4) + 1;
+#ifdef GUNZIP
+ if (windowBits < 48) windowBits &= 15;
+#endif
+ }
+ if (windowBits < 8 || windowBits > 15) {
+ ZFREE(strm, state);
+ strm->state = Z_NULL;
+ return Z_STREAM_ERROR;
+ }
+ state->wbits = (unsigned)windowBits;
+ state->window = Z_NULL;
+ return inflateReset(strm);
+}
+
+int ZEXPORT inflateInit_(strm, version, stream_size)
+z_streamp strm;
+const char *version;
+int stream_size;
+{
+ return inflateInit2_(strm, DEF_WBITS, version, stream_size);
+}
+
+/*
+ Return state with length and distance decoding tables and index sizes set to
+ fixed code decoding. Normally this returns fixed tables from inffixed.h.
+ If BUILDFIXED is defined, then instead this routine builds the tables the
+ first time it's called, and returns those tables the first time and
+ thereafter. This reduces the size of the code by about 2K bytes, in
+ exchange for a little execution time. However, BUILDFIXED should not be
+ used for threaded applications, since the rewriting of the tables and of the
+ static virgin flag may not be thread-safe.
+ */
+local void fixedtables(state)
+struct inflate_state FAR *state;
+{
+#ifdef BUILDFIXED
+ static int virgin = 1;
+ static code *lenfix, *distfix;
+ static code fixed[544];
+
+ /* build fixed huffman tables if first call (may not be thread safe) */
+ if (virgin) {
+ unsigned sym, bits;
+ static code *next;
+
+ /* literal/length table */
+ sym = 0;
+ while (sym < 144) state->lens[sym++] = 8;
+ while (sym < 256) state->lens[sym++] = 9;
+ while (sym < 280) state->lens[sym++] = 7;
+ while (sym < 288) state->lens[sym++] = 8;
+ next = fixed;
+ lenfix = next;
+ bits = 9;
+ inflate_table(LENS, state->lens, 288, &(next), &(bits), state->work);
+
+ /* distance table */
+ sym = 0;
+ while (sym < 32) state->lens[sym++] = 5;
+ distfix = next;
+ bits = 5;
+ inflate_table(DISTS, state->lens, 32, &(next), &(bits), state->work);
+
+ /* do this just once */
+ virgin = 0;
+ }
+#else /* !BUILDFIXED */
+# include "inffixed.h"
+#endif /* BUILDFIXED */
+ state->lencode = lenfix;
+ state->lenbits = 9;
+ state->distcode = distfix;
+ state->distbits = 5;
+}
+
+#ifdef MAKEFIXED
+#include <stdio.h>
+
+/*
+ Write out the inffixed.h that is #include'd above. Defining MAKEFIXED also
+ defines BUILDFIXED, so the tables are built on the fly. makefixed() writes
+ those tables to stdout, which would be piped to inffixed.h. A small program
+ can simply call makefixed to do this:
+
+ void makefixed(void);
+
+ int main(void)
+ {
+ makefixed();
+ return 0;
+ }
+
+ Then that can be linked with zlib built with MAKEFIXED defined and run:
+
+ a.out > inffixed.h
+ */
+void makefixed()
+{
+ unsigned low, size;
+ struct inflate_state state;
+
+ fixedtables(&state);
+ puts(" /* inffixed.h -- table for decoding fixed codes");
+ puts(" * Generated automatically by makefixed().");
+ puts(" */");
+ puts("");
+ puts(" /* WARNING: this file should *not* be used by applications.");
+ puts(" It is part of the implementation of this library and is");
+ puts(" subject to change. Applications should only use zlib.h.");
+ puts(" */");
+ puts("");
+ size = 1U << 9;
+ printf(" static const code lenfix[%u] = {", size);
+ low = 0;
+ for (;;) {
+ if ((low % 7) == 0) printf("\n ");
+ printf("{%u,%u,%d}", state.lencode[low].op, state.lencode[low].bits,
+ state.lencode[low].val);
+ if (++low == size) break;
+ putchar(',');
+ }
+ puts("\n };");
+ size = 1U << 5;
+ printf("\n static const code distfix[%u] = {", size);
+ low = 0;
+ for (;;) {
+ if ((low % 6) == 0) printf("\n ");
+ printf("{%u,%u,%d}", state.distcode[low].op, state.distcode[low].bits,
+ state.distcode[low].val);
+ if (++low == size) break;
+ putchar(',');
+ }
+ puts("\n };");
+}
+#endif /* MAKEFIXED */
+
+/*
+ Update the window with the last wsize (normally 32K) bytes written before
+ returning. If window does not exist yet, create it. This is only called
+ when a window is already in use, or when output has been written during this
+ inflate call, but the end of the deflate stream has not been reached yet.
+ It is also called to create a window for dictionary data when a dictionary
+ is loaded.
+
+ Providing output buffers larger than 32K to inflate() should provide a speed
+ advantage, since only the last 32K of output is copied to the sliding window
+ upon return from inflate(), and since all distances after the first 32K of
+ output will fall in the output data, making match copies simpler and faster.
+ The advantage may be dependent on the size of the processor's data caches.
+ */
+local int updatewindow(strm, out)
+z_streamp strm;
+unsigned out;
+{
+ struct inflate_state FAR *state;
+ unsigned copy, dist;
+
+ state = (struct inflate_state FAR *)strm->state;
+
+ /* if it hasn't been done already, allocate space for the window */
+ if (state->window == Z_NULL) {
+ state->window = (unsigned char FAR *)
+ ZALLOC(strm, 1U << state->wbits,
+ sizeof(unsigned char));
+ if (state->window == Z_NULL) return 1;
+ }
+
+ /* if window not in use yet, initialize */
+ if (state->wsize == 0) {
+ state->wsize = 1U << state->wbits;
+ state->write = 0;
+ state->whave = 0;
+ }
+
+ /* copy state->wsize or less output bytes into the circular window */
+ copy = out - strm->avail_out;
+ if (copy >= state->wsize) {
+ zmemcpy(state->window, strm->next_out - state->wsize, state->wsize);
+ state->write = 0;
+ state->whave = state->wsize;
+ }
+ else {
+ dist = state->wsize - state->write;
+ if (dist > copy) dist = copy;
+ zmemcpy(state->window + state->write, strm->next_out - copy, dist);
+ copy -= dist;
+ if (copy) {
+ zmemcpy(state->window, strm->next_out - copy, copy);
+ state->write = copy;
+ state->whave = state->wsize;
+ }
+ else {
+ state->write += dist;
+ if (state->write == state->wsize) state->write = 0;
+ if (state->whave < state->wsize) state->whave += dist;
+ }
+ }
+ return 0;
+}
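+
+/*
+ * Editorial note, not part of the original source: a worked example of the
+ * wrap-around case above -- with wsize = 32768, write = 30000 and 5000 new
+ * output bytes to save, dist = 32768 - 30000 = 2768, so the first 2768
+ * bytes land at window + 30000, the remaining 2232 bytes wrap to the start
+ * of the window, write becomes 2232 and whave becomes wsize.
+ */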
+
+/* Macros for inflate(): */
+
+/* check function to use adler32() for zlib or crc32() for gzip */
+#ifdef GUNZIP
+# define UPDATE(check, buf, len) \
+ (state->flags ? crc32(check, buf, len) : adler32(check, buf, len))
+#else
+# define UPDATE(check, buf, len) adler32(check, buf, len)
+#endif
+
+/* check macros for header crc */
+#ifdef GUNZIP
+# define CRC2(check, word) \
+ do { \
+ hbuf[0] = (unsigned char)(word); \
+ hbuf[1] = (unsigned char)((word) >> 8); \
+ check = crc32(check, hbuf, 2); \
+ } while (0)
+
+# define CRC4(check, word) \
+ do { \
+ hbuf[0] = (unsigned char)(word); \
+ hbuf[1] = (unsigned char)((word) >> 8); \
+ hbuf[2] = (unsigned char)((word) >> 16); \
+ hbuf[3] = (unsigned char)((word) >> 24); \
+ check = crc32(check, hbuf, 4); \
+ } while (0)
+#endif
+
+/* Load registers with state in inflate() for speed */
+#define LOAD() \
+ do { \
+ put = strm->next_out; \
+ left = strm->avail_out; \
+ next = strm->next_in; \
+ have = strm->avail_in; \
+ hold = state->hold; \
+ bits = state->bits; \
+ } while (0)
+
+/* Restore state from registers in inflate() */
+#define RESTORE() \
+ do { \
+ strm->next_out = put; \
+ strm->avail_out = left; \
+ strm->next_in = next; \
+ strm->avail_in = have; \
+ state->hold = hold; \
+ state->bits = bits; \
+ } while (0)
+
+/* Clear the input bit accumulator */
+#define INITBITS() \
+ do { \
+ hold = 0; \
+ bits = 0; \
+ } while (0)
+
+/* Get a byte of input into the bit accumulator, or return from inflate()
+ if there is no input available. */
+#define PULLBYTE() \
+ do { \
+ if (have == 0) goto inf_leave; \
+ have--; \
+ hold += (unsigned long)(*next++) << bits; \
+ bits += 8; \
+ } while (0)
+
+/* Assure that there are at least n bits in the bit accumulator. If there is
+ not enough available input to do that, then return from inflate(). */
+#define NEEDBITS(n) \
+ do { \
+ while (bits < (unsigned)(n)) \
+ PULLBYTE(); \
+ } while (0)
+
+/* Return the low n bits of the bit accumulator (n < 16) */
+#define BITS(n) \
+ ((unsigned)hold & ((1U << (n)) - 1))
+
+/* Remove n bits from the bit accumulator */
+#define DROPBITS(n) \
+ do { \
+ hold >>= (n); \
+ bits -= (unsigned)(n); \
+ } while (0)
+
+/* Remove zero to seven bits as needed to go to a byte boundary */
+#define BYTEBITS() \
+ do { \
+ hold >>= bits & 7; \
+ bits -= bits & 7; \
+ } while (0)
+
+/* Reverse the bytes in a 32-bit value */
+#define REVERSE(q) \
+ ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
+ (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
+
+/*
+ inflate() uses a state machine to process as much input data and generate as
+ much output data as possible before returning. The state machine is
+ structured roughly as follows:
+
+ for (;;) switch (state) {
+ ...
+ case STATEn:
+ if (not enough input data or output space to make progress)
+ return;
+ ... make progress ...
+ state = STATEm;
+ break;
+ ...
+ }
+
+ so when inflate() is called again, the same case is attempted again, and
+ if the appropriate resources are provided, the machine proceeds to the
+ next state. The NEEDBITS() macro is usually the way the state evaluates
+ whether it can proceed or should return. NEEDBITS() does the return if
+ the requested bits are not available. The typical use of the BITS macros
+ is:
+
+ NEEDBITS(n);
+ ... do something with BITS(n) ...
+ DROPBITS(n);
+
+ where NEEDBITS(n) either returns from inflate() if there isn't enough
+ input left to load n bits into the accumulator, or it continues. BITS(n)
+ gives the low n bits in the accumulator. When done, DROPBITS(n) drops
+ the low n bits off the accumulator. INITBITS() clears the accumulator
+ and sets the number of available bits to zero. BYTEBITS() discards just
+ enough bits to put the accumulator on a byte boundary. After BYTEBITS()
+ and a NEEDBITS(8), then BITS(8) would return the next byte in the stream.
+
+ NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return
+ if there is no input available. The decoding of variable length codes uses
+ PULLBYTE() directly in order to pull just enough bytes to decode the next
+ code, and no more.
+
+ Some states loop until they get enough input, making sure that enough
+ state information is maintained to continue the loop where it left off
+ if NEEDBITS() returns in the loop. For example, want, need, and keep
+ would all have to actually be part of the saved state in case NEEDBITS()
+ returns:
+
+ case STATEw:
+ while (want < need) {
+ NEEDBITS(n);
+ keep[want++] = BITS(n);
+ DROPBITS(n);
+ }
+ state = STATEx;
+ case STATEx:
+
+ As shown above, if the next state is also the next case, then the break
+ is omitted.
+
+ A state may also return if there is not enough output space available to
+ complete that state. Those states are copying stored data, writing a
+ literal byte, and copying a matching string.
+
+ When returning, a "goto inf_leave" is used to update the total counters,
+ update the check value, and determine whether any progress has been made
+ during that inflate() call in order to return the proper return code.
+ Progress is defined as a change in either strm->avail_in or strm->avail_out.
+ When there is a window, goto inf_leave will update the window with the last
+ output written. If a goto inf_leave occurs in the middle of decompression
+ and there is no window currently, goto inf_leave will create one and copy
+ output to the window for the next call of inflate().
+
+ In this implementation, the flush parameter of inflate() only affects the
+ return code (per zlib.h). inflate() always writes as much as possible to
+ strm->next_out, given the space available and the provided input--the effect
+ documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers
+ the allocation of and copying into a sliding window until necessary, which
+ provides the effect documented in zlib.h for Z_FINISH when the entire input
+ stream is available. So the only thing the flush parameter actually does is:
+ when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it
+ will return Z_BUF_ERROR if it has not reached the end of the stream.
+ */
+
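+/*
+ Editor's note: illustrative sketch only, not part of the upstream zlib
+ source. The state machine below is driven entirely by the caller handing
+ inflate() input and output buffers until Z_STREAM_END. A minimal user-land
+ caller in the style of the zlib documentation (CHUNK, in_buf, out_buf and
+ the FILE *source are hypothetical):
+
+     #define CHUNK 16384
+     z_stream strm;
+     unsigned char in_buf[CHUNK], out_buf[CHUNK];
+     int ret;
+
+     memset(&strm, 0, sizeof(strm));         // zalloc/zfree/opaque = defaults
+     if (inflateInit(&strm) != Z_OK) return Z_MEM_ERROR;
+     do {
+         strm.avail_in = fread(in_buf, 1, CHUNK, source);
+         if (strm.avail_in == 0) break;      // truncated stream
+         strm.next_in = in_buf;
+         do {                                // run until output not full
+             strm.avail_out = CHUNK;
+             strm.next_out = out_buf;
+             ret = inflate(&strm, Z_NO_FLUSH);
+             // consume CHUNK - strm.avail_out bytes from out_buf here
+         } while (strm.avail_out == 0 && ret == Z_OK);
+     } while (ret == Z_OK);
+     (void) inflateEnd(&strm);
+     // ret is Z_STREAM_END on success, an error code otherwise
+ */
+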
+int ZEXPORT inflate(strm, flush)
+z_streamp strm;
+int flush;
+{
+ struct inflate_state FAR *state;
+ unsigned char FAR *next; /* next input */
+ unsigned char FAR *put; /* next output */
+ unsigned have, left; /* available input and output */
+ unsigned long hold; /* bit buffer */
+ unsigned bits; /* bits in bit buffer */
+ unsigned in, out; /* save starting available input and output */
+ unsigned copy; /* number of stored or match bytes to copy */
+ unsigned char FAR *from; /* where to copy match bytes from */
+ code this; /* current decoding table entry */
+ code last; /* parent table entry */
+ unsigned len; /* length to copy for repeats, bits to drop */
+ int ret; /* return code */
+#ifdef GUNZIP
+ unsigned char hbuf[4]; /* buffer for gzip header crc calculation */
+#endif
+ static const unsigned short order[19] = /* permutation of code lengths */
+ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+ if (strm == Z_NULL || strm->state == Z_NULL || strm->next_out == Z_NULL ||
+ (strm->next_in == Z_NULL && strm->avail_in != 0))
+ return Z_STREAM_ERROR;
+
+ state = (struct inflate_state FAR *)strm->state;
+ if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */
+ LOAD();
+ in = have;
+ out = left;
+ ret = Z_OK;
+ for (;;)
+ switch (state->mode) {
+ case HEAD:
+ if (state->wrap == 0) {
+ state->mode = TYPEDO;
+ break;
+ }
+ NEEDBITS(16);
+#ifdef GUNZIP
+ if ((state->wrap & 2) && hold == 0x8b1f) { /* gzip header */
+ state->check = crc32(0L, Z_NULL, 0);
+ CRC2(state->check, hold);
+ INITBITS();
+ state->mode = FLAGS;
+ break;
+ }
+ state->flags = 0; /* expect zlib header */
+ if (state->head != Z_NULL)
+ state->head->done = -1;
+ if (!(state->wrap & 1) || /* check if zlib header allowed */
+#else
+ if (
+#endif
+ ((BITS(8) << 8) + (hold >> 8)) % 31) {
+ strm->msg = (char *)"incorrect header check";
+ state->mode = BAD;
+ break;
+ }
+ if (BITS(4) != Z_DEFLATED) {
+ strm->msg = (char *)"unknown compression method";
+ state->mode = BAD;
+ break;
+ }
+ DROPBITS(4);
+ len = BITS(4) + 8;
+ if (len > state->wbits) {
+ strm->msg = (char *)"invalid window size";
+ state->mode = BAD;
+ break;
+ }
+ state->dmax = 1U << len;
+ Tracev((stderr, "inflate: zlib header ok\n"));
+ strm->adler = state->check = adler32(0L, Z_NULL, 0);
+ state->mode = hold & 0x200 ? DICTID : TYPE;
+ INITBITS();
+ break;
+#ifdef GUNZIP
+ case FLAGS:
+ NEEDBITS(16);
+ state->flags = (int)(hold);
+ if ((state->flags & 0xff) != Z_DEFLATED) {
+ strm->msg = (char *)"unknown compression method";
+ state->mode = BAD;
+ break;
+ }
+ if (state->flags & 0xe000) {
+ strm->msg = (char *)"unknown header flags set";
+ state->mode = BAD;
+ break;
+ }
+ if (state->head != Z_NULL)
+ state->head->text = (int)((hold >> 8) & 1);
+ if (state->flags & 0x0200) CRC2(state->check, hold);
+ INITBITS();
+ state->mode = TIME;
+ /*FALLTHRU*/
+ case TIME:
+ NEEDBITS(32);
+ if (state->head != Z_NULL)
+ state->head->time = hold;
+ if (state->flags & 0x0200) CRC4(state->check, hold);
+ INITBITS();
+ state->mode = OS;
+ /*FALLTHRU*/
+ case OS:
+ NEEDBITS(16);
+ if (state->head != Z_NULL) {
+ state->head->xflags = (int)(hold & 0xff);
+ state->head->os = (int)(hold >> 8);
+ }
+ if (state->flags & 0x0200) CRC2(state->check, hold);
+ INITBITS();
+ state->mode = EXLEN;
+ /*FALLTHRU*/
+ case EXLEN:
+ if (state->flags & 0x0400) {
+ NEEDBITS(16);
+ state->length = (unsigned)(hold);
+ if (state->head != Z_NULL)
+ state->head->extra_len = (unsigned)hold;
+ if (state->flags & 0x0200) CRC2(state->check, hold);
+ INITBITS();
+ }
+ else if (state->head != Z_NULL)
+ state->head->extra = Z_NULL;
+ state->mode = EXTRA;
+ /*FALLTHRU*/
+ case EXTRA:
+ if (state->flags & 0x0400) {
+ copy = state->length;
+ if (copy > have) copy = have;
+ if (copy) {
+ if (state->head != Z_NULL &&
+ state->head->extra != Z_NULL) {
+ len = state->head->extra_len - state->length;
+ zmemcpy(state->head->extra + len, next,
+ len + copy > state->head->extra_max ?
+ state->head->extra_max - len : copy);
+ }
+ if (state->flags & 0x0200)
+ state->check = crc32(state->check, next, copy);
+ have -= copy;
+ next += copy;
+ state->length -= copy;
+ }
+ if (state->length) goto inf_leave;
+ }
+ state->length = 0;
+ state->mode = NAME;
+ /*FALLTHRU*/
+ case NAME:
+ if (state->flags & 0x0800) {
+ if (have == 0) goto inf_leave;
+ copy = 0;
+ do {
+ len = (unsigned)(next[copy++]);
+ if (state->head != Z_NULL &&
+ state->head->name != Z_NULL &&
+ state->length < state->head->name_max)
+ state->head->name[state->length++] = len;
+ } while (len && copy < have);
+ if (state->flags & 0x0200)
+ state->check = crc32(state->check, next, copy);
+ have -= copy;
+ next += copy;
+ if (len) goto inf_leave;
+ }
+ else if (state->head != Z_NULL)
+ state->head->name = Z_NULL;
+ state->length = 0;
+ state->mode = COMMENT;
+ /*FALLTHRU*/
+ case COMMENT:
+ if (state->flags & 0x1000) {
+ if (have == 0) goto inf_leave;
+ copy = 0;
+ do {
+ len = (unsigned)(next[copy++]);
+ if (state->head != Z_NULL &&
+ state->head->comment != Z_NULL &&
+ state->length < state->head->comm_max)
+ state->head->comment[state->length++] = len;
+ } while (len && copy < have);
+ if (state->flags & 0x0200)
+ state->check = crc32(state->check, next, copy);
+ have -= copy;
+ next += copy;
+ if (len) goto inf_leave;
+ }
+ else if (state->head != Z_NULL)
+ state->head->comment = Z_NULL;
+ state->mode = HCRC;
+ /*FALLTHRU*/
+ case HCRC:
+ if (state->flags & 0x0200) {
+ NEEDBITS(16);
+ if (hold != (state->check & 0xffff)) {
+ strm->msg = (char *)"header crc mismatch";
+ state->mode = BAD;
+ break;
+ }
+ INITBITS();
+ }
+ if (state->head != Z_NULL) {
+ state->head->hcrc = (int)((state->flags >> 9) & 1);
+ state->head->done = 1;
+ }
+ strm->adler = state->check = crc32(0L, Z_NULL, 0);
+ state->mode = TYPE;
+ break;
+#endif
+ case DICTID:
+ NEEDBITS(32);
+ strm->adler = state->check = REVERSE(hold);
+ INITBITS();
+ state->mode = DICT;
+ /*FALLTHRU*/
+ case DICT:
+ if (state->havedict == 0) {
+ RESTORE();
+ return Z_NEED_DICT;
+ }
+ strm->adler = state->check = adler32(0L, Z_NULL, 0);
+ state->mode = TYPE;
+ /*FALLTHRU*/
+ case TYPE:
+ if (flush == Z_BLOCK) goto inf_leave;
+ /*FALLTHRU*/
+ case TYPEDO:
+ if (state->last) {
+ BYTEBITS();
+ state->mode = CHECK;
+ break;
+ }
+ NEEDBITS(3);
+ state->last = BITS(1);
+ DROPBITS(1);
+ switch (BITS(2)) {
+ case 0: /* stored block */
+ Tracev((stderr, "inflate: stored block%s\n",
+ state->last ? " (last)" : ""));
+ state->mode = STORED;
+ break;
+ case 1: /* fixed block */
+ fixedtables(state);
+ Tracev((stderr, "inflate: fixed codes block%s\n",
+ state->last ? " (last)" : ""));
+ state->mode = LEN; /* decode codes */
+ break;
+ case 2: /* dynamic block */
+ Tracev((stderr, "inflate: dynamic codes block%s\n",
+ state->last ? " (last)" : ""));
+ state->mode = TABLE;
+ break;
+ case 3:
+ strm->msg = (char *)"invalid block type";
+ state->mode = BAD;
+ }
+ DROPBITS(2);
+ break;
+ case STORED:
+ BYTEBITS(); /* go to byte boundary */
+ NEEDBITS(32);
+ if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) {
+ strm->msg = (char *)"invalid stored block lengths";
+ state->mode = BAD;
+ break;
+ }
+ state->length = (unsigned)hold & 0xffff;
+ Tracev((stderr, "inflate: stored length %u\n",
+ state->length));
+ INITBITS();
+ state->mode = COPY;
+ /*FALLTHRU*/
+ case COPY:
+ copy = state->length;
+ if (copy) {
+ if (copy > have) copy = have;
+ if (copy > left) copy = left;
+ if (copy == 0) goto inf_leave;
+ zmemcpy(put, next, copy);
+ have -= copy;
+ next += copy;
+ left -= copy;
+ put += copy;
+ state->length -= copy;
+ break;
+ }
+ Tracev((stderr, "inflate: stored end\n"));
+ state->mode = TYPE;
+ break;
+ case TABLE:
+ NEEDBITS(14);
+ state->nlen = BITS(5) + 257;
+ DROPBITS(5);
+ state->ndist = BITS(5) + 1;
+ DROPBITS(5);
+ state->ncode = BITS(4) + 4;
+ DROPBITS(4);
+#ifndef PKZIP_BUG_WORKAROUND
+ if (state->nlen > 286 || state->ndist > 30) {
+ strm->msg = (char *)"too many length or distance symbols";
+ state->mode = BAD;
+ break;
+ }
+#endif
+ Tracev((stderr, "inflate: table sizes ok\n"));
+ state->have = 0;
+ state->mode = LENLENS;
+ /*FALLTHRU*/
+ case LENLENS:
+ while (state->have < state->ncode) {
+ NEEDBITS(3);
+ state->lens[order[state->have++]] = (unsigned short)BITS(3);
+ DROPBITS(3);
+ }
+ while (state->have < 19)
+ state->lens[order[state->have++]] = 0;
+ state->next = state->codes;
+ state->lencode = (code const FAR *)(state->next);
+ state->lenbits = 7;
+ ret = inflate_table(CODES, state->lens, 19, &(state->next),
+ &(state->lenbits), state->work);
+ if (ret) {
+ strm->msg = (char *)"invalid code lengths set";
+ state->mode = BAD;
+ break;
+ }
+ Tracev((stderr, "inflate: code lengths ok\n"));
+ state->have = 0;
+ state->mode = CODELENS;
+ /*FALLTHRU*/
+ case CODELENS:
+ while (state->have < state->nlen + state->ndist) {
+ for (;;) {
+ this = state->lencode[BITS(state->lenbits)];
+ if ((unsigned)(this.bits) <= bits) break;
+ PULLBYTE();
+ }
+ if (this.val < 16) {
+ NEEDBITS(this.bits);
+ DROPBITS(this.bits);
+ state->lens[state->have++] = this.val;
+ }
+ else {
+ if (this.val == 16) {
+ NEEDBITS(this.bits + 2);
+ DROPBITS(this.bits);
+ if (state->have == 0) {
+ strm->msg = (char *)"invalid bit length repeat";
+ state->mode = BAD;
+ break;
+ }
+ len = state->lens[state->have - 1];
+ copy = 3 + BITS(2);
+ DROPBITS(2);
+ }
+ else if (this.val == 17) {
+ NEEDBITS(this.bits + 3);
+ DROPBITS(this.bits);
+ len = 0;
+ copy = 3 + BITS(3);
+ DROPBITS(3);
+ }
+ else {
+ NEEDBITS(this.bits + 7);
+ DROPBITS(this.bits);
+ len = 0;
+ copy = 11 + BITS(7);
+ DROPBITS(7);
+ }
+ if (state->have + copy > state->nlen + state->ndist) {
+ strm->msg = (char *)"invalid bit length repeat";
+ state->mode = BAD;
+ break;
+ }
+ while (copy--)
+ state->lens[state->have++] = (unsigned short)len;
+ }
+ }
+
+ /* handle error breaks in while */
+ if (state->mode == BAD) break;
+
+ /* build code tables */
+ state->next = state->codes;
+ state->lencode = (code const FAR *)(state->next);
+ state->lenbits = 9;
+ ret = inflate_table(LENS, state->lens, state->nlen, &(state->next),
+ &(state->lenbits), state->work);
+ if (ret) {
+ strm->msg = (char *)"invalid literal/lengths set";
+ state->mode = BAD;
+ break;
+ }
+ state->distcode = (code const FAR *)(state->next);
+ state->distbits = 6;
+ ret = inflate_table(DISTS, state->lens + state->nlen, state->ndist,
+ &(state->next), &(state->distbits), state->work);
+ if (ret) {
+ strm->msg = (char *)"invalid distances set";
+ state->mode = BAD;
+ break;
+ }
+ Tracev((stderr, "inflate: codes ok\n"));
+ state->mode = LEN;
+ /*FALLTHRU*/
+ case LEN:
+ if (have >= 6 && left >= 258) {
+ RESTORE();
+ inflate_fast(strm, out);
+ LOAD();
+ break;
+ }
+ for (;;) {
+ this = state->lencode[BITS(state->lenbits)];
+ if ((unsigned)(this.bits) <= bits) break;
+ PULLBYTE();
+ }
+ if (this.op && (this.op & 0xf0) == 0) {
+ last = this;
+ for (;;) {
+ this = state->lencode[last.val +
+ (BITS(last.bits + last.op) >> last.bits)];
+ if ((unsigned)(last.bits + this.bits) <= bits) break;
+ PULLBYTE();
+ }
+ DROPBITS(last.bits);
+ }
+ DROPBITS(this.bits);
+ state->length = (unsigned)this.val;
+ if ((int)(this.op) == 0) {
+ Tracevv((stderr, this.val >= 0x20 && this.val < 0x7f ?
+ "inflate: literal '%c'\n" :
+ "inflate: literal 0x%02x\n", this.val));
+ state->mode = LIT;
+ break;
+ }
+ if (this.op & 32) {
+ Tracevv((stderr, "inflate: end of block\n"));
+ state->mode = TYPE;
+ break;
+ }
+ if (this.op & 64) {
+ strm->msg = (char *)"invalid literal/length code";
+ state->mode = BAD;
+ break;
+ }
+ state->extra = (unsigned)(this.op) & 15;
+ state->mode = LENEXT;
+ /*FALLTHRU*/
+ case LENEXT:
+ if (state->extra) {
+ NEEDBITS(state->extra);
+ state->length += BITS(state->extra);
+ DROPBITS(state->extra);
+ }
+ Tracevv((stderr, "inflate: length %u\n", state->length));
+ state->mode = DIST;
+ /*FALLTHRU*/
+ case DIST:
+ for (;;) {
+ this = state->distcode[BITS(state->distbits)];
+ if ((unsigned)(this.bits) <= bits) break;
+ PULLBYTE();
+ }
+ if ((this.op & 0xf0) == 0) {
+ last = this;
+ for (;;) {
+ this = state->distcode[last.val +
+ (BITS(last.bits + last.op) >> last.bits)];
+ if ((unsigned)(last.bits + this.bits) <= bits) break;
+ PULLBYTE();
+ }
+ DROPBITS(last.bits);
+ }
+ DROPBITS(this.bits);
+ if (this.op & 64) {
+ strm->msg = (char *)"invalid distance code";
+ state->mode = BAD;
+ break;
+ }
+ state->offset = (unsigned)this.val;
+ state->extra = (unsigned)(this.op) & 15;
+ state->mode = DISTEXT;
+ /*FALLTHRU*/
+ case DISTEXT:
+ if (state->extra) {
+ NEEDBITS(state->extra);
+ state->offset += BITS(state->extra);
+ DROPBITS(state->extra);
+ }
+#ifdef INFLATE_STRICT
+ if (state->offset > state->dmax) {
+ strm->msg = (char *)"invalid distance too far back";
+ state->mode = BAD;
+ break;
+ }
+#endif
+ if (state->offset > state->whave + out - left) {
+ strm->msg = (char *)"invalid distance too far back";
+ state->mode = BAD;
+ break;
+ }
+ Tracevv((stderr, "inflate: distance %u\n", state->offset));
+ state->mode = MATCH;
+ /*FALLTHRU*/
+ case MATCH:
+ if (left == 0) goto inf_leave;
+ copy = out - left;
+ if (state->offset > copy) { /* copy from window */
+ copy = state->offset - copy;
+ if (copy > state->write) {
+ copy -= state->write;
+ from = state->window + (state->wsize - copy);
+ }
+ else
+ from = state->window + (state->write - copy);
+ if (copy > state->length) copy = state->length;
+ }
+ else { /* copy from output */
+ from = put - state->offset;
+ copy = state->length;
+ }
+ if (copy > left) copy = left;
+ left -= copy;
+ state->length -= copy;
+ do {
+ *put++ = *from++;
+ } while (--copy);
+ if (state->length == 0) state->mode = LEN;
+ break;
+ case LIT:
+ if (left == 0) goto inf_leave;
+ *put++ = (unsigned char)(state->length);
+ left--;
+ state->mode = LEN;
+ break;
+ case CHECK:
+ if (state->wrap) {
+ NEEDBITS(32);
+ out -= left;
+ strm->total_out += out;
+ state->total += out;
+ if (out)
+ strm->adler = state->check =
+ UPDATE(state->check, put - out, out);
+ out = left;
+ if ((
+#ifdef GUNZIP
+ state->flags ? hold :
+#endif
+ REVERSE(hold)) != state->check) {
+ strm->msg = (char *)"incorrect data check";
+ state->mode = BAD;
+ break;
+ }
+ INITBITS();
+ Tracev((stderr, "inflate: check matches trailer\n"));
+ }
+#ifdef GUNZIP
+ state->mode = LENGTH;
+ /*FALLTHRU*/
+ case LENGTH:
+ if (state->wrap && state->flags) {
+ NEEDBITS(32);
+ if (hold != (state->total & 0xffffffffUL)) {
+ strm->msg = (char *)"incorrect length check";
+ state->mode = BAD;
+ break;
+ }
+ INITBITS();
+ Tracev((stderr, "inflate: length matches trailer\n"));
+ }
+#endif
+ state->mode = DONE;
+ /*FALLTHRU*/
+ case DONE:
+ ret = Z_STREAM_END;
+ goto inf_leave;
+ case BAD:
+ ret = Z_DATA_ERROR;
+ goto inf_leave;
+ case MEM:
+ return Z_MEM_ERROR;
+ case SYNC:
+ default:
+ return Z_STREAM_ERROR;
+ }
+
+ /*
+ Return from inflate(), updating the total counts and the check value.
+ If there was no progress during the inflate() call, return a buffer
+ error. Call updatewindow() to create and/or update the window state.
+ Note: a memory error from inflate() is non-recoverable.
+ */
+ inf_leave:
+ RESTORE();
+ if (state->wsize || (state->mode < CHECK && out != strm->avail_out))
+ if (updatewindow(strm, out)) {
+ state->mode = MEM;
+ return Z_MEM_ERROR;
+ }
+ in -= strm->avail_in;
+ out -= strm->avail_out;
+ strm->total_in += in;
+ strm->total_out += out;
+ state->total += out;
+ if (state->wrap && out)
+ strm->adler = state->check =
+ UPDATE(state->check, strm->next_out - out, out);
+ strm->data_type = state->bits + (state->last ? 64 : 0) +
+ (state->mode == TYPE ? 128 : 0);
+ if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK)
+ ret = Z_BUF_ERROR;
+ return ret;
+}
+
+int ZEXPORT inflateEnd(strm)
+z_streamp strm;
+{
+ struct inflate_state FAR *state;
+ if (strm == Z_NULL || strm->state == Z_NULL || strm->zfree == (free_func)0)
+ return Z_STREAM_ERROR;
+ state = (struct inflate_state FAR *)strm->state;
+ if (state->window != Z_NULL) ZFREE(strm, state->window);
+ ZFREE(strm, strm->state);
+ strm->state = Z_NULL;
+ Tracev((stderr, "inflate: end\n"));
+ return Z_OK;
+}
+
+int ZEXPORT inflateSetDictionary(strm, dictionary, dictLength)
+z_streamp strm;
+const Bytef *dictionary;
+uInt dictLength;
+{
+ struct inflate_state FAR *state;
+ unsigned long id;
+
+ /* check state */
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ state = (struct inflate_state FAR *)strm->state;
+ if (state->wrap != 0 && state->mode != DICT)
+ return Z_STREAM_ERROR;
+
+ /* check for correct dictionary id */
+ if (state->mode == DICT) {
+ id = adler32(0L, Z_NULL, 0);
+ id = adler32(id, dictionary, dictLength);
+ if (id != state->check)
+ return Z_DATA_ERROR;
+ }
+
+ /* copy dictionary to window */
+ if (updatewindow(strm, strm->avail_out)) {
+ state->mode = MEM;
+ return Z_MEM_ERROR;
+ }
+ if (dictLength > state->wsize) {
+ zmemcpy(state->window, dictionary + dictLength - state->wsize,
+ state->wsize);
+ state->whave = state->wsize;
+ }
+ else {
+ zmemcpy(state->window + state->wsize - dictLength, dictionary,
+ dictLength);
+ state->whave = dictLength;
+ }
+ state->havedict = 1;
+ Tracev((stderr, "inflate: dictionary set\n"));
+ return Z_OK;
+}
+
+int ZEXPORT inflateGetHeader(strm, head)
+z_streamp strm;
+gz_headerp head;
+{
+ struct inflate_state FAR *state;
+
+ /* check state */
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ state = (struct inflate_state FAR *)strm->state;
+ if ((state->wrap & 2) == 0) return Z_STREAM_ERROR;
+
+ /* save header structure */
+ state->head = head;
+ head->done = 0;
+ return Z_OK;
+}
+
+/*
+ Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff. Return when found
+ or when out of input. When called, *have is the number of pattern bytes
+ found in order so far, in 0..3. On return *have is updated to the new
+ state. If on return *have equals four, then the pattern was found and the
+ return value is how many bytes were read including the last byte of the
+ pattern. If *have is less than four, then the pattern has not been found
+ yet and the return value is len. In the latter case, syncsearch() can be
+ called again with more data and the *have state. *have is initialized to
+ zero for the first call.
+ */
+local unsigned syncsearch(have, buf, len)
+unsigned FAR *have;
+unsigned char FAR *buf;
+unsigned len;
+{
+ unsigned got;
+ unsigned next;
+
+ got = *have;
+ next = 0;
+ while (next < len && got < 4) {
+ if ((int)(buf[next]) == (got < 2 ? 0 : 0xff))
+ got++;
+ else if (buf[next])
+ got = 0;
+ else
+ got = 4 - got;
+ next++;
+ }
+ *have = got;
+ return next;
+}
+
+int ZEXPORT inflateSync(strm)
+z_streamp strm;
+{
+ unsigned len; /* number of bytes to look at or looked at */
+ unsigned long in, out; /* temporary to save total_in and total_out */
+ unsigned char buf[4]; /* to restore bit buffer to byte string */
+ struct inflate_state FAR *state;
+
+ /* check parameters */
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ state = (struct inflate_state FAR *)strm->state;
+ if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR;
+
+ /* if first time, start search in bit buffer */
+ if (state->mode != SYNC) {
+ state->mode = SYNC;
+ state->hold <<= state->bits & 7;
+ state->bits -= state->bits & 7;
+ len = 0;
+ while (state->bits >= 8) {
+ buf[len++] = (unsigned char)(state->hold);
+ state->hold >>= 8;
+ state->bits -= 8;
+ }
+ state->have = 0;
+ (void) syncsearch(&(state->have), buf, len);
+ }
+
+ /* search available input */
+ len = syncsearch(&(state->have), strm->next_in, strm->avail_in);
+ strm->avail_in -= len;
+ strm->next_in += len;
+ strm->total_in += len;
+
+ /* return no joy or set up to restart inflate() on a new block */
+ if (state->have != 4) return Z_DATA_ERROR;
+ in = strm->total_in; out = strm->total_out;
+ (void) inflateReset(strm);
+ strm->total_in = in; strm->total_out = out;
+ state->mode = TYPE;
+ return Z_OK;
+}
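+
+/*
+ Editor's note: illustrative sketch only, not part of the upstream zlib
+ source. A typical application-side use of inflateSync() is to recover
+ after inflate() reports Z_DATA_ERROR on a corrupted stream by skipping
+ ahead to the next full-flush point (the 00 00 ff ff marker of an empty
+ stored block):
+
+     int ret = inflate(&strm, Z_NO_FLUSH);
+     if (ret == Z_DATA_ERROR) {
+         // refill strm.next_in / strm.avail_in as needed, then:
+         if (inflateSync(&strm) == Z_OK)
+             ret = inflate(&strm, Z_NO_FLUSH);   // resume at the new block
+     }
+
+ Data between the error and the sync point is lost; note that the code
+ above preserves total_in and total_out across its internal inflateReset().
+ */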
+
+/*
+ Returns true if inflate is currently at the end of a block generated by
+ Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP
+ implementation to provide an additional safety check. PPP uses
+ Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored
+ block. When decompressing, PPP checks that at the end of the input packet,
+ inflate is waiting for these length bytes.
+ */
+int ZEXPORT inflateSyncPoint(strm)
+z_streamp strm;
+{
+ struct inflate_state FAR *state;
+
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ state = (struct inflate_state FAR *)strm->state;
+ return state->mode == STORED && state->bits == 0;
+}
+
+int ZEXPORT inflateCopy(dest, source)
+z_streamp dest;
+z_streamp source;
+{
+ struct inflate_state FAR *state;
+ struct inflate_state FAR *copy;
+ unsigned char FAR *window;
+ unsigned wsize;
+
+ /* check input */
+ if (dest == Z_NULL || source == Z_NULL || source->state == Z_NULL ||
+ source->zalloc == (alloc_func)0 || source->zfree == (free_func)0)
+ return Z_STREAM_ERROR;
+ state = (struct inflate_state FAR *)source->state;
+
+ /* allocate space */
+ copy = (struct inflate_state FAR *)
+ ZALLOC(source, 1, sizeof(struct inflate_state));
+ if (copy == Z_NULL) return Z_MEM_ERROR;
+ window = Z_NULL;
+ if (state->window != Z_NULL) {
+ window = (unsigned char FAR *)
+ ZALLOC(source, 1U << state->wbits, sizeof(unsigned char));
+ if (window == Z_NULL) {
+ ZFREE(source, copy);
+ return Z_MEM_ERROR;
+ }
+ }
+
+ /* copy state */
+ zmemcpy(dest, source, sizeof(z_stream));
+ zmemcpy(copy, state, sizeof(struct inflate_state));
+ if (state->lencode >= state->codes &&
+ state->lencode <= state->codes + ENOUGH - 1) {
+ copy->lencode = copy->codes + (state->lencode - state->codes);
+ copy->distcode = copy->codes + (state->distcode - state->codes);
+ }
+ copy->next = copy->codes + (state->next - state->codes);
+ if (window != Z_NULL) {
+ wsize = 1U << state->wbits;
+ zmemcpy(window, state->window, wsize);
+ }
+ copy->window = window;
+ dest->state = (struct internal_state FAR *)copy;
+ return Z_OK;
+}
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inflate.h b/sys/contrib/opensolaris/uts/common/zmod/inflate.h
new file mode 100644
index 000000000000..4d28b221770b
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/inflate.h
@@ -0,0 +1,117 @@
+/* inflate.h -- internal inflate state definition
+ * Copyright (C) 1995-2004 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/* WARNING: this file should *not* be used by applications. It is
+ part of the implementation of the compression library and is
+ subject to change. Applications should only use zlib.h.
+ */
+
+/* define NO_GZIP when compiling if you want to disable gzip header and
+ trailer decoding by inflate(). NO_GZIP would be used to avoid linking in
+ the crc code when it is not needed. For shared libraries, gzip decoding
+ should be left enabled. */
+#ifndef NO_GZIP
+# define GUNZIP
+#endif
+
+/* Possible inflate modes between inflate() calls */
+typedef enum {
+ HEAD, /* i: waiting for magic header */
+ FLAGS, /* i: waiting for method and flags (gzip) */
+ TIME, /* i: waiting for modification time (gzip) */
+ OS, /* i: waiting for extra flags and operating system (gzip) */
+ EXLEN, /* i: waiting for extra length (gzip) */
+ EXTRA, /* i: waiting for extra bytes (gzip) */
+ NAME, /* i: waiting for end of file name (gzip) */
+ COMMENT, /* i: waiting for end of comment (gzip) */
+ HCRC, /* i: waiting for header crc (gzip) */
+ DICTID, /* i: waiting for dictionary check value */
+ DICT, /* waiting for inflateSetDictionary() call */
+ TYPE, /* i: waiting for type bits, including last-flag bit */
+ TYPEDO, /* i: same, but skip check to exit inflate on new block */
+ STORED, /* i: waiting for stored size (length and complement) */
+ COPY, /* i/o: waiting for input or output to copy stored block */
+ TABLE, /* i: waiting for dynamic block table lengths */
+ LENLENS, /* i: waiting for code length code lengths */
+ CODELENS, /* i: waiting for length/lit and distance code lengths */
+ LEN, /* i: waiting for length/lit code */
+ LENEXT, /* i: waiting for length extra bits */
+ DIST, /* i: waiting for distance code */
+ DISTEXT, /* i: waiting for distance extra bits */
+ MATCH, /* o: waiting for output space to copy string */
+ LIT, /* o: waiting for output space to write literal */
+ CHECK, /* i: waiting for 32-bit check value */
+ LENGTH, /* i: waiting for 32-bit length (gzip) */
+ DONE, /* finished check, done -- remain here until reset */
+ BAD, /* got a data error -- remain here until reset */
+ MEM, /* got an inflate() memory error -- remain here until reset */
+ SYNC /* looking for synchronization bytes to restart inflate() */
+} inflate_mode;
+
+/*
+ State transitions between above modes -
+
+ (most modes can go to the BAD or MEM mode -- not shown for clarity)
+
+ Process header:
+ HEAD -> (gzip) or (zlib)
+ (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME
+ NAME -> COMMENT -> HCRC -> TYPE
+ (zlib) -> DICTID or TYPE
+ DICTID -> DICT -> TYPE
+ Read deflate blocks:
+ TYPE -> STORED or TABLE or LEN or CHECK
+ STORED -> COPY -> TYPE
+ TABLE -> LENLENS -> CODELENS -> LEN
+ Read deflate codes:
+ LEN -> LENEXT or LIT or TYPE
+ LENEXT -> DIST -> DISTEXT -> MATCH -> LEN
+ LIT -> LEN
+ Process trailer:
+ CHECK -> LENGTH -> DONE
+ */
+
+/* state maintained between inflate() calls. Approximately 7K bytes. */
+struct inflate_state {
+ inflate_mode mode; /* current inflate mode */
+ int last; /* true if processing last block */
+ int wrap; /* bit 0 true for zlib, bit 1 true for gzip */
+ int havedict; /* true if dictionary provided */
+ int flags; /* gzip header method and flags (0 if zlib) */
+ unsigned dmax; /* zlib header max distance (INFLATE_STRICT) */
+ unsigned long check; /* protected copy of check value */
+ unsigned long total; /* protected copy of output count */
+ gz_headerp head; /* where to save gzip header information */
+ /* sliding window */
+ unsigned wbits; /* log base 2 of requested window size */
+ unsigned wsize; /* window size or zero if not using window */
+ unsigned whave; /* valid bytes in the window */
+ unsigned write; /* window write index */
+ unsigned char FAR *window; /* allocated sliding window, if needed */
+ /* bit accumulator */
+ unsigned long hold; /* input bit accumulator */
+ unsigned bits; /* number of bits in the bit accumulator "hold" */
+ /* for string and stored block copying */
+ unsigned length; /* literal or length of data to copy */
+ unsigned offset; /* distance back to copy string from */
+ /* for table and code decoding */
+ unsigned extra; /* extra bits needed */
+ /* fixed and dynamic code tables */
+ code const FAR *lencode; /* starting table for length/literal codes */
+ code const FAR *distcode; /* starting table for distance codes */
+ unsigned lenbits; /* index bits for lencode */
+ unsigned distbits; /* index bits for distcode */
+ /* dynamic table building */
+ unsigned ncode; /* number of code length code lengths */
+ unsigned nlen; /* number of length code lengths */
+ unsigned ndist; /* number of distance code lengths */
+ unsigned have; /* number of code lengths in lens[] */
+ code FAR *next; /* next available space in codes[] */
+ unsigned short lens[320]; /* temporary storage for code lengths */
+ unsigned short work[288]; /* work area for code table building */
+ code codes[ENOUGH]; /* space for code tables */
+};
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inftrees.c b/sys/contrib/opensolaris/uts/common/zmod/inftrees.c
new file mode 100644
index 000000000000..2d371675301c
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/inftrees.c
@@ -0,0 +1,331 @@
+/* inftrees.c -- generate Huffman trees for efficient decoding
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "zutil.h"
+#include "inftrees.h"
+
+#define MAXBITS 15
+
+static const char inflate_copyright[] =
+ " inflate 1.2.3 Copyright 1995-2005 Mark Adler ";
+/*
+ If you use the zlib library in a product, an acknowledgment is welcome
+ in the documentation of your product. If for some reason you cannot
+ include such an acknowledgment, I would appreciate that you keep this
+ copyright string in the executable of your product.
+ */
+
+/*
+ Build a set of tables to decode the provided canonical Huffman code.
+ The code lengths are lens[0..codes-1]. The result starts at *table,
+ whose indices are 0..2^bits-1. work is a writable array of at least
+ lens shorts, which is used as a work area. type is the type of code
+ to be generated, CODES, LENS, or DISTS. On return, zero is success,
+ -1 is an invalid code, and +1 means that ENOUGH isn't enough. table
+ on return points to the next available entry's address. bits is the
+ requested root table index bits, and on return it is the actual root
+ table index bits. It will differ if the request is greater than the
+ longest code or if it is less than the shortest code.
+ */
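+
+/*
+ Editor's note: illustrative sketch only, not part of the upstream zlib
+ source. The canonical Huffman code that inflate_table() decodes is fully
+ determined by the code lengths alone; the assignment rule (RFC 1951,
+ section 3.2.2) looks like this with hypothetical names (lengths in
+ len[0..n-1], resulting codes written to code_out[]):
+
+     unsigned short count[MAXBITS+1] = {0}, next_code[MAXBITS+1];
+     unsigned code = 0, bits, sym;
+
+     for (sym = 0; sym < n; sym++) count[len[sym]]++;    // histogram
+     count[0] = 0;                                       // unused symbols
+     for (bits = 1; bits <= MAXBITS; bits++) {           // first code per length
+         code = (code + count[bits - 1]) << 1;
+         next_code[bits] = code;
+     }
+     for (sym = 0; sym < n; sym++)                       // assign in symbol order
+         if (len[sym] != 0)
+             code_out[sym] = next_code[len[sym]]++;
+
+ inflate_table() below effectively inverts this mapping, filling decoding
+ tables indexed by the bit-reversed codes to match the deflate bit order.
+ */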
+int inflate_table(type, lens, codes, table, bits, work)
+codetype type;
+unsigned short FAR *lens;
+unsigned codes;
+code FAR * FAR *table;
+unsigned FAR *bits;
+unsigned short FAR *work;
+{
+ unsigned len; /* a code's length in bits */
+ unsigned sym; /* index of code symbols */
+ unsigned min, max; /* minimum and maximum code lengths */
+ unsigned root; /* number of index bits for root table */
+ unsigned curr; /* number of index bits for current table */
+ unsigned drop; /* code bits to drop for sub-table */
+ int left; /* number of prefix codes available */
+ unsigned used; /* code entries in table used */
+ unsigned huff; /* Huffman code */
+ unsigned incr; /* for incrementing code, index */
+ unsigned fill; /* index for replicating entries */
+ unsigned low; /* low bits for current root entry */
+ unsigned mask; /* mask for low root bits */
+ code this; /* table entry for duplication */
+ code FAR *next; /* next available space in table */
+ const unsigned short FAR *base; /* base value table to use */
+ const unsigned short FAR *extra; /* extra bits table to use */
+ int end; /* use base and extra for symbol > end */
+ unsigned short count[MAXBITS+1]; /* number of codes of each length */
+ unsigned short offs[MAXBITS+1]; /* offsets in table for each length */
+ static const unsigned short lbase[31] = { /* Length codes 257..285 base */
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+ 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
+ static const unsigned short lext[31] = { /* Length codes 257..285 extra */
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
+ 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 201, 196};
+ static const unsigned short dbase[32] = { /* Distance codes 0..29 base */
+ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
+ 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
+ 8193, 12289, 16385, 24577, 0, 0};
+ static const unsigned short dext[32] = { /* Distance codes 0..29 extra */
+ 16, 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
+ 23, 23, 24, 24, 25, 25, 26, 26, 27, 27,
+ 28, 28, 29, 29, 64, 64};
+
+ /*
+ Process a set of code lengths to create a canonical Huffman code. The
+ code lengths are lens[0..codes-1]. Each length corresponds to the
+ symbols 0..codes-1. The Huffman code is generated by first sorting the
+ symbols by length from short to long, and retaining the symbol order
+ for codes with equal lengths. Then the code starts with all zero bits
+ for the first code of the shortest length, and the codes are integer
+ increments for the same length, and zeros are appended as the length
+ increases. For the deflate format, these bits are stored backwards
+ from their more natural integer increment ordering, and so when the
+ decoding tables are built in the large loop below, the integer codes
+ are incremented backwards.
+
+ This routine assumes, but does not check, that all of the entries in
+ lens[] are in the range 0..MAXBITS. The caller must assure this.
+ 1..MAXBITS is interpreted as that code length. zero means that that
+ symbol does not occur in this code.
+
+ The codes are sorted by computing a count of codes for each length,
+ creating from that a table of starting indices for each length in the
+ sorted table, and then entering the symbols in order in the sorted
+ table. The sorted table is work[], with that space being provided by
+ the caller.
+
+ The length counts are used for other purposes as well, i.e. finding
+ the minimum and maximum length codes, determining if there are any
+ codes at all, checking for a valid set of lengths, and looking ahead
+ at length counts to determine sub-table sizes when building the
+ decoding tables.
+ */
+
+ /* accumulate lengths for codes (assumes lens[] all in 0..MAXBITS) */
+ for (len = 0; len <= MAXBITS; len++)
+ count[len] = 0;
+ for (sym = 0; sym < codes; sym++)
+ count[lens[sym]]++;
+
+ /* bound code lengths, force root to be within code lengths */
+ root = *bits;
+ for (max = MAXBITS; max >= 1; max--)
+ if (count[max] != 0) break;
+ if (root > max) root = max;
+ if (max == 0) { /* no symbols to code at all */
+ this.op = (unsigned char)64; /* invalid code marker */
+ this.bits = (unsigned char)1;
+ this.val = (unsigned short)0;
+ *(*table)++ = this; /* make a table to force an error */
+ *(*table)++ = this;
+ *bits = 1;
+ return 0; /* no symbols, but wait for decoding to report error */
+ }
+ for (min = 1; min <= MAXBITS; min++)
+ if (count[min] != 0) break;
+ if (root < min) root = min;
+
+ /* check for an over-subscribed or incomplete set of lengths */
+ left = 1;
+ for (len = 1; len <= MAXBITS; len++) {
+ left <<= 1;
+ left -= count[len];
+ if (left < 0) return -1; /* over-subscribed */
+ }
+ if (left > 0 && (type == CODES || max != 1))
+ return -1; /* incomplete set */
+
+ /* generate offsets into symbol table for each length for sorting */
+ offs[1] = 0;
+ for (len = 1; len < MAXBITS; len++)
+ offs[len + 1] = offs[len] + count[len];
+
+ /* sort symbols by length, by symbol order within each length */
+ for (sym = 0; sym < codes; sym++)
+ if (lens[sym] != 0) work[offs[lens[sym]]++] = (unsigned short)sym;
+
+ /*
+ Create and fill in decoding tables. In this loop, the table being
+ filled is at next and has curr index bits. The code being used is huff
+ with length len. That code is converted to an index by dropping drop
+ bits off of the bottom. For codes where len is less than drop + curr,
+ those top drop + curr - len bits are incremented through all values to
+ fill the table with replicated entries.
+
+ root is the number of index bits for the root table. When len exceeds
+ root, sub-tables are created pointed to by the root entry with an index
+ of the low root bits of huff. This is saved in low to check for when a
+ new sub-table should be started. drop is zero when the root table is
+ being filled, and drop is root when sub-tables are being filled.
+
+ When a new sub-table is needed, it is necessary to look ahead in the
+ code lengths to determine what size sub-table is needed. The length
+ counts are used for this, and so count[] is decremented as codes are
+ entered in the tables.
+
+ used keeps track of how many table entries have been allocated from the
+ provided *table space. It is checked when a LENS table is being made
+ against the space in *table, ENOUGH, minus the maximum space needed by
+ the worst case distance code, MAXD. This should never happen, but the
+ sufficiency of ENOUGH has not been proven exhaustively, hence the check.
+ This assumes that when type == LENS, bits == 9.
+
+ sym increments through all symbols, and the loop terminates when
+ all codes of length max, i.e. all codes, have been processed. This
+ routine permits incomplete codes, so another loop after this one fills
+ in the rest of the decoding tables with invalid code markers.
+ */
+
+ /* set up for code type */
+ switch (type) {
+ case CODES:
+ base = extra = work; /* dummy value--not used */
+ end = 19;
+ break;
+ case LENS:
+ base = lbase;
+ base -= 257;
+ extra = lext;
+ extra -= 257;
+ end = 256;
+ break;
+ default: /* DISTS */
+ base = dbase;
+ extra = dext;
+ end = -1;
+ }
+
+ /* initialize state for loop */
+ huff = 0; /* starting code */
+ sym = 0; /* starting code symbol */
+ len = min; /* starting code length */
+ next = *table; /* current table to fill in */
+ curr = root; /* current table index bits */
+ drop = 0; /* current bits to drop from code for index */
+ low = (unsigned)(-1); /* trigger new sub-table when len > root */
+ used = 1U << root; /* use root table entries */
+ mask = used - 1; /* mask for comparing low */
+
+ /* check available table space */
+ if (type == LENS && used >= ENOUGH - MAXD)
+ return 1;
+
+ /* process all codes and make table entries */
+ for (;;) {
+ /* create table entry */
+ this.bits = (unsigned char)(len - drop);
+ if ((int)(work[sym]) < end) {
+ this.op = (unsigned char)0;
+ this.val = work[sym];
+ }
+ else if ((int)(work[sym]) > end) {
+ this.op = (unsigned char)(extra[work[sym]]);
+ this.val = base[work[sym]];
+ }
+ else {
+ this.op = (unsigned char)(32 + 64); /* end of block */
+ this.val = 0;
+ }
+
+ /* replicate for those indices with low len bits equal to huff */
+ incr = 1U << (len - drop);
+ fill = 1U << curr;
+ min = fill; /* save offset to next table */
+ do {
+ fill -= incr;
+ next[(huff >> drop) + fill] = this;
+ } while (fill != 0);
+
+ /* backwards increment the len-bit code huff */
+ incr = 1U << (len - 1);
+ while (huff & incr)
+ incr >>= 1;
+ if (incr != 0) {
+ huff &= incr - 1;
+ huff += incr;
+ }
+ else
+ huff = 0;
+
+ /* go to next symbol, update count, len */
+ sym++;
+ if (--(count[len]) == 0) {
+ if (len == max) break;
+ len = lens[work[sym]];
+ }
+
+ /* create new sub-table if needed */
+ if (len > root && (huff & mask) != low) {
+ /* if first time, transition to sub-tables */
+ if (drop == 0)
+ drop = root;
+
+ /* increment past last table */
+ next += min; /* here min is 1 << curr */
+
+ /* determine length of next table */
+ curr = len - drop;
+ left = (int)(1 << curr);
+ while (curr + drop < max) {
+ left -= count[curr + drop];
+ if (left <= 0) break;
+ curr++;
+ left <<= 1;
+ }
+
+ /* check for enough space */
+ used += 1U << curr;
+ if (type == LENS && used >= ENOUGH - MAXD)
+ return 1;
+
+ /* point entry in root table to sub-table */
+ low = huff & mask;
+ (*table)[low].op = (unsigned char)curr;
+ (*table)[low].bits = (unsigned char)root;
+ (*table)[low].val = (unsigned short)(next - *table);
+ }
+ }
+
+ /*
+ Fill in rest of table for incomplete codes. This loop is similar to the
+ loop above in incrementing huff for table indices. It is assumed that
+ len is equal to curr + drop, so there is no loop needed to increment
+ through high index bits. When the current sub-table is filled, the loop
+ drops back to the root table to fill in any remaining entries there.
+ */
+ this.op = (unsigned char)64; /* invalid code marker */
+ this.bits = (unsigned char)(len - drop);
+ this.val = (unsigned short)0;
+ while (huff != 0) {
+ /* when done with sub-table, drop back to root table */
+ if (drop != 0 && (huff & mask) != low) {
+ drop = 0;
+ len = root;
+ next = *table;
+ this.bits = (unsigned char)len;
+ }
+
+ /* put invalid code marker in table */
+ next[huff >> drop] = this;
+
+ /* backwards increment the len-bit code huff */
+ incr = 1U << (len - 1);
+ while (huff & incr)
+ incr >>= 1;
+ if (incr != 0) {
+ huff &= incr - 1;
+ huff += incr;
+ }
+ else
+ huff = 0;
+ }
+
+ /* set return parameters */
+ *table += used;
+ *bits = root;
+ return 0;
+}
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inftrees.h b/sys/contrib/opensolaris/uts/common/zmod/inftrees.h
new file mode 100644
index 000000000000..546e8c082fdb
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/inftrees.h
@@ -0,0 +1,57 @@
+/* inftrees.h -- header to use inftrees.c
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/* WARNING: this file should *not* be used by applications. It is
+ part of the implementation of the compression library and is
+ subject to change. Applications should only use zlib.h.
+ */
+
+/* Structure for decoding tables. Each entry provides either the
+ information needed to do the operation requested by the code that
+ indexed that table entry, or it provides a pointer to another
+ table that indexes more bits of the code. op indicates whether
+ the entry is a pointer to another table, a literal, a length or
+ distance, an end-of-block, or an invalid code. For a table
+ pointer, the low four bits of op is the number of index bits of
+ that table. For a length or distance, the low four bits of op
+ is the number of extra bits to get after the code. bits is
+ the number of bits in this code or part of the code to drop off
+ of the bit buffer. val is the actual byte to output in the case
+ of a literal, the base length or distance, or the offset from
+ the current table to the next table. Each entry is four bytes. */
+typedef struct {
+ unsigned char op; /* operation, extra bits, table bits */
+ unsigned char bits; /* bits in this part of the code */
+ unsigned short val; /* offset in table or code value */
+} code;
+
+/* op values as set by inflate_table():
+ 00000000 - literal
+ 0000tttt - table link, tttt != 0 is the number of table index bits
+ 0001eeee - length or distance, eeee is the number of extra bits
+ 01100000 - end of block
+ 01000000 - invalid code
+ */
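+
+/*
+ Editor's note: illustrative sketch only, not part of the upstream zlib
+ source. A single decode step against one of these tables, assuming the
+ low bits of the input are held in "hold" and "table" has "tbits" index
+ bits (all names hypothetical):
+
+     code c = table[hold & ((1U << tbits) - 1)];
+     if (c.op == 0) {
+         // literal: output c.val, then drop c.bits bits from hold
+     } else if ((c.op & 0xf0) == 0) {
+         // table link: index a sub-table of c.op bits located at table + c.val
+     } else if (c.op & 16) {
+         // length or distance: base value c.val, (c.op & 15) extra bits follow
+     } else if (c.op & 32) {
+         // end-of-block code
+     } else {
+         // invalid code (c.op & 64)
+     }
+
+ This mirrors how inflate() and inflate_fast() consume the tables built by
+ inflate_table().
+ */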
+
+/* Maximum size of dynamic tree. The maximum found in a long but non-
+ exhaustive search was 1444 code structures (852 for length/literals
+ and 592 for distances, the latter actually the result of an
+ exhaustive search). The true maximum is not known, but the value
+ below is more than safe. */
+#define ENOUGH 2048
+#define MAXD 592
+
+/* Type of code to build for inflate_table() */
+typedef enum {
+ CODES,
+ LENS,
+ DISTS
+} codetype;
+
+extern int inflate_table OF((codetype type, unsigned short FAR *lens,
+ unsigned codes, code FAR * FAR *table,
+ unsigned FAR *bits, unsigned short FAR *work));
diff --git a/sys/contrib/opensolaris/uts/common/zmod/trees.c b/sys/contrib/opensolaris/uts/common/zmod/trees.c
new file mode 100644
index 000000000000..ce0cebc0304c
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/trees.c
@@ -0,0 +1,1219 @@
+/* trees.c -- output deflated data using Huffman coding
+ * Copyright (C) 1995-2005 Jean-loup Gailly
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ALGORITHM
+ *
+ * The "deflation" process uses several Huffman trees. The more
+ * common source values are represented by shorter bit sequences.
+ *
+ * Each code tree is stored in a compressed form which is itself
+ * a Huffman encoding of the lengths of all the code strings (in
+ * ascending order by source values). The actual code strings are
+ * reconstructed from the lengths in the inflate process, as described
+ * in the deflate specification.
+ *
+ * REFERENCES
+ *
+ * Deutsch, L.P.,"'Deflate' Compressed Data Format Specification".
+ * Available in ftp.uu.net:/pub/archiving/zip/doc/deflate-1.1.doc
+ *
+ * Storer, James A.
+ * Data Compression: Methods and Theory, pp. 49-50.
+ * Computer Science Press, 1988. ISBN 0-7167-8156-5.
+ *
+ * Sedgewick, R.
+ * Algorithms, p290.
+ * Addison-Wesley, 1983. ISBN 0-201-06672-6.
+ */
+
+/* #define GEN_TREES_H */
+
+#include "deflate.h"
+
+#ifdef DEBUG
+# include <ctype.h>
+#endif
+
+/* ===========================================================================
+ * Constants
+ */
+
+#define MAX_BL_BITS 7
+/* Bit length codes must not exceed MAX_BL_BITS bits */
+
+#define END_BLOCK 256
+/* end of block literal code */
+
+#define REP_3_6 16
+/* repeat previous bit length 3-6 times (2 bits of repeat count) */
+
+#define REPZ_3_10 17
+/* repeat a zero length 3-10 times (3 bits of repeat count) */
+
+#define REPZ_11_138 18
+/* repeat a zero length 11-138 times (7 bits of repeat count) */
+
+local const int extra_lbits[LENGTH_CODES] /* extra bits for each length code */
+ = {0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0};
+
+local const int extra_dbits[D_CODES] /* extra bits for each distance code */
+ = {0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
+
+local const int extra_blbits[BL_CODES]/* extra bits for each bit length code */
+ = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,7};
+
+local const uch bl_order[BL_CODES]
+ = {16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15};
+/* The lengths of the bit length codes are sent in order of decreasing
+ * probability, to avoid transmitting the lengths for unused bit length codes.
+ */
+
+#define Buf_size (8 * 2*sizeof(char))
+/* Number of bits used within bi_buf. (bi_buf might be implemented on
+ * more than 16 bits on some systems.)
+ */
+
+/* ===========================================================================
+ * Local data. These are initialized only once.
+ */
+
+#define DIST_CODE_LEN 512 /* see definition of array dist_code below */
+
+#if defined(GEN_TREES_H) || !defined(STDC)
+/* non ANSI compilers may not accept trees.h */
+
+local ct_data static_ltree[L_CODES+2];
+/* The static literal tree. Since the bit lengths are imposed, there is no
+ * need for the L_CODES extra codes used during heap construction. However,
+ * the codes 286 and 287 are needed to build a canonical tree (see _tr_init
+ * below).
+ */
+
+local ct_data static_dtree[D_CODES];
+/* The static distance tree. (Actually a trivial tree since all codes use
+ * 5 bits.)
+ */
+
+uch _dist_code[DIST_CODE_LEN];
+/* Distance codes. The first 256 values correspond to the distances
+ * 3 .. 258, the last 256 values correspond to the top 8 bits of
+ * the 15 bit distances.
+ */
+
+uch _length_code[MAX_MATCH-MIN_MATCH+1];
+/* length code for each normalized match length (0 == MIN_MATCH) */
+
+local int base_length[LENGTH_CODES];
+/* First normalized length for each code (0 = MIN_MATCH) */
+
+local int base_dist[D_CODES];
+/* First normalized distance for each code (0 = distance of 1) */
+
+#else
+# include "trees.h"
+#endif /* GEN_TREES_H */
+
+struct static_tree_desc_s {
+ const ct_data *static_tree; /* static tree or NULL */
+ const intf *extra_bits; /* extra bits for each code or NULL */
+ int extra_base; /* base index for extra_bits */
+ int elems; /* max number of elements in the tree */
+ int max_length; /* max bit length for the codes */
+};
+
+local static_tree_desc static_l_desc =
+{static_ltree, extra_lbits, LITERALS+1, L_CODES, MAX_BITS};
+
+local static_tree_desc static_d_desc =
+{static_dtree, extra_dbits, 0, D_CODES, MAX_BITS};
+
+local static_tree_desc static_bl_desc =
+{(const ct_data *)0, extra_blbits, 0, BL_CODES, MAX_BL_BITS};
+
+/* ===========================================================================
+ * Local (static) routines in this file.
+ */
+
+local void tr_static_init OF((void));
+local void init_block OF((deflate_state *s));
+local void pqdownheap OF((deflate_state *s, ct_data *tree, int k));
+local void gen_bitlen OF((deflate_state *s, tree_desc *desc));
+local void gen_codes OF((ct_data *tree, int max_code, ushf *bl_count));
+local void build_tree OF((deflate_state *s, tree_desc *desc));
+local void scan_tree OF((deflate_state *s, ct_data *tree, int max_code));
+local void send_tree OF((deflate_state *s, ct_data *tree, int max_code));
+local int build_bl_tree OF((deflate_state *s));
+local void send_all_trees OF((deflate_state *s, int lcodes, int dcodes,
+ int blcodes));
+local void compress_block OF((deflate_state *s, ct_data *ltree,
+ ct_data *dtree));
+local void set_data_type OF((deflate_state *s));
+local unsigned bi_reverse OF((unsigned value, int length));
+local void bi_windup OF((deflate_state *s));
+local void bi_flush OF((deflate_state *s));
+local void copy_block OF((deflate_state *s, charf *buf, unsigned len,
+ int header));
+
+#ifdef GEN_TREES_H
+local void gen_trees_header OF((void));
+#endif
+
+#ifndef DEBUG
+# define send_code(s, c, tree) send_bits(s, tree[c].Code, tree[c].Len)
+ /* Send a code of the given tree. c and tree must not have side effects */
+
+#else /* DEBUG */
+# define send_code(s, c, tree) \
+ { if (z_verbose>2) fprintf(stderr,"\ncd %3d ",(c)); \
+ send_bits(s, tree[c].Code, tree[c].Len); }
+#endif
+
+/* ===========================================================================
+ * Output a short LSB first on the stream.
+ * IN assertion: there is enough room in pendingBuf.
+ */
+#define put_short(s, w) { \
+ put_byte(s, (uch)((w) & 0xff)); \
+ put_byte(s, (uch)((ush)(w) >> 8)); \
+}
+
+/* ===========================================================================
+ * Send a value on a given number of bits.
+ * IN assertion: length <= 16 and value fits in length bits.
+ */
+#ifdef DEBUG
+local void send_bits OF((deflate_state *s, int value, int length));
+
+local void send_bits(s, value, length)
+ deflate_state *s;
+ int value; /* value to send */
+ int length; /* number of bits */
+{
+ Tracevv((stderr," l %2d v %4x ", length, value));
+ Assert(length > 0 && length <= 15, "invalid length");
+ s->bits_sent += (ulg)length;
+
+ /* If not enough room in bi_buf, use (valid) bits from bi_buf and
+ * (16 - bi_valid) bits from value, leaving (width - (16-bi_valid))
+ * unused bits in value.
+ */
+ if (s->bi_valid > (int)Buf_size - length) {
+ s->bi_buf |= (value << s->bi_valid);
+ put_short(s, s->bi_buf);
+ s->bi_buf = (ush)value >> (Buf_size - s->bi_valid);
+ s->bi_valid += length - Buf_size;
+ } else {
+ s->bi_buf |= value << s->bi_valid;
+ s->bi_valid += length;
+ }
+}
+#else /* !DEBUG */
+
+#define send_bits(s, value, length) \
+{ int len = length;\
+ if (s->bi_valid > (int)Buf_size - len) {\
+ int val = value;\
+ s->bi_buf |= (val << s->bi_valid);\
+ put_short(s, s->bi_buf);\
+ s->bi_buf = (ush)val >> (Buf_size - s->bi_valid);\
+ s->bi_valid += len - Buf_size;\
+ } else {\
+ s->bi_buf |= (value) << s->bi_valid;\
+ s->bi_valid += len;\
+ }\
+}
+#endif /* DEBUG */
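+
+/*
+ Editor's note: illustrative sketch only, not part of the upstream zlib
+ source. send_bits() packs values least-significant-bit first into the
+ 16-bit holding buffer bi_buf. The same accumulation, modeled standalone
+ with hypothetical names (acc, nbits, put_bits, emit_byte):
+
+     unsigned acc = 0;   // bit accumulator
+     int nbits = 0;      // number of valid bits currently in acc
+
+     void put_bits(unsigned value, int length)   // value < (1 << length)
+     {
+         acc |= value << nbits;                  // new bits above existing ones
+         nbits += length;
+         while (nbits >= 8) {                    // flush whole bytes, low first
+             emit_byte((unsigned char)(acc & 0xff));
+             acc >>= 8;
+             nbits -= 8;
+         }
+     }
+
+ The macro above does the same work but flushes sixteen bits at a time with
+ put_short() and splits a value that straddles the 16-bit boundary.
+ */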
+
+
+/* the arguments must not have side effects */
+
+/* ===========================================================================
+ * Initialize the various 'constant' tables.
+ */
+local void tr_static_init()
+{
+#if defined(GEN_TREES_H) || !defined(STDC)
+ static int static_init_done = 0;
+ int n; /* iterates over tree elements */
+ int bits; /* bit counter */
+ int length; /* length value */
+ int code; /* code value */
+ int dist; /* distance index */
+ ush bl_count[MAX_BITS+1];
+ /* number of codes at each bit length for an optimal tree */
+
+ if (static_init_done) return;
+
+ /* For some embedded targets, global variables are not initialized: */
+ static_l_desc.static_tree = static_ltree;
+ static_l_desc.extra_bits = extra_lbits;
+ static_d_desc.static_tree = static_dtree;
+ static_d_desc.extra_bits = extra_dbits;
+ static_bl_desc.extra_bits = extra_blbits;
+
+ /* Initialize the mapping length (0..255) -> length code (0..28) */
+ length = 0;
+ for (code = 0; code < LENGTH_CODES-1; code++) {
+ base_length[code] = length;
+ for (n = 0; n < (1<<extra_lbits[code]); n++) {
+ _length_code[length++] = (uch)code;
+ }
+ }
+ Assert (length == 256, "tr_static_init: length != 256");
+ /* Note that the length 255 (match length 258) can be represented
+ * in two different ways: code 284 + 5 bits or code 285, so we
+ * overwrite length_code[255] to use the best encoding:
+ */
+ _length_code[length-1] = (uch)code;
+
+ /* Initialize the mapping dist (0..32K) -> dist code (0..29) */
+ dist = 0;
+ for (code = 0 ; code < 16; code++) {
+ base_dist[code] = dist;
+ for (n = 0; n < (1<<extra_dbits[code]); n++) {
+ _dist_code[dist++] = (uch)code;
+ }
+ }
+ Assert (dist == 256, "tr_static_init: dist != 256");
+ dist >>= 7; /* from now on, all distances are divided by 128 */
+ for ( ; code < D_CODES; code++) {
+ base_dist[code] = dist << 7;
+ for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) {
+ _dist_code[256 + dist++] = (uch)code;
+ }
+ }
+ Assert (dist == 256, "tr_static_init: 256+dist != 512");
+
+ /* Construct the codes of the static literal tree */
+ for (bits = 0; bits <= MAX_BITS; bits++) bl_count[bits] = 0;
+ n = 0;
+ while (n <= 143) static_ltree[n++].Len = 8, bl_count[8]++;
+ while (n <= 255) static_ltree[n++].Len = 9, bl_count[9]++;
+ while (n <= 279) static_ltree[n++].Len = 7, bl_count[7]++;
+ while (n <= 287) static_ltree[n++].Len = 8, bl_count[8]++;
+ /* Codes 286 and 287 do not exist, but we must include them in the
+ * tree construction to get a canonical Huffman tree (longest code
+ * all ones)
+ */
+ gen_codes((ct_data *)static_ltree, L_CODES+1, bl_count);
+
+ /* The static distance tree is trivial: */
+ for (n = 0; n < D_CODES; n++) {
+ static_dtree[n].Len = 5;
+ static_dtree[n].Code = bi_reverse((unsigned)n, 5);
+ }
+ static_init_done = 1;
+
+# ifdef GEN_TREES_H
+ gen_trees_header();
+# endif
+#endif /* defined(GEN_TREES_H) || !defined(STDC) */
+}
+
+/* ===========================================================================
+ * Generate the file trees.h describing the static trees.
+ */
+#ifdef GEN_TREES_H
+# ifndef DEBUG
+# include <stdio.h>
+# endif
+
+# define SEPARATOR(i, last, width) \
+ ((i) == (last)? "\n};\n\n" : \
+ ((i) % (width) == (width)-1 ? ",\n" : ", "))
+
+void gen_trees_header()
+{
+ FILE *header = fopen("trees.h", "w");
+ int i;
+
+ Assert (header != NULL, "Can't open trees.h");
+ fprintf(header,
+ "/* header created automatically with -DGEN_TREES_H */\n\n");
+
+ fprintf(header, "local const ct_data static_ltree[L_CODES+2] = {\n");
+ for (i = 0; i < L_CODES+2; i++) {
+ fprintf(header, "{{%3u},{%3u}}%s", static_ltree[i].Code,
+ static_ltree[i].Len, SEPARATOR(i, L_CODES+1, 5));
+ }
+
+ fprintf(header, "local const ct_data static_dtree[D_CODES] = {\n");
+ for (i = 0; i < D_CODES; i++) {
+ fprintf(header, "{{%2u},{%2u}}%s", static_dtree[i].Code,
+ static_dtree[i].Len, SEPARATOR(i, D_CODES-1, 5));
+ }
+
+ fprintf(header, "const uch _dist_code[DIST_CODE_LEN] = {\n");
+ for (i = 0; i < DIST_CODE_LEN; i++) {
+ fprintf(header, "%2u%s", _dist_code[i],
+ SEPARATOR(i, DIST_CODE_LEN-1, 20));
+ }
+
+ fprintf(header, "const uch _length_code[MAX_MATCH-MIN_MATCH+1]= {\n");
+ for (i = 0; i < MAX_MATCH-MIN_MATCH+1; i++) {
+ fprintf(header, "%2u%s", _length_code[i],
+ SEPARATOR(i, MAX_MATCH-MIN_MATCH, 20));
+ }
+
+ fprintf(header, "local const int base_length[LENGTH_CODES] = {\n");
+ for (i = 0; i < LENGTH_CODES; i++) {
+ fprintf(header, "%1u%s", base_length[i],
+ SEPARATOR(i, LENGTH_CODES-1, 20));
+ }
+
+ fprintf(header, "local const int base_dist[D_CODES] = {\n");
+ for (i = 0; i < D_CODES; i++) {
+ fprintf(header, "%5u%s", base_dist[i],
+ SEPARATOR(i, D_CODES-1, 10));
+ }
+
+ fclose(header);
+}
+#endif /* GEN_TREES_H */
+
+/* ===========================================================================
+ * Initialize the tree data structures for a new zlib stream.
+ */
+void _tr_init(s)
+ deflate_state *s;
+{
+ tr_static_init();
+
+ s->l_desc.dyn_tree = s->dyn_ltree;
+ s->l_desc.stat_desc = &static_l_desc;
+
+ s->d_desc.dyn_tree = s->dyn_dtree;
+ s->d_desc.stat_desc = &static_d_desc;
+
+ s->bl_desc.dyn_tree = s->bl_tree;
+ s->bl_desc.stat_desc = &static_bl_desc;
+
+ s->bi_buf = 0;
+ s->bi_valid = 0;
+ s->last_eob_len = 8; /* enough lookahead for inflate */
+#ifdef DEBUG
+ s->compressed_len = 0L;
+ s->bits_sent = 0L;
+#endif
+
+ /* Initialize the first block of the first file: */
+ init_block(s);
+}
+
+/* ===========================================================================
+ * Initialize a new block.
+ */
+local void init_block(s)
+ deflate_state *s;
+{
+ int n; /* iterates over tree elements */
+
+ /* Initialize the trees. */
+ for (n = 0; n < L_CODES; n++) s->dyn_ltree[n].Freq = 0;
+ for (n = 0; n < D_CODES; n++) s->dyn_dtree[n].Freq = 0;
+ for (n = 0; n < BL_CODES; n++) s->bl_tree[n].Freq = 0;
+
+ s->dyn_ltree[END_BLOCK].Freq = 1;
+ s->opt_len = s->static_len = 0L;
+ s->last_lit = s->matches = 0;
+}
+
+#define SMALLEST 1
+/* Index within the heap array of least frequent node in the Huffman tree */
+
+
+/* ===========================================================================
+ * Remove the smallest element from the heap and recreate the heap with
+ * one less element. Updates heap and heap_len.
+ */
+#define pqremove(s, tree, top) \
+{\
+ top = s->heap[SMALLEST]; \
+ s->heap[SMALLEST] = s->heap[s->heap_len--]; \
+ pqdownheap(s, tree, SMALLEST); \
+}
+
+/* ===========================================================================
+ * Compares two subtrees, using the tree depth as tie breaker when
+ * the subtrees have equal frequency. This minimizes the worst case length.
+ */
+#define smaller(tree, n, m, depth) \
+ (tree[n].Freq < tree[m].Freq || \
+ (tree[n].Freq == tree[m].Freq && depth[n] <= depth[m]))
+
+/* ===========================================================================
+ * Restore the heap property by moving down the tree starting at node k,
+ * exchanging a node with the smallest of its two sons if necessary, stopping
+ * when the heap property is re-established (each father smaller than its
+ * two sons).
+ */
+local void pqdownheap(s, tree, k)
+ deflate_state *s;
+ ct_data *tree; /* the tree to restore */
+ int k; /* node to move down */
+{
+ int v = s->heap[k];
+ int j = k << 1; /* left son of k */
+ while (j <= s->heap_len) {
+ /* Set j to the smallest of the two sons: */
+ if (j < s->heap_len &&
+ smaller(tree, s->heap[j+1], s->heap[j], s->depth)) {
+ j++;
+ }
+ /* Exit if v is smaller than both sons */
+ if (smaller(tree, v, s->heap[j], s->depth)) break;
+
+ /* Exchange v with the smallest son */
+ s->heap[k] = s->heap[j]; k = j;
+
+ /* And continue down the tree, setting j to the left son of k */
+ j <<= 1;
+ }
+ s->heap[k] = v;
+}
+
+/* ===========================================================================
+ * Compute the optimal bit lengths for a tree and update the total bit length
+ * for the current block.
+ * IN assertion: the fields freq and dad are set, heap[heap_max] and
+ * above are the tree nodes sorted by increasing frequency.
+ * OUT assertions: the field len is set to the optimal bit length, the
+ * array bl_count contains the frequencies for each bit length.
+ * The length opt_len is updated; static_len is also updated if stree is
+ * not null.
+ */
+local void gen_bitlen(s, desc)
+ deflate_state *s;
+ tree_desc *desc; /* the tree descriptor */
+{
+ ct_data *tree = desc->dyn_tree;
+ int max_code = desc->max_code;
+ const ct_data *stree = desc->stat_desc->static_tree;
+ const intf *extra = desc->stat_desc->extra_bits;
+ int base = desc->stat_desc->extra_base;
+ int max_length = desc->stat_desc->max_length;
+ int h; /* heap index */
+ int n, m; /* iterate over the tree elements */
+ int bits; /* bit length */
+ int xbits; /* extra bits */
+ ush f; /* frequency */
+ int overflow = 0; /* number of elements with bit length too large */
+
+ for (bits = 0; bits <= MAX_BITS; bits++) s->bl_count[bits] = 0;
+
+ /* In a first pass, compute the optimal bit lengths (which may
+ * overflow in the case of the bit length tree).
+ */
+ tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */
+
+ for (h = s->heap_max+1; h < HEAP_SIZE; h++) {
+ n = s->heap[h];
+ bits = tree[tree[n].Dad].Len + 1;
+ if (bits > max_length) bits = max_length, overflow++;
+ tree[n].Len = (ush)bits;
+ /* We overwrite tree[n].Dad which is no longer needed */
+
+ if (n > max_code) continue; /* not a leaf node */
+
+ s->bl_count[bits]++;
+ xbits = 0;
+ if (n >= base) xbits = extra[n-base];
+ f = tree[n].Freq;
+ s->opt_len += (ulg)f * (bits + xbits);
+ if (stree) s->static_len += (ulg)f * (stree[n].Len + xbits);
+ }
+ if (overflow == 0) return;
+
+ Trace((stderr,"\nbit length overflow\n"));
+ /* This happens for example on obj2 and pic of the Calgary corpus */
+
+ /* Find the first bit length which could increase: */
+ do {
+ bits = max_length-1;
+ while (s->bl_count[bits] == 0) bits--;
+ s->bl_count[bits]--; /* move one leaf down the tree */
+ s->bl_count[bits+1] += 2; /* move one overflow item as its brother */
+ s->bl_count[max_length]--;
+ /* The brother of the overflow item also moves one step up,
+ * but this does not affect bl_count[max_length]
+ */
+ overflow -= 2;
+ } while (overflow > 0);
+
+ /* Now recompute all bit lengths, scanning in increasing frequency.
+ * h is still equal to HEAP_SIZE. (It is simpler to reconstruct all
+ * lengths instead of fixing only the wrong ones. This idea is taken
+ * from 'ar' written by Haruhiko Okumura.)
+ */
+ for (bits = max_length; bits != 0; bits--) {
+ n = s->bl_count[bits];
+ while (n != 0) {
+ m = s->heap[--h];
+ if (m > max_code) continue;
+ if ((unsigned) tree[m].Len != (unsigned) bits) {
+ Trace((stderr,"code %d bits %d->%d\n", m, tree[m].Len, bits));
+ s->opt_len += ((long)bits - (long)tree[m].Len)
+ *(long)tree[m].Freq;
+ tree[m].Len = (ush)bits;
+ }
+ n--;
+ }
+ }
+}
+
+/* ===========================================================================
+ * Generate the codes for a given tree and bit counts (which need not be
+ * optimal).
+ * IN assertion: the array bl_count contains the bit length statistics for
+ * the given tree and the field len is set for all tree elements.
+ * OUT assertion: the field code is set for all tree elements of non
+ * zero code length.
+ */
+local void gen_codes (tree, max_code, bl_count)
+ ct_data *tree; /* the tree to decorate */
+ int max_code; /* largest code with non zero frequency */
+ ushf *bl_count; /* number of codes at each bit length */
+{
+ ush next_code[MAX_BITS+1]; /* next code value for each bit length */
+ ush code = 0; /* running code value */
+ int bits; /* bit index */
+ int n; /* code index */
+
+ /* The distribution counts are first used to generate the code values
+ * without bit reversal.
+ */
+ for (bits = 1; bits <= MAX_BITS; bits++) {
+ next_code[bits] = code = (code + bl_count[bits-1]) << 1;
+ }
+ /* Check that the bit counts in bl_count are consistent. The last code
+ * must be all ones.
+ */
+ Assert (code + bl_count[MAX_BITS]-1 == (1<<MAX_BITS)-1,
+ "inconsistent bit counts");
+ Tracev((stderr,"\ngen_codes: max_code %d ", max_code));
+
+ for (n = 0; n <= max_code; n++) {
+ int len = tree[n].Len;
+ if (len == 0) continue;
+ /* Now reverse the bits */
+ tree[n].Code = bi_reverse(next_code[len]++, len);
+
+ Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
+ n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
+ }
+}
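+
+/*
+ * Worked example (illustrative, not part of the original source): for four
+ * symbols with code lengths {2, 1, 3, 3}, bl_count[1..3] = {1, 1, 2}, so
+ * next_code[1..3] = {0, 2, 6}.  The length-1 symbol gets code 0, the
+ * length-2 symbol gets 10, and the two length-3 symbols get 110 and 111
+ * (each code is then bit-reversed by bi_reverse() before being stored).
+ */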
+
+/* ===========================================================================
+ * Construct one Huffman tree and assign the code bit strings and lengths.
+ * Update the total bit length for the current block.
+ * IN assertion: the field freq is set for all tree elements.
+ * OUT assertions: the fields len and code are set to the optimal bit length
+ * and corresponding code. The length opt_len is updated; static_len is
+ * also updated if stree is not null. The field max_code is set.
+ */
+local void build_tree(s, desc)
+ deflate_state *s;
+ tree_desc *desc; /* the tree descriptor */
+{
+ ct_data *tree = desc->dyn_tree;
+ const ct_data *stree = desc->stat_desc->static_tree;
+ int elems = desc->stat_desc->elems;
+ int n, m; /* iterate over heap elements */
+ int max_code = -1; /* largest code with non zero frequency */
+ int node; /* new node being created */
+
+ /* Construct the initial heap, with least frequent element in
+ * heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1].
+ * heap[0] is not used.
+ */
+ s->heap_len = 0, s->heap_max = HEAP_SIZE;
+
+ for (n = 0; n < elems; n++) {
+ if (tree[n].Freq != 0) {
+ s->heap[++(s->heap_len)] = max_code = n;
+ s->depth[n] = 0;
+ } else {
+ tree[n].Len = 0;
+ }
+ }
+
+ /* The pkzip format requires that at least one distance code exists,
+ * and that at least one bit should be sent even if there is only one
+ * possible code. So to avoid special checks later on we force at least
+ * two codes of non zero frequency.
+ */
+ while (s->heap_len < 2) {
+ node = s->heap[++(s->heap_len)] = (max_code < 2 ? ++max_code : 0);
+ tree[node].Freq = 1;
+ s->depth[node] = 0;
+ s->opt_len--; if (stree) s->static_len -= stree[node].Len;
+ /* node is 0 or 1 so it does not have extra bits */
+ }
+ desc->max_code = max_code;
+
+ /* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree,
+ * establish sub-heaps of increasing lengths:
+ */
+ for (n = s->heap_len/2; n >= 1; n--) pqdownheap(s, tree, n);
+
+ /* Construct the Huffman tree by repeatedly combining the least two
+ * frequent nodes.
+ */
+ node = elems; /* next internal node of the tree */
+ do {
+ pqremove(s, tree, n); /* n = node of least frequency */
+ m = s->heap[SMALLEST]; /* m = node of next least frequency */
+
+ s->heap[--(s->heap_max)] = n; /* keep the nodes sorted by frequency */
+ s->heap[--(s->heap_max)] = m;
+
+ /* Create a new node father of n and m */
+ tree[node].Freq = tree[n].Freq + tree[m].Freq;
+ s->depth[node] = (uch)((s->depth[n] >= s->depth[m] ?
+ s->depth[n] : s->depth[m]) + 1);
+ tree[n].Dad = tree[m].Dad = (ush)node;
+#ifdef DUMP_BL_TREE
+ if (tree == s->bl_tree) {
+ fprintf(stderr,"\nnode %d(%d), sons %d(%d) %d(%d)",
+ node, tree[node].Freq, n, tree[n].Freq, m, tree[m].Freq);
+ }
+#endif
+ /* and insert the new node in the heap */
+ s->heap[SMALLEST] = node++;
+ pqdownheap(s, tree, SMALLEST);
+
+ } while (s->heap_len >= 2);
+
+ s->heap[--(s->heap_max)] = s->heap[SMALLEST];
+
+ /* At this point, the fields freq and dad are set. We can now
+ * generate the bit lengths.
+ */
+ gen_bitlen(s, (tree_desc *)desc);
+
+ /* The field len is now set, we can generate the bit codes */
+ gen_codes ((ct_data *)tree, max_code, s->bl_count);
+}
+
+/* ===========================================================================
+ * Scan a literal or distance tree to determine the frequencies of the codes
+ * in the bit length tree.
+ */
+local void scan_tree (s, tree, max_code)
+ deflate_state *s;
+ ct_data *tree; /* the tree to be scanned */
+ int max_code; /* and its largest code of non zero frequency */
+{
+ int n; /* iterates over all tree elements */
+ int prevlen = -1; /* last emitted length */
+ int curlen; /* length of current code */
+ int nextlen = tree[0].Len; /* length of next code */
+ int count = 0; /* repeat count of the current code */
+ int max_count = 7; /* max repeat count */
+ int min_count = 4; /* min repeat count */
+
+ if (nextlen == 0) max_count = 138, min_count = 3;
+ tree[max_code+1].Len = (ush)0xffff; /* guard */
+
+ for (n = 0; n <= max_code; n++) {
+ curlen = nextlen; nextlen = tree[n+1].Len;
+ if (++count < max_count && curlen == nextlen) {
+ continue;
+ } else if (count < min_count) {
+ s->bl_tree[curlen].Freq += count;
+ } else if (curlen != 0) {
+ if (curlen != prevlen) s->bl_tree[curlen].Freq++;
+ s->bl_tree[REP_3_6].Freq++;
+ } else if (count <= 10) {
+ s->bl_tree[REPZ_3_10].Freq++;
+ } else {
+ s->bl_tree[REPZ_11_138].Freq++;
+ }
+ count = 0; prevlen = curlen;
+ if (nextlen == 0) {
+ max_count = 138, min_count = 3;
+ } else if (curlen == nextlen) {
+ max_count = 6, min_count = 3;
+ } else {
+ max_count = 7, min_count = 4;
+ }
+ }
+}
+
+/* ===========================================================================
+ * Send a literal or distance tree in compressed form, using the codes in
+ * bl_tree.
+ */
+local void send_tree (s, tree, max_code)
+ deflate_state *s;
+ ct_data *tree; /* the tree to be scanned */
+ int max_code; /* and its largest code of non zero frequency */
+{
+ int n; /* iterates over all tree elements */
+ int prevlen = -1; /* last emitted length */
+ int curlen; /* length of current code */
+ int nextlen = tree[0].Len; /* length of next code */
+ int count = 0; /* repeat count of the current code */
+ int max_count = 7; /* max repeat count */
+ int min_count = 4; /* min repeat count */
+
+ /* tree[max_code+1].Len = -1; */ /* guard already set */
+ if (nextlen == 0) max_count = 138, min_count = 3;
+
+ for (n = 0; n <= max_code; n++) {
+ curlen = nextlen; nextlen = tree[n+1].Len;
+ if (++count < max_count && curlen == nextlen) {
+ continue;
+ } else if (count < min_count) {
+ do { send_code(s, curlen, s->bl_tree); } while (--count != 0);
+
+ } else if (curlen != 0) {
+ if (curlen != prevlen) {
+ send_code(s, curlen, s->bl_tree); count--;
+ }
+ Assert(count >= 3 && count <= 6, " 3_6?");
+ send_code(s, REP_3_6, s->bl_tree); send_bits(s, count-3, 2);
+
+ } else if (count <= 10) {
+ send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count-3, 3);
+
+ } else {
+ send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count-11, 7);
+ }
+ count = 0; prevlen = curlen;
+ if (nextlen == 0) {
+ max_count = 138, min_count = 3;
+ } else if (curlen == nextlen) {
+ max_count = 6, min_count = 3;
+ } else {
+ max_count = 7, min_count = 4;
+ }
+ }
+}
+
+/* ===========================================================================
+ * Construct the Huffman tree for the bit lengths and return the index in
+ * bl_order of the last bit length code to send.
+ */
+local int build_bl_tree(s)
+ deflate_state *s;
+{
+ int max_blindex; /* index of last bit length code of non zero freq */
+
+ /* Determine the bit length frequencies for literal and distance trees */
+ scan_tree(s, (ct_data *)s->dyn_ltree, s->l_desc.max_code);
+ scan_tree(s, (ct_data *)s->dyn_dtree, s->d_desc.max_code);
+
+ /* Build the bit length tree: */
+ build_tree(s, (tree_desc *)(&(s->bl_desc)));
+ /* opt_len now includes the length of the tree representations, except
+ * the lengths of the bit lengths codes and the 5+5+4 bits for the counts.
+ */
+
+ /* Determine the number of bit length codes to send. The pkzip format
+ * requires that at least 4 bit length codes be sent. (appnote.txt says
+ * 3 but the actual value used is 4.)
+ */
+ for (max_blindex = BL_CODES-1; max_blindex >= 3; max_blindex--) {
+ if (s->bl_tree[bl_order[max_blindex]].Len != 0) break;
+ }
+ /* Update opt_len to include the bit length tree and counts */
+ s->opt_len += 3*(max_blindex+1) + 5+5+4;
+ Tracev((stderr, "\ndyn trees: dyn %ld, stat %ld",
+ s->opt_len, s->static_len));
+
+ return max_blindex;
+}
+
+/* ===========================================================================
+ * Send the header for a block using dynamic Huffman trees: the counts, the
+ * lengths of the bit length codes, the literal tree and the distance tree.
+ * IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4.
+ */
+local void send_all_trees(s, lcodes, dcodes, blcodes)
+ deflate_state *s;
+ int lcodes, dcodes, blcodes; /* number of codes for each tree */
+{
+ int rank; /* index in bl_order */
+
+ Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes");
+ Assert (lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES,
+ "too many codes");
+ Tracev((stderr, "\nbl counts: "));
+ send_bits(s, lcodes-257, 5); /* not +255 as stated in appnote.txt */
+ send_bits(s, dcodes-1, 5);
+ send_bits(s, blcodes-4, 4); /* not -3 as stated in appnote.txt */
+ for (rank = 0; rank < blcodes; rank++) {
+ Tracev((stderr, "\nbl code %2d ", bl_order[rank]));
+ send_bits(s, s->bl_tree[bl_order[rank]].Len, 3);
+ }
+ Tracev((stderr, "\nbl tree: sent %ld", s->bits_sent));
+
+ send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */
+ Tracev((stderr, "\nlit tree: sent %ld", s->bits_sent));
+
+ send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */
+ Tracev((stderr, "\ndist tree: sent %ld", s->bits_sent));
+}
+
+/* ===========================================================================
+ * Send a stored block
+ */
+void _tr_stored_block(s, buf, stored_len, eof)
+ deflate_state *s;
+ charf *buf; /* input block */
+ ulg stored_len; /* length of input block */
+ int eof; /* true if this is the last block for a file */
+{
+ send_bits(s, (STORED_BLOCK<<1)+eof, 3); /* send block type */
+#ifdef DEBUG
+ s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L;
+ s->compressed_len += (stored_len + 4) << 3;
+#endif
+ copy_block(s, buf, (unsigned)stored_len, 1); /* with header */
+}
+
+/* ===========================================================================
+ * Send one empty static block to give enough lookahead for inflate.
+ * This takes 10 bits, of which 7 may remain in the bit buffer.
+ * The current inflate code requires 9 bits of lookahead. If the
+ * last two codes for the previous block (real code plus EOB) were coded
+ * on 5 bits or less, inflate may have only 5+3 bits of lookahead to decode
+ * the last real code. In this case we send two empty static blocks instead
+ * of one. (There are no problems if the previous block is stored or fixed.)
+ * To simplify the code, we assume the worst case of last real code encoded
+ * on one bit only.
+ */
+void _tr_align(s)
+ deflate_state *s;
+{
+ send_bits(s, STATIC_TREES<<1, 3);
+ send_code(s, END_BLOCK, static_ltree);
+#ifdef DEBUG
+ s->compressed_len += 10L; /* 3 for block type, 7 for EOB */
+#endif
+ bi_flush(s);
+ /* Of the 10 bits for the empty block, we have already sent
+ * (10 - bi_valid) bits. The lookahead for the last real code (before
+ * the EOB of the previous block) was thus at least one plus the length
+ * of the EOB plus what we have just sent of the empty static block.
+ */
+ if (1 + s->last_eob_len + 10 - s->bi_valid < 9) {
+ send_bits(s, STATIC_TREES<<1, 3);
+ send_code(s, END_BLOCK, static_ltree);
+#ifdef DEBUG
+ s->compressed_len += 10L;
+#endif
+ bi_flush(s);
+ }
+ s->last_eob_len = 7;
+}
+
+/* ===========================================================================
+ * Determine the best encoding for the current block: dynamic trees, static
+ * trees or store, and output the encoded block to the zip file.
+ */
+void _tr_flush_block(s, buf, stored_len, eof)
+ deflate_state *s;
+ charf *buf; /* input block, or NULL if too old */
+ ulg stored_len; /* length of input block */
+ int eof; /* true if this is the last block for a file */
+{
+ ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */
+ int max_blindex = 0; /* index of last bit length code of non zero freq */
+
+ /* Build the Huffman trees unless a stored block is forced */
+ if (s->level > 0) {
+
+ /* Check if the file is binary or text */
+ if (stored_len > 0 && s->strm->data_type == Z_UNKNOWN)
+ set_data_type(s);
+
+ /* Construct the literal and distance trees */
+ build_tree(s, (tree_desc *)(&(s->l_desc)));
+ Tracev((stderr, "\nlit data: dyn %ld, stat %ld", s->opt_len,
+ s->static_len));
+
+ build_tree(s, (tree_desc *)(&(s->d_desc)));
+ Tracev((stderr, "\ndist data: dyn %ld, stat %ld", s->opt_len,
+ s->static_len));
+ /* At this point, opt_len and static_len are the total bit lengths of
+ * the compressed block data, excluding the tree representations.
+ */
+
+ /* Build the bit length tree for the above two trees, and get the index
+ * in bl_order of the last bit length code to send.
+ */
+ max_blindex = build_bl_tree(s);
+
+ /* Determine the best encoding. Compute the block lengths in bytes. */
+ opt_lenb = (s->opt_len+3+7)>>3;
+ static_lenb = (s->static_len+3+7)>>3;
+
+ Tracev((stderr, "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ",
+ opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len,
+ s->last_lit));
+
+ if (static_lenb <= opt_lenb) opt_lenb = static_lenb;
+
+ } else {
+ Assert(buf != (char*)0, "lost buf");
+ opt_lenb = static_lenb = stored_len + 5; /* force a stored block */
+ }
+
+#ifdef FORCE_STORED
+ if (buf != (char*)0) { /* force stored block */
+#else
+ if (stored_len+4 <= opt_lenb && buf != (char*)0) {
+ /* 4: two words for the lengths */
+#endif
+ /* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE.
+ * Otherwise we can't have processed more than WSIZE input bytes since
+ * the last block flush, because compression would have been
+ * successful. If LIT_BUFSIZE <= WSIZE, it is never too late to
+ * transform a block into a stored block.
+ */
+ _tr_stored_block(s, buf, stored_len, eof);
+
+#ifdef FORCE_STATIC
+ } else if (static_lenb >= 0) { /* force static trees */
+#else
+ } else if (s->strategy == Z_FIXED || static_lenb == opt_lenb) {
+#endif
+ send_bits(s, (STATIC_TREES<<1)+eof, 3);
+ compress_block(s, (ct_data *)static_ltree, (ct_data *)static_dtree);
+#ifdef DEBUG
+ s->compressed_len += 3 + s->static_len;
+#endif
+ } else {
+ send_bits(s, (DYN_TREES<<1)+eof, 3);
+ send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1,
+ max_blindex+1);
+ compress_block(s, (ct_data *)s->dyn_ltree, (ct_data *)s->dyn_dtree);
+#ifdef DEBUG
+ s->compressed_len += 3 + s->opt_len;
+#endif
+ }
+ Assert (s->compressed_len == s->bits_sent, "bad compressed size");
+ /* The above check is made mod 2^32, for files larger than 512 MB
+ * and uLong implemented on 32 bits.
+ */
+ init_block(s);
+
+ if (eof) {
+ bi_windup(s);
+#ifdef DEBUG
+ s->compressed_len += 7; /* align on byte boundary */
+#endif
+ }
+ Tracev((stderr,"\ncomprlen %lu(%lu) ", s->compressed_len>>3,
+ s->compressed_len-7*eof));
+}
+
+/* ===========================================================================
+ * Save the match info and tally the frequency counts. Return true if
+ * the current block must be flushed.
+ */
+int _tr_tally (s, dist, lc)
+ deflate_state *s;
+ unsigned dist; /* distance of matched string */
+ unsigned lc; /* match length-MIN_MATCH or unmatched char (if dist==0) */
+{
+ s->d_buf[s->last_lit] = (ush)dist;
+ s->l_buf[s->last_lit++] = (uch)lc;
+ if (dist == 0) {
+ /* lc is the unmatched char */
+ s->dyn_ltree[lc].Freq++;
+ } else {
+ s->matches++;
+ /* Here, lc is the match length - MIN_MATCH */
+ dist--; /* dist = match distance - 1 */
+ Assert((ush)dist < (ush)MAX_DIST(s) &&
+ (ush)lc <= (ush)(MAX_MATCH-MIN_MATCH) &&
+ (ush)d_code(dist) < (ush)D_CODES, "_tr_tally: bad match");
+
+ s->dyn_ltree[_length_code[lc]+LITERALS+1].Freq++;
+ s->dyn_dtree[d_code(dist)].Freq++;
+ }
+
+#ifdef TRUNCATE_BLOCK
+ /* Try to guess if it is profitable to stop the current block here */
+ if ((s->last_lit & 0x1fff) == 0 && s->level > 2) {
+ /* Compute an upper bound for the compressed length */
+ ulg out_length = (ulg)s->last_lit*8L;
+ ulg in_length = (ulg)((long)s->strstart - s->block_start);
+ int dcode;
+ for (dcode = 0; dcode < D_CODES; dcode++) {
+ out_length += (ulg)s->dyn_dtree[dcode].Freq *
+ (5L+extra_dbits[dcode]);
+ }
+ out_length >>= 3;
+ Tracev((stderr,"\nlast_lit %u, in %ld, out ~%ld(%ld%%) ",
+ s->last_lit, in_length, out_length,
+ 100L - out_length*100L/in_length));
+ if (s->matches < s->last_lit/2 && out_length < in_length/2) return 1;
+ }
+#endif
+ return (s->last_lit == s->lit_bufsize-1);
+ /* We avoid equality with lit_bufsize because of wraparound at 64K
+ * on 16 bit machines and because stored blocks are restricted to
+ * 64K-1 bytes.
+ */
+}
+
+/* ===========================================================================
+ * Send the block data compressed using the given Huffman trees
+ */
+local void compress_block(s, ltree, dtree)
+ deflate_state *s;
+ ct_data *ltree; /* literal tree */
+ ct_data *dtree; /* distance tree */
+{
+ unsigned dist; /* distance of matched string */
+ int lc; /* match length or unmatched char (if dist == 0) */
+ unsigned lx = 0; /* running index in l_buf */
+ unsigned code; /* the code to send */
+ int extra; /* number of extra bits to send */
+
+ if (s->last_lit != 0) do {
+ dist = s->d_buf[lx];
+ lc = s->l_buf[lx++];
+ if (dist == 0) {
+ send_code(s, lc, ltree); /* send a literal byte */
+ Tracecv(isgraph(lc), (stderr," '%c' ", lc));
+ } else {
+ /* Here, lc is the match length - MIN_MATCH */
+ code = _length_code[lc];
+ send_code(s, code+LITERALS+1, ltree); /* send the length code */
+ extra = extra_lbits[code];
+ if (extra != 0) {
+ lc -= base_length[code];
+ send_bits(s, lc, extra); /* send the extra length bits */
+ }
+ dist--; /* dist is now the match distance - 1 */
+ code = d_code(dist);
+ Assert (code < D_CODES, "bad d_code");
+
+ send_code(s, code, dtree); /* send the distance code */
+ extra = extra_dbits[code];
+ if (extra != 0) {
+ dist -= base_dist[code];
+ send_bits(s, dist, extra); /* send the extra distance bits */
+ }
+ } /* literal or match pair ? */
+
+ /* Check that the overlay between pending_buf and d_buf+l_buf is ok: */
+ Assert((uInt)(s->pending) < s->lit_bufsize + 2*lx,
+ "pendingBuf overflow");
+
+ } while (lx < s->last_lit);
+
+ send_code(s, END_BLOCK, ltree);
+ s->last_eob_len = ltree[END_BLOCK].Len;
+}
+
+/* ===========================================================================
+ * Set the data type to BINARY or TEXT, using a crude approximation:
+ * set it to Z_TEXT if all symbols are either printable characters (33 to 255)
+ * or white spaces (9 to 13, or 32); or set it to Z_BINARY otherwise.
+ * IN assertion: the fields Freq of dyn_ltree are set.
+ */
+local void set_data_type(s)
+ deflate_state *s;
+{
+ int n;
+
+ for (n = 0; n < 9; n++)
+ if (s->dyn_ltree[n].Freq != 0)
+ break;
+ if (n == 9)
+ for (n = 14; n < 32; n++)
+ if (s->dyn_ltree[n].Freq != 0)
+ break;
+ s->strm->data_type = (n == 32) ? Z_TEXT : Z_BINARY;
+}
+
+/* ===========================================================================
+ * Reverse the first len bits of a code, using straightforward code (a faster
+ * method would use a table)
+ * IN assertion: 1 <= len <= 15
+ */
+local unsigned bi_reverse(code, len)
+ unsigned code; /* the value to invert */
+ int len; /* its bit length */
+{
+ register unsigned res = 0;
+ do {
+ res |= code & 1;
+ code >>= 1, res <<= 1;
+ } while (--len > 0);
+ return res >> 1;
+}
+
+/* ===========================================================================
+ * Flush the bit buffer, keeping at most 7 bits in it.
+ */
+local void bi_flush(s)
+ deflate_state *s;
+{
+ if (s->bi_valid == 16) {
+ put_short(s, s->bi_buf);
+ s->bi_buf = 0;
+ s->bi_valid = 0;
+ } else if (s->bi_valid >= 8) {
+ put_byte(s, (Byte)s->bi_buf);
+ s->bi_buf >>= 8;
+ s->bi_valid -= 8;
+ }
+}
+
+/* ===========================================================================
+ * Flush the bit buffer and align the output on a byte boundary
+ */
+local void bi_windup(s)
+ deflate_state *s;
+{
+ if (s->bi_valid > 8) {
+ put_short(s, s->bi_buf);
+ } else if (s->bi_valid > 0) {
+ put_byte(s, (Byte)s->bi_buf);
+ }
+ s->bi_buf = 0;
+ s->bi_valid = 0;
+#ifdef DEBUG
+ s->bits_sent = (s->bits_sent+7) & ~7;
+#endif
+}
+
+/* ===========================================================================
+ * Copy a stored block, storing first the length and its
+ * one's complement if requested.
+ */
+local void copy_block(s, buf, len, header)
+ deflate_state *s;
+ charf *buf; /* the input data */
+ unsigned len; /* its length */
+ int header; /* true if block header must be written */
+{
+ bi_windup(s); /* align on byte boundary */
+ s->last_eob_len = 8; /* enough lookahead for inflate */
+
+ if (header) {
+ put_short(s, (ush)len);
+ put_short(s, (ush)~len);
+#ifdef DEBUG
+ s->bits_sent += 2*16;
+#endif
+ }
+#ifdef DEBUG
+ s->bits_sent += (ulg)len<<3;
+#endif
+ while (len--) {
+ put_byte(s, *buf++);
+ }
+}
diff --git a/sys/contrib/opensolaris/uts/common/zmod/zconf.h b/sys/contrib/opensolaris/uts/common/zmod/zconf.h
new file mode 100644
index 000000000000..ccce7b2742da
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/zconf.h
@@ -0,0 +1,117 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZCONF_H
+#define _ZCONF_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * We don't want to turn on zlib's debugging.
+ */
+#undef DEBUG
+
+/*
+ * We define our own memory allocation and deallocation routines that use kmem.
+ */
+#define MY_ZCALLOC
+
+/*
+ * We don't define HAVE_MEMCPY here, but do in zutil.c, and implement our
+ * own versions of zmemcpy(), zmemzero(), and zmemcmp().
+ */
+
+/*
+ * We have a sufficiently capable compiler, so we do not need zlib's compiler hack.
+ */
+#define NO_DUMMY_DECL
+
+#define compressBound(len) (len + (len >> 12) + (len >> 14) + 11)
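+/*
+ * Worked example (illustrative arithmetic, not part of the original source):
+ * for a 131072-byte input, compressBound(131072) = 131072 + 32 + 8 + 11 =
+ * 131123 bytes of worst-case compressed output.
+ */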
+
+#define z_off_t off_t
+#define OF(p) p
+#define ZEXTERN extern
+#define ZEXPORT
+#define ZEXPORTVA
+#define FAR
+
+#define deflateInit_ z_deflateInit_
+#define deflate z_deflate
+#define deflateEnd z_deflateEnd
+#define inflateInit_ z_inflateInit_
+#define inflate z_inflate
+#define inflateEnd z_inflateEnd
+#define deflateInit2_ z_deflateInit2_
+#define deflateSetDictionary z_deflateSetDictionary
+#define deflateCopy z_deflateCopy
+#define deflateReset z_deflateReset
+#define deflateParams z_deflateParams
+#define deflateBound z_deflateBound
+#define deflatePrime z_deflatePrime
+#define inflateInit2_ z_inflateInit2_
+#define inflateSetDictionary z_inflateSetDictionary
+#define inflateSync z_inflateSync
+#define inflateSyncPoint z_inflateSyncPoint
+#define inflateCopy z_inflateCopy
+#define inflateReset z_inflateReset
+#define inflateBack z_inflateBack
+#define inflateBackEnd z_inflateBackEnd
+#define compress zz_compress
+#define compress2 zz_compress2
+#define uncompress zz_uncompress
+#define adler32 z_adler32
+#define crc32 z_crc32
+#define get_crc_table z_get_crc_table
+#define zError z_zError
+
+#define MAX_MEM_LEVEL 9
+#define MAX_WBITS 15
+
+typedef unsigned char Byte;
+typedef unsigned int uInt;
+typedef unsigned long uLong;
+typedef Byte Bytef;
+typedef char charf;
+typedef int intf;
+typedef uInt uIntf;
+typedef uLong uLongf;
+typedef void *voidpc;
+typedef void *voidpf;
+typedef void *voidp;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZCONF_H */
diff --git a/sys/contrib/opensolaris/uts/common/zmod/zlib.h b/sys/contrib/opensolaris/uts/common/zmod/zlib.h
new file mode 100644
index 000000000000..9b971a0f5722
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/zlib.h
@@ -0,0 +1,1359 @@
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+ version 1.2.3, July 18th, 2005
+
+ Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly Mark Adler
+ jloup@gzip.org madler@alumni.caltech.edu
+
+
+ The data format used by the zlib library is described by RFCs (Request for
+ Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
+ (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
+*/
+
+#ifndef _ZLIB_H
+#define _ZLIB_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "zconf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZLIB_VERSION "1.2.3"
+#define ZLIB_VERNUM 0x1230
+
+/*
+ The 'zlib' compression library provides in-memory compression and
+ decompression functions, including integrity checks of the uncompressed
+ data. This version of the library supports only one compression method
+ (deflation) but other algorithms will be added later and will have the same
+ stream interface.
+
+ Compression can be done in a single step if the buffers are large
+ enough (for example if an input file is mmap'ed), or can be done by
+ repeated calls of the compression function. In the latter case, the
+ application must provide more input and/or consume the output
+ (providing more output space) before each call.
+
+ The compressed data format used by default by the in-memory functions is
+ the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
+ around a deflate stream, which is itself documented in RFC 1951.
+
+ The library also supports reading and writing files in gzip (.gz) format
+ with an interface similar to that of stdio using the functions that start
+ with "gz". The gzip format is different from the zlib format. gzip is a
+ gzip wrapper, documented in RFC 1952, wrapped around a deflate stream.
+
+ This library can optionally read and write gzip streams in memory as well.
+
+ The zlib format was designed to be compact and fast for use in memory
+ and on communications channels. The gzip format was designed for single-
+ file compression on file systems, has a larger header than zlib to maintain
+ directory information, and uses a different, slower check method than zlib.
+
+ The library does not install any signal handler. The decoder checks
+ the consistency of the compressed data, so the library should never
+ crash even in case of corrupted input.
+*/
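+
+/*
+ * Illustrative sketch (example only, not part of the original header): the
+ * single-step usage mentioned above, via the convenience wrapper compress2()
+ * declared further down in this header.  On entry *dstlen holds the space
+ * available at dst; on success it is updated to the compressed size.
+ */
+#if 0 /* example only */
+static int
+example_one_shot(Bytef *dst, uLongf *dstlen, const Bytef *src, uLong srclen)
+{
+    return (compress2(dst, dstlen, src, srclen, Z_BEST_COMPRESSION));
+}
+#endif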
+
+typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size));
+typedef void (*free_func) OF((voidpf opaque, voidpf address));
+
+struct internal_state;
+
+typedef struct z_stream_s {
+ Bytef *next_in; /* next input byte */
+ uInt avail_in; /* number of bytes available at next_in */
+ uLong total_in; /* total nb of input bytes read so far */
+
+ Bytef *next_out; /* next output byte should be put there */
+ uInt avail_out; /* remaining free space at next_out */
+ uLong total_out; /* total nb of bytes output so far */
+
+ char *msg; /* last error message, NULL if no error */
+ struct internal_state FAR *state; /* not visible by applications */
+
+ alloc_func zalloc; /* used to allocate the internal state */
+ free_func zfree; /* used to free the internal state */
+ voidpf opaque; /* private data object passed to zalloc and zfree */
+
+ int data_type; /* best guess about the data type: binary or text */
+ uLong adler; /* adler32 value of the uncompressed data */
+ uLong reserved; /* reserved for future use */
+} z_stream;
+
+typedef z_stream FAR *z_streamp;
+
+/*
+ gzip header information passed to and from zlib routines. See RFC 1952
+ for more details on the meanings of these fields.
+*/
+typedef struct gz_header_s {
+ int text; /* true if compressed data believed to be text */
+ uLong time; /* modification time */
+ int xflags; /* extra flags (not used when writing a gzip file) */
+ int os; /* operating system */
+ Bytef *extra; /* pointer to extra field or Z_NULL if none */
+ uInt extra_len; /* extra field length (valid if extra != Z_NULL) */
+ uInt extra_max; /* space at extra (only when reading header) */
+ Bytef *name; /* pointer to zero-terminated file name or Z_NULL */
+ uInt name_max; /* space at name (only when reading header) */
+ Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */
+ uInt comm_max; /* space at comment (only when reading header) */
+ int hcrc; /* true if there was or will be a header crc */
+ int done; /* true when done reading gzip header (not used
+ when writing a gzip file) */
+} gz_header;
+
+typedef gz_header FAR *gz_headerp;
+
+/*
+ The application must update next_in and avail_in when avail_in has
+ dropped to zero. It must update next_out and avail_out when avail_out
+ has dropped to zero. The application must initialize zalloc, zfree and
+ opaque before calling the init function. All other fields are set by the
+ compression library and must not be updated by the application.
+
+ The opaque value provided by the application will be passed as the first
+ parameter for calls of zalloc and zfree. This can be useful for custom
+ memory management. The compression library attaches no meaning to the
+ opaque value.
+
+ zalloc must return Z_NULL if there is not enough memory for the object.
+ If zlib is used in a multi-threaded application, zalloc and zfree must be
+ thread safe.
+
+ On 16-bit systems, the functions zalloc and zfree must be able to allocate
+ exactly 65536 bytes, but will not be required to allocate more than this
+ if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS,
+ pointers returned by zalloc for objects of exactly 65536 bytes *must*
+ have their offset normalized to zero. The default allocation function
+ provided by this library ensures this (see zutil.c). To reduce memory
+ requirements and avoid any allocation of 64K objects, at the expense of
+ compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h).
+
+ The fields total_in and total_out can be used for statistics or
+ progress reports. After compression, total_in holds the total size of
+ the uncompressed data and may be saved for use in the decompressor
+ (particularly if the decompressor wants to decompress everything in
+ a single step).
+*/
+
+ /* constants */
+
+#define Z_NO_FLUSH 0
+#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */
+#define Z_SYNC_FLUSH 2
+#define Z_FULL_FLUSH 3
+#define Z_FINISH 4
+#define Z_BLOCK 5
+/* Allowed flush values; see deflate() and inflate() below for details */
+
+#define Z_OK 0
+#define Z_STREAM_END 1
+#define Z_NEED_DICT 2
+#define Z_ERRNO (-1)
+#define Z_STREAM_ERROR (-2)
+#define Z_DATA_ERROR (-3)
+#define Z_MEM_ERROR (-4)
+#define Z_BUF_ERROR (-5)
+#define Z_VERSION_ERROR (-6)
+/* Return codes for the compression/decompression functions. Negative
+ * values are errors, positive values are used for special but normal events.
+ */
+
+#define Z_NO_COMPRESSION 0
+#define Z_BEST_SPEED 1
+#define Z_BEST_COMPRESSION 9
+#define Z_DEFAULT_COMPRESSION (-1)
+/* compression levels */
+
+#define Z_FILTERED 1
+#define Z_HUFFMAN_ONLY 2
+#define Z_RLE 3
+#define Z_FIXED 4
+#define Z_DEFAULT_STRATEGY 0
+/* compression strategy; see deflateInit2() below for details */
+
+#define Z_BINARY 0
+#define Z_TEXT 1
+#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */
+#define Z_UNKNOWN 2
+/* Possible values of the data_type field (though see inflate()) */
+
+#define Z_DEFLATED 8
+/* The deflate compression method (the only one supported in this version) */
+
+#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */
+
+#define zlib_version zlibVersion()
+/* for compatibility with versions < 1.0.2 */
+
+ /* basic functions */
+
+ZEXTERN const char * ZEXPORT zlibVersion OF((void));
+/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
+ If the first character differs, the library code actually used is
+ not compatible with the zlib.h header file used by the application.
+ This check is automatically made by deflateInit and inflateInit.
+ */
+
+/*
+ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level));
+
+ Initializes the internal stream state for compression. The fields
+ zalloc, zfree and opaque must be initialized before by the caller.
+ If zalloc and zfree are set to Z_NULL, deflateInit updates them to
+ use default allocation functions.
+
+ The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
+ 1 gives best speed, 9 gives best compression, 0 gives no compression at
+ all (the input data is simply copied a block at a time).
+ Z_DEFAULT_COMPRESSION requests a default compromise between speed and
+ compression (currently equivalent to level 6).
+
+ deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_STREAM_ERROR if level is not a valid compression level,
+ Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
+ with the version assumed by the caller (ZLIB_VERSION).
+ msg is set to null if there is no error message. deflateInit does not
+ perform any compression: this will be done by deflate().
+*/
+
+
+ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
+/*
+ deflate compresses as much data as possible, and stops when the input
+ buffer becomes empty or the output buffer becomes full. It may introduce some
+ output latency (reading input without producing any output) except when
+ forced to flush.
+
+ The detailed semantics are as follows. deflate performs one or both of the
+ following actions:
+
+ - Compress more input starting at next_in and update next_in and avail_in
+ accordingly. If not all input can be processed (because there is not
+ enough room in the output buffer), next_in and avail_in are updated and
+ processing will resume at this point for the next call of deflate().
+
+ - Provide more output starting at next_out and update next_out and avail_out
+ accordingly. This action is forced if the parameter flush is non zero.
+ Forcing flush frequently degrades the compression ratio, so this parameter
+ should be set only when necessary (in interactive applications).
+ Some output may be provided even if flush is not set.
+
+ Before the call of deflate(), the application should ensure that at least
+ one of the actions is possible, by providing more input and/or consuming
+ more output, and updating avail_in or avail_out accordingly; avail_out
+ should never be zero before the call. The application can consume the
+ compressed output when it wants, for example when the output buffer is full
+ (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK
+ and with zero avail_out, it must be called again after making room in the
+ output buffer because there might be more output pending.
+
+ Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to
+ decide how much data to accumulate before producing output, in order to
+ maximize compression.
+
+ If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
+ flushed to the output buffer and the output is aligned on a byte boundary, so
+ that the decompressor can get all input data available so far. (In particular
+ avail_in is zero after the call if enough output space has been provided
+ before the call.) Flushing may degrade compression for some compression
+ algorithms and so it should be used only when necessary.
+
+ If flush is set to Z_FULL_FLUSH, all output is flushed as with
+ Z_SYNC_FLUSH, and the compression state is reset so that decompression can
+ restart from this point if previous compressed data has been damaged or if
+ random access is desired. Using Z_FULL_FLUSH too often can seriously degrade
+ compression.
+
+ If deflate returns with avail_out == 0, this function must be called again
+ with the same value of the flush parameter and more output space (updated
+ avail_out), until the flush is complete (deflate returns with non-zero
+ avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
+ avail_out is greater than six to avoid repeated flush markers due to
+ avail_out == 0 on return.
+
+ If the parameter flush is set to Z_FINISH, pending input is processed,
+ pending output is flushed and deflate returns with Z_STREAM_END if there
+ was enough output space; if deflate returns with Z_OK, this function must be
+ called again with Z_FINISH and more output space (updated avail_out) but no
+ more input data, until it returns with Z_STREAM_END or an error. After
+ deflate has returned Z_STREAM_END, the only possible operations on the
+ stream are deflateReset or deflateEnd.
+
+ Z_FINISH can be used immediately after deflateInit if all the compression
+ is to be done in a single step. In this case, avail_out must be at least
+ the value returned by deflateBound (see below). If deflate does not return
+ Z_STREAM_END, then it must be called again as described above.
+
+ deflate() sets strm->adler to the adler32 checksum of all input read
+ so far (that is, total_in bytes).
+
+ deflate() may update strm->data_type if it can make a good guess about
+ the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered
+ binary. This field is only for information purposes and does not affect
+ the compression algorithm in any manner.
+
+ deflate() returns Z_OK if some progress has been made (more input
+ processed or more output produced), Z_STREAM_END if all input has been
+ consumed and all output has been produced (only when flush is set to
+ Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
+ if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible
+ (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not
+ fatal, and deflate() can be called again with more input and more output
+ space to continue compressing.
+*/
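+
+/*
+ * Illustrative sketch (example only, not part of the original header):
+ * compressing a whole buffer with a single Z_FINISH call, as described
+ * above.  The helper name and the Z_DATA_ERROR fallback are choices made
+ * for this example only.
+ */
+#if 0 /* example only */
+static int
+example_deflate_buffer(Bytef *in, uLong inlen, Bytef *out, uLong outlen)
+{
+    z_stream strm;
+    int ret;
+
+    strm.zalloc = Z_NULL;           /* use default allocators */
+    strm.zfree = Z_NULL;
+    strm.opaque = Z_NULL;
+    ret = deflateInit(&strm, Z_DEFAULT_COMPRESSION);
+    if (ret != Z_OK)
+        return (ret);
+
+    /* With Z_FINISH immediately after deflateInit, one call is enough as
+     * long as the output buffer holds at least deflateBound() bytes. */
+    if (outlen < deflateBound(&strm, inlen)) {
+        (void) deflateEnd(&strm);
+        return (Z_BUF_ERROR);
+    }
+    strm.next_in = in;
+    strm.avail_in = (uInt)inlen;
+    strm.next_out = out;
+    strm.avail_out = (uInt)outlen;
+
+    ret = deflate(&strm, Z_FINISH);
+    (void) deflateEnd(&strm);
+    return (ret == Z_STREAM_END ? Z_OK : Z_DATA_ERROR);
+}
+#endif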
+
+
+ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
+/*
+ All dynamically allocated data structures for this stream are freed.
+ This function discards any unprocessed input and does not flush any
+ pending output.
+
+ deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
+ stream state was inconsistent, Z_DATA_ERROR if the stream was freed
+ prematurely (some input or output was discarded). In the error case,
+ msg may be set but then points to a static string (which must not be
+ deallocated).
+*/
+
+
+/*
+ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
+
+ Initializes the internal stream state for decompression. The fields
+ next_in, avail_in, zalloc, zfree and opaque must be initialized before by
+ the caller. If next_in is not Z_NULL and avail_in is large enough (the exact
+ value depends on the compression method), inflateInit determines the
+ compression method from the zlib header and allocates all data structures
+ accordingly; otherwise the allocation will be deferred to the first call of
+ inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to
+ use default allocation functions.
+
+ inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+ version assumed by the caller. msg is set to null if there is no error
+ message. inflateInit does not perform any decompression apart from reading
+ the zlib header if present: this will be done by inflate(). (So next_in and
+ avail_in may be modified, but next_out and avail_out are unchanged.)
+*/
+
+
+ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));
+/*
+ inflate decompresses as much data as possible, and stops when the input
+ buffer becomes empty or the output buffer becomes full. It may introduce
+ some output latency (reading input without producing any output) except when
+ forced to flush.
+
+ The detailed semantics are as follows. inflate performs one or both of the
+ following actions:
+
+ - Decompress more input starting at next_in and update next_in and avail_in
+ accordingly. If not all input can be processed (because there is not
+ enough room in the output buffer), next_in is updated and processing
+ will resume at this point for the next call of inflate().
+
+ - Provide more output starting at next_out and update next_out and avail_out
+ accordingly. inflate() provides as much output as possible, until there
+ is no more input data or no more space in the output buffer (see below
+ about the flush parameter).
+
+ Before the call of inflate(), the application should ensure that at least
+ one of the actions is possible, by providing more input and/or consuming
+ more output, and updating the next_* and avail_* values accordingly.
+ The application can consume the uncompressed output when it wants, for
+ example when the output buffer is full (avail_out == 0), or after each
+ call of inflate(). If inflate returns Z_OK and with zero avail_out, it
+ must be called again after making room in the output buffer because there
+ might be more output pending.
+
+ The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH,
+ Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much
+ output as possible to the output buffer. Z_BLOCK requests that inflate() stop
+ if and when it gets to the next deflate block boundary. When decoding the
+ zlib or gzip format, this will cause inflate() to return immediately after
+ the header and before the first block. When doing a raw inflate, inflate()
+ will go ahead and process the first block, and will return when it gets to
+ the end of that block, or when it runs out of data.
+
+ The Z_BLOCK option assists in appending to or combining deflate streams.
+ Also to assist in this, on return inflate() will set strm->data_type to the
+ number of unused bits in the last byte taken from strm->next_in, plus 64
+ if inflate() is currently decoding the last block in the deflate stream,
+ plus 128 if inflate() returned immediately after decoding an end-of-block
+ code or decoding the complete header up to just before the first byte of the
+ deflate stream. The end-of-block will not be indicated until all of the
+ uncompressed data from that block has been written to strm->next_out. The
+ number of unused bits may in general be greater than seven, except when
+ bit 7 of data_type is set, in which case the number of unused bits will be
+ less than eight.
+
+ inflate() should normally be called until it returns Z_STREAM_END or an
+ error. However if all decompression is to be performed in a single step
+ (a single call of inflate), the parameter flush should be set to
+ Z_FINISH. In this case all pending input is processed and all pending
+ output is flushed; avail_out must be large enough to hold all the
+ uncompressed data. (The size of the uncompressed data may have been saved
+ by the compressor for this purpose.) The next operation on this stream must
+ be inflateEnd to deallocate the decompression state. The use of Z_FINISH
+ is never required, but can be used to inform inflate that a faster approach
+ may be used for the single inflate() call.
+
+ In this implementation, inflate() always flushes as much output as
+ possible to the output buffer, and always uses the faster approach on the
+ first call. So the only effect of the flush parameter in this implementation
+ is on the return value of inflate(), as noted below, or when it returns early
+ because Z_BLOCK is used.
+
+ If a preset dictionary is needed after this call (see inflateSetDictionary
+ below), inflate sets strm->adler to the adler32 checksum of the dictionary
+ chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
+ strm->adler to the adler32 checksum of all output produced so far (that is,
+ total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
+ below. At the end of the stream, inflate() checks that its computed adler32
+ checksum is equal to that saved by the compressor and returns Z_STREAM_END
+ only if the checksum is correct.
+
+ inflate() will decompress and check either zlib-wrapped or gzip-wrapped
+ deflate data. The header type is detected automatically. Any information
+ contained in the gzip header is not retained, so applications that need that
+ information should instead use raw inflate, see inflateInit2() below, or
+ inflateBack() and perform their own processing of the gzip header and
+ trailer.
+
+ inflate() returns Z_OK if some progress has been made (more input processed
+ or more output produced), Z_STREAM_END if the end of the compressed data has
+ been reached and all uncompressed output has been produced, Z_NEED_DICT if a
+ preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
+ corrupted (input stream not conforming to the zlib format or incorrect check
+ value), Z_STREAM_ERROR if the stream structure was inconsistent (for example
+ if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory,
+ Z_BUF_ERROR if no progress is possible or if there was not enough room in the
+ output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and
+ inflate() can be called again with more input and more output space to
+ continue decompressing. If Z_DATA_ERROR is returned, the application may then
+ call inflateSync() to look for a good compression block if a partial recovery
+ of the data is desired.
+*/
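+
+/*
+ * Illustrative sketch (example only, not part of the original header):
+ * inflating a complete zlib stream into a caller-supplied buffer,
+ * calling inflate() until it reports Z_STREAM_END.  The helper name and
+ * the simplified error mapping are choices made for this example only.
+ */
+#if 0 /* example only */
+static int
+example_inflate_buffer(Bytef *in, uLong inlen, Bytef *out, uLong outlen)
+{
+    z_stream strm;
+    int ret;
+
+    strm.zalloc = Z_NULL;           /* use default allocators */
+    strm.zfree = Z_NULL;
+    strm.opaque = Z_NULL;
+    strm.next_in = in;
+    strm.avail_in = (uInt)inlen;
+    ret = inflateInit(&strm);
+    if (ret != Z_OK)
+        return (ret);
+
+    strm.next_out = out;
+    strm.avail_out = (uInt)outlen;
+    do {
+        ret = inflate(&strm, Z_NO_FLUSH);
+    } while (ret == Z_OK && strm.avail_out != 0);
+
+    (void) inflateEnd(&strm);
+    /* Anything other than a clean end of stream is reported as an error. */
+    return (ret == Z_STREAM_END ? Z_OK : Z_DATA_ERROR);
+}
+#endif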
+
+
+ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm));
+/*
+ All dynamically allocated data structures for this stream are freed.
+ This function discards any unprocessed input and does not flush any
+ pending output.
+
+ inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state
+ was inconsistent. In the error case, msg may be set but then points to a
+ static string (which must not be deallocated).
+*/
+
+ /* Advanced functions */
+
+/*
+ The following functions are needed only in some special applications.
+*/
+
+/*
+ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm,
+ int level,
+ int method,
+ int windowBits,
+ int memLevel,
+ int strategy));
+
+ This is another version of deflateInit with more compression options. The
+ fields next_in, zalloc, zfree and opaque must be initialized before by
+ the caller.
+
+ The method parameter is the compression method. It must be Z_DEFLATED in
+ this version of the library.
+
+ The windowBits parameter is the base two logarithm of the window size
+ (the size of the history buffer). It should be in the range 8..15 for this
+ version of the library. Larger values of this parameter result in better
+ compression at the expense of memory usage. The default value is 15 if
+ deflateInit is used instead.
+
+ windowBits can also be -8..-15 for raw deflate. In this case, -windowBits
+ determines the window size. deflate() will then generate raw deflate data
+ with no zlib header or trailer, and will not compute an adler32 check value.
+
+ windowBits can also be greater than 15 for optional gzip encoding. Add
+ 16 to windowBits to write a simple gzip header and trailer around the
+ compressed data instead of a zlib wrapper. The gzip header will have no
+ file name, no extra data, no comment, no modification time (set to zero),
+ no header crc, and the operating system will be set to 255 (unknown). If a
+ gzip stream is being written, strm->adler is a crc32 instead of an adler32.
+
+ The memLevel parameter specifies how much memory should be allocated
+ for the internal compression state. memLevel=1 uses minimum memory but
+ is slow and reduces compression ratio; memLevel=9 uses maximum memory
+ for optimal speed. The default value is 8. See zconf.h for total memory
+ usage as a function of windowBits and memLevel.
+
+ The strategy parameter is used to tune the compression algorithm. Use the
+ value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
+ filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no
+ string match), or Z_RLE to limit match distances to one (run-length
+ encoding). Filtered data consists mostly of small values with a somewhat
+ random distribution. In this case, the compression algorithm is tuned to
+ compress them better. The effect of Z_FILTERED is to force more Huffman
+ coding and less string matching; it is somewhat intermediate between
+ Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as fast as
+ Z_HUFFMAN_ONLY, but give better compression for PNG image data. The strategy
+ parameter only affects the compression ratio but not the correctness of the
+ compressed output even if it is not set appropriately. Z_FIXED prevents the
+ use of dynamic Huffman codes, allowing for a simpler decoder for special
+ applications.
+
+ deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid
+ method). msg is set to null if there is no error message. deflateInit2 does
+ not perform any compression: this will be done by deflate().
+*/
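+
+/*
+   A minimal usage sketch of the above: requesting a gzip wrapper by adding
+   16 to windowBits, with the default memLevel (8) and strategy. Error
+   handling is reduced to a bare minimum for illustration.
+
+     z_stream strm;
+
+     strm.zalloc = Z_NULL;
+     strm.zfree = Z_NULL;
+     strm.opaque = Z_NULL;
+     strm.next_in = Z_NULL;
+     if (deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
+                      15 + 16, 8, Z_DEFAULT_STRATEGY) != Z_OK)
+         return -1;
+     ... call deflate() as usual; strm.adler now accumulates a crc32 ...
+     (void) deflateEnd(&strm);
+*/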
+
+ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
+ const Bytef *dictionary,
+ uInt dictLength));
+/*
+ Initializes the compression dictionary from the given byte sequence
+ without producing any compressed output. This function must be called
+ immediately after deflateInit, deflateInit2 or deflateReset, before any
+ call of deflate. The compressor and decompressor must use exactly the same
+ dictionary (see inflateSetDictionary).
+
+ The dictionary should consist of strings (byte sequences) that are likely
+ to be encountered later in the data to be compressed, with the most commonly
+ used strings preferably put towards the end of the dictionary. Using a
+ dictionary is most useful when the data to be compressed is short and can be
+ predicted with good accuracy; the data can then be compressed better than
+ with the default empty dictionary.
+
+ Depending on the size of the compression data structures selected by
+ deflateInit or deflateInit2, a part of the dictionary may in effect be
+ discarded, for example if the dictionary is larger than the window size in
+ deflate or deflate2. Thus the strings most likely to be useful should be
+ put at the end of the dictionary, not at the front. In addition, the
+ current implementation of deflate will use at most the window size minus
+ 262 bytes of the provided dictionary.
+
+ Upon return of this function, strm->adler is set to the adler32 value
+ of the dictionary; the decompressor may later use this value to determine
+ which dictionary has been used by the compressor. (The adler32 value
+ applies to the whole dictionary even if only a subset of the dictionary is
+ actually used by the compressor.) If a raw deflate was requested, then the
+ adler32 value is not computed and strm->adler is not set.
+
+ deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
+ parameter is invalid (such as NULL dictionary) or the stream state is
+ inconsistent (for example if deflate has already been called for this stream
+ or if the compression method is bsort). deflateSetDictionary does not
+ perform any compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
+ z_streamp source));
+/*
+ Sets the destination stream as a complete copy of the source stream.
+
+ This function can be useful when several compression strategies will be
+ tried, for example when there are several ways of pre-processing the input
+ data with a filter. The streams that will be discarded should then be freed
+ by calling deflateEnd. Note that deflateCopy duplicates the internal
+ compression state which can be quite large, so this strategy is slow and
+ can consume lots of memory.
+
+ deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+ (such as zalloc being NULL). msg is left unchanged in both source and
+ destination.
+*/
+
+ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm));
+/*
+ This function is equivalent to deflateEnd followed by deflateInit,
+ but does not free and reallocate all the internal compression state.
+ The stream will keep the same compression level and any other attributes
+ that may have been set by deflateInit2.
+
+ deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being NULL).
+*/
+
+ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
+ int level,
+ int strategy));
+/*
+ Dynamically update the compression level and compression strategy. The
+ interpretation of level and strategy is as in deflateInit2. This can be
+ used to switch between compression and straight copy of the input data, or
+ to switch to a different kind of input data requiring a different
+ strategy. If the compression level is changed, the input available so far
+ is compressed with the old level (and may be flushed); the new level will
+ take effect only at the next call of deflate().
+
+ Before the call of deflateParams, the stream state must be set as for
+ a call of deflate(), since the currently available input may have to
+ be compressed and flushed. In particular, strm->avail_out must be non-zero.
+
+ deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source
+ stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR
+ if strm->avail_out was zero.
+*/
+
+ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm,
+ int good_length,
+ int max_lazy,
+ int nice_length,
+ int max_chain));
+/*
+ Fine tune deflate's internal compression parameters. This should only be
+ used by someone who understands the algorithm used by zlib's deflate for
+ searching for the best matching string, and even then only by the most
+ fanatic optimizer trying to squeeze out the last compressed bit for their
+ specific input data. Read the deflate.c source code for the meaning of the
+ max_lazy, good_length, nice_length, and max_chain parameters.
+
+ deflateTune() can be called after deflateInit() or deflateInit2(), and
+ returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream.
+ */
+
+ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm,
+ uLong sourceLen));
+/*
+ deflateBound() returns an upper bound on the compressed size after
+ deflation of sourceLen bytes. It must be called after deflateInit()
+ or deflateInit2(). This would be used to allocate an output buffer
+ for deflation in a single pass, and so would be called before deflate().
+*/
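+
+/*
+   A minimal usage sketch of the above: sizing the output buffer with
+   deflateBound() for a single-pass deflate. src, srcLen and the use of
+   malloc() are placeholders supplied only for illustration.
+
+     z_stream strm;
+     uLong bound;
+     Bytef *outbuf;
+
+     strm.zalloc = Z_NULL;
+     strm.zfree = Z_NULL;
+     strm.opaque = Z_NULL;
+     strm.next_in = src;
+     if (deflateInit(&strm, Z_DEFAULT_COMPRESSION) != Z_OK) return -1;
+     bound = deflateBound(&strm, srcLen);
+     outbuf = (Bytef *)malloc(bound);
+     strm.avail_in = srcLen;
+     strm.next_out = outbuf;
+     strm.avail_out = (uInt)bound;
+     (void) deflate(&strm, Z_FINISH);
+     ... on Z_STREAM_END the compressed size is strm.total_out ...
+     (void) deflateEnd(&strm);
+*/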
+
+ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm,
+ int bits,
+ int value));
+/*
+ deflatePrime() inserts bits in the deflate output stream. The intent
+ is that this function is used to start off the deflate output with the
+ bits leftover from a previous deflate stream when appending to it. As such,
+ this function can only be used for raw deflate, and must be used before the
+ first deflate() call after a deflateInit2() or deflateReset(). bits must be
+ less than or equal to 16, and that many of the least significant bits of
+ value will be inserted in the output.
+
+ deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
+ gz_headerp head));
+/*
+ deflateSetHeader() provides gzip header information for when a gzip
+ stream is requested by deflateInit2(). deflateSetHeader() may be called
+ after deflateInit2() or deflateReset() and before the first call of
+ deflate(). The text, time, os, extra field, name, and comment information
+ in the provided gz_header structure are written to the gzip header (xflag is
+ ignored -- the extra flags are set according to the compression level). The
+ caller must assure that, if not Z_NULL, name and comment are terminated with
+ a zero byte, and that if extra is not Z_NULL, that extra_len bytes are
+ available there. If hcrc is true, a gzip header crc is included. Note that
+ the current versions of the command-line version of gzip (up through version
+ 1.3.x) do not support header crc's, and will report that it is a "multi-part
+ gzip file" and give up.
+
+ If deflateSetHeader is not used, the default gzip header has text false,
+ the time set to zero, and os set to 255, with no extra, name, or comment
+ fields. The gzip header is returned to the default state by deflateReset().
+
+ deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
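+
+/*
+   A minimal usage sketch of the above: attaching a file name, modification
+   time and OS code to a gzip stream initialized with deflateInit2() and
+   windowBits 15 + 16. The variable mtime is a placeholder supplied only for
+   illustration; all other header fields are left at zero (absent).
+
+     gz_header head;
+
+     memset(&head, 0, sizeof(head));
+     head.name = (Bytef *)"data.txt";
+     head.time = mtime;
+     head.os = 3;
+     if (deflateSetHeader(&strm, &head) != Z_OK)
+         return -1;
+*/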
+
+/*
+ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm,
+ int windowBits));
+
+ This is another version of inflateInit with an extra parameter. The
+ fields next_in, avail_in, zalloc, zfree and opaque must be initialized
+ before by the caller.
+
+ The windowBits parameter is the base two logarithm of the maximum window
+ size (the size of the history buffer). It should be in the range 8..15 for
+ this version of the library. The default value is 15 if inflateInit is used
+ instead. windowBits must be greater than or equal to the windowBits value
+ provided to deflateInit2() while compressing, or it must be equal to 15 if
+ deflateInit2() was not used. If a compressed stream with a larger window
+ size is given as input, inflate() will return with the error code
+ Z_DATA_ERROR instead of trying to allocate a larger window.
+
+ windowBits can also be -8..-15 for raw inflate. In this case, -windowBits
+ determines the window size. inflate() will then process raw deflate data,
+ not looking for a zlib or gzip header, not generating a check value, and not
+ looking for any check values for comparison at the end of the stream. This
+ is for use with other formats that use the deflate compressed data format
+ such as zip. Those formats provide their own check values. If a custom
+ format is developed using the raw deflate format for compressed data, it is
+ recommended that a check value such as an adler32 or a crc32 be applied to
+ the uncompressed data as is done in the zlib, gzip, and zip formats. For
+ most applications, the zlib format should be used as is. Note that comments
+ above on the use in deflateInit2() apply to the magnitude of windowBits.
+
+ windowBits can also be greater than 15 for optional gzip decoding. Add
+ 32 to windowBits to enable zlib and gzip decoding with automatic header
+ detection, or add 16 to decode only the gzip format (the zlib format will
+ return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is
+ a crc32 instead of an adler32.
+
+ inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg
+ is set to null if there is no error message. inflateInit2 does not perform
+ any decompression apart from reading the zlib header if present: this will
+ be done by inflate(). (So next_in and avail_in may be modified, but next_out
+ and avail_out are unchanged.)
+*/
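+
+/*
+   A minimal usage sketch of the above: enabling automatic zlib/gzip header
+   detection by adding 32 to windowBits.
+
+     z_stream strm;
+
+     strm.zalloc = Z_NULL;
+     strm.zfree = Z_NULL;
+     strm.opaque = Z_NULL;
+     strm.next_in = Z_NULL;
+     strm.avail_in = 0;
+     if (inflateInit2(&strm, 15 + 32) != Z_OK)
+         return -1;
+     ... feed data to inflate(); either wrapper is then accepted ...
+     (void) inflateEnd(&strm);
+*/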
+
+ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
+ const Bytef *dictionary,
+ uInt dictLength));
+/*
+ Initializes the decompression dictionary from the given uncompressed byte
+ sequence. This function must be called immediately after a call of inflate,
+ if that call returned Z_NEED_DICT. The dictionary chosen by the compressor
+ can be determined from the adler32 value returned by that call of inflate.
+ The compressor and decompressor must use exactly the same dictionary (see
+ deflateSetDictionary). For raw inflate, this function can be called
+ immediately after inflateInit2() or inflateReset() and before any call of
+ inflate() to set the dictionary. The application must ensure that the
+ dictionary that was used for compression is provided.
+
+ inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
+ parameter is invalid (such as NULL dictionary) or the stream state is
+ inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
+ expected one (incorrect adler32 value). inflateSetDictionary does not
+ perform any decompression: this will be done by subsequent calls of
+ inflate().
+*/
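+
+/*
+   A minimal usage sketch of the above: answering Z_NEED_DICT from inflate()
+   with the same dictionary that was given to deflateSetDictionary(). dict
+   and dictLen are placeholders supplied only for illustration.
+
+     ret = inflate(&strm, Z_NO_FLUSH);
+     if (ret == Z_NEED_DICT) {
+         if (inflateSetDictionary(&strm, dict, dictLen) != Z_OK)
+             return -1;
+         ret = inflate(&strm, Z_NO_FLUSH);
+     }
+*/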
+
+ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
+/*
+ Skips invalid compressed data until a full flush point (see above the
+ description of deflate with Z_FULL_FLUSH) can be found, or until all
+ available input is skipped. No output is provided.
+
+ inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR
+ if no more input was provided, Z_DATA_ERROR if no flush point has been found,
+ or Z_STREAM_ERROR if the stream structure was inconsistent. In the success
+ case, the application may save the current value of total_in which
+ indicates where valid compressed data was found. In the error case, the
+ application may repeatedly call inflateSync, providing more input each time,
+ until success or end of the input data.
+*/
+
+ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
+ z_streamp source));
+/*
+ Sets the destination stream as a complete copy of the source stream.
+
+ This function can be useful when randomly accessing a large stream. The
+ first pass through the stream can periodically record the inflate state,
+ allowing restarting inflate at those points when randomly accessing the
+ stream.
+
+ inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+ (such as zalloc being NULL). msg is left unchanged in both source and
+ destination.
+*/
+
+ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm));
+/*
+ This function is equivalent to inflateEnd followed by inflateInit,
+ but does not free and reallocate all the internal decompression state.
+ The stream will keep attributes that may have been set by inflateInit2.
+
+ inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being NULL).
+*/
+
+ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm,
+ int bits,
+ int value));
+/*
+ This function inserts bits in the inflate input stream. The intent is
+ that this function is used to start inflating at a bit position in the
+ middle of a byte. The provided bits will be used before any bytes are used
+ from next_in. This function should only be used with raw inflate, and
+ should be used before the first inflate() call after inflateInit2() or
+ inflateReset(). bits must be less than or equal to 16, and that many of the
+ least significant bits of value will be inserted in the input.
+
+ inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm,
+ gz_headerp head));
+/*
+ inflateGetHeader() requests that gzip header information be stored in the
+ provided gz_header structure. inflateGetHeader() may be called after
+ inflateInit2() or inflateReset(), and before the first call of inflate().
+ As inflate() processes the gzip stream, head->done is zero until the header
+ is completed, at which time head->done is set to one. If a zlib stream is
+ being decoded, then head->done is set to -1 to indicate that there will be
+ no gzip header information forthcoming. Note that Z_BLOCK can be used to
+ force inflate() to return immediately after header processing is complete
+ and before any actual data is decompressed.
+
+ The text, time, xflags, and os fields are filled in with the gzip header
+ contents. hcrc is set to true if there is a header CRC. (The header CRC
+ was valid if done is set to one.) If extra is not Z_NULL, then extra_max
+ contains the maximum number of bytes to write to extra. Once done is true,
+ extra_len contains the actual extra field length, and extra contains the
+ extra field, or that field truncated if extra_max is less than extra_len.
+ If name is not Z_NULL, then up to name_max characters are written there,
+ terminated with a zero unless the length is greater than name_max. If
+ comment is not Z_NULL, then up to comm_max characters are written there,
+ terminated with a zero unless the length is greater than comm_max. When
+ any of extra, name, or comment are not Z_NULL and the respective field is
+ not present in the header, then that field is set to Z_NULL to signal its
+ absence. This allows the use of deflateSetHeader() with the returned
+ structure to duplicate the header. However if those fields are set to
+ allocated memory, then the application will need to save those pointers
+ elsewhere so that they can be eventually freed.
+
+ If inflateGetHeader is not used, then the header information is simply
+ discarded. The header is always checked for validity, including the header
+ CRC if present. inflateReset() will reset the process to discard the header
+ information. The application would need to call inflateGetHeader() again to
+ retrieve the header from the next gzip stream.
+
+ inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
+
+/*
+ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits,
+ unsigned char FAR *window));
+
+ Initialize the internal stream state for decompression using inflateBack()
+ calls. The fields zalloc, zfree and opaque in strm must be initialized
+ before the call. If zalloc and zfree are Z_NULL, then the default library-
+ derived memory allocation routines are used. windowBits is the base two
+ logarithm of the window size, in the range 8..15. window is a caller
+ supplied buffer of that size. Except for special applications where it is
+ assured that deflate was used with small window sizes, windowBits must be 15
+ and a 32K byte window must be supplied to be able to decompress general
+ deflate streams.
+
+ See inflateBack() for the usage of these routines.
+
+ inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of
+ the parameters are invalid, Z_MEM_ERROR if the internal state could not
+ be allocated, or Z_VERSION_ERROR if the version of the library does not
+ match the version of the header file.
+*/
+
+typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *));
+typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned));
+
+ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
+ in_func in, void FAR *in_desc,
+ out_func out, void FAR *out_desc));
+/*
+ inflateBack() does a raw inflate with a single call using a call-back
+ interface for input and output. This is more efficient than inflate() for
+ file i/o applications in that it avoids copying between the output and the
+ sliding window by simply making the window itself the output buffer. This
+ function trusts the application to not change the output buffer passed by
+ the output function, at least until inflateBack() returns.
+
+ inflateBackInit() must be called first to allocate the internal state
+ and to initialize the state with the user-provided window buffer.
+ inflateBack() may then be used multiple times to inflate a complete, raw
+ deflate stream with each call. inflateBackEnd() is then called to free
+ the allocated state.
+
+ A raw deflate stream is one with no zlib or gzip header or trailer.
+ This routine would normally be used in a utility that reads zip or gzip
+ files and writes out uncompressed files. The utility would decode the
+ header and process the trailer on its own, hence this routine expects
+ only the raw deflate stream to decompress. This is different from the
+ normal behavior of inflate(), which expects either a zlib or gzip header and
+ trailer around the deflate stream.
+
+ inflateBack() uses two subroutines supplied by the caller that are then
+ called by inflateBack() for input and output. inflateBack() calls those
+ routines until it reads a complete deflate stream and writes out all of the
+ uncompressed data, or until it encounters an error. The function's
+ parameters and return types are defined above in the in_func and out_func
+ typedefs. inflateBack() will call in(in_desc, &buf) which should return the
+ number of bytes of provided input, and a pointer to that input in buf. If
+ there is no input available, in() must return zero--buf is ignored in that
+ case--and inflateBack() will return a buffer error. inflateBack() will call
+ out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out()
+ should return zero on success, or non-zero on failure. If out() returns
+ non-zero, inflateBack() will return with an error. Neither in() nor out()
+ are permitted to change the contents of the window provided to
+ inflateBackInit(), which is also the buffer that out() uses to write from.
+ The length written by out() will be at most the window size. Any non-zero
+ amount of input may be provided by in().
+
+ For convenience, inflateBack() can be provided input on the first call by
+ setting strm->next_in and strm->avail_in. If that input is exhausted, then
+ in() will be called. Therefore strm->next_in must be initialized before
+ calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called
+ immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in
+ must also be initialized, and then if strm->avail_in is not zero, input will
+ initially be taken from strm->next_in[0 .. strm->avail_in - 1].
+
+ The in_desc and out_desc parameters of inflateBack() are passed as the
+ first parameter of in() and out() respectively when they are called. These
+ descriptors can be optionally used to pass any information that the caller-
+ supplied in() and out() functions need to do their job.
+
+ On return, inflateBack() will set strm->next_in and strm->avail_in to
+ pass back any unused input that was provided by the last in() call. The
+ return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR
+ if in() or out() returned an error, Z_DATA_ERROR if there was a format
+ error in the deflate stream (in which case strm->msg is set to indicate the
+ nature of the error), or Z_STREAM_ERROR if the stream was not properly
+ initialized. In the case of Z_BUF_ERROR, an input or output error can be
+ distinguished using strm->next_in which will be Z_NULL only if in() returned
+ an error. If strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to
+ out() returning non-zero. (in() will always be called before out(), so
+ strm->next_in is assured to be defined if out() returns non-zero.) Note
+ that inflateBack() cannot return Z_OK.
+*/
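+
+/*
+   A minimal usage sketch of the above: in()/out() callbacks matching the
+   in_func and out_func typedefs. struct membuf, m, write_all() and out_desc
+   are placeholders supplied only for illustration; a real application would
+   typically read from and write to files.
+
+     struct membuf { unsigned char *buf; unsigned len; };
+
+     static unsigned my_in(void *desc, unsigned char **bufp)
+     {
+         struct membuf *m = (struct membuf *)desc;
+         unsigned n = m->len;
+
+         *bufp = m->buf;
+         m->len = 0;
+         return n;
+     }
+
+     static int my_out(void *desc, unsigned char *buf, unsigned len)
+     {
+         return write_all(desc, buf, len) ? 0 : 1;
+     }
+
+   After inflateBackInit(&strm, 15, window), a complete raw deflate stream
+   is then decompressed with a single call:
+
+     ret = inflateBack(&strm, my_in, &m, my_out, out_desc);
+*/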
+
+ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm));
+/*
+ All memory allocated by inflateBackInit() is freed.
+
+ inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream
+ state was inconsistent.
+*/
+
+ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void));
+/* Return flags indicating compile-time options.
+
+ Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other:
+ 1.0: size of uInt
+ 3.2: size of uLong
+ 5.4: size of voidpf (pointer)
+ 7.6: size of z_off_t
+
+ Compiler, assembler, and debug options:
+ 8: DEBUG
+ 9: ASMV or ASMINF -- use ASM code
+ 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention
+ 11: 0 (reserved)
+
+ One-time table building (smaller code, but not thread-safe if true):
+ 12: BUILDFIXED -- build static block decoding tables when needed
+ 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed
+ 14,15: 0 (reserved)
+
+ Library content (indicates missing functionality):
+ 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking
+ deflate code when not needed)
+ 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect
+ and decode gzip streams (to avoid linking crc code)
+ 18-19: 0 (reserved)
+
+ Operation variations (changes in library functionality):
+ 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate
+ 21: FASTEST -- deflate algorithm with only one, lowest compression level
+ 22,23: 0 (reserved)
+
+ The sprintf variant used by gzprintf (zero is best):
+ 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format
+ 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure!
+ 26: 0 = returns value, 1 = void -- 1 means inferred string length returned
+
+ Remainder:
+ 27-31: 0 (reserved)
+ */
+
+
+ /* utility functions */
+
+/*
+ The following utility functions are implemented on top of the
+ basic stream-oriented functions. To simplify the interface, some
+ default options are assumed (compression level and memory usage,
+ standard memory allocation functions). The source code of these
+ utility functions can easily be modified if you need special options.
+*/
+
+ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen));
+/*
+ Compresses the source buffer into the destination buffer. sourceLen is
+ the byte length of the source buffer. Upon entry, destLen is the total
+ size of the destination buffer, which must be at least the value returned
+ by compressBound(sourceLen). Upon exit, destLen is the actual size of the
+ compressed buffer.
+ This function can be used to compress a whole file at once if the
+ input file is mmap'ed.
+ compress returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_BUF_ERROR if there was not enough room in the output
+ buffer.
+*/
+
+ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen,
+ int level));
+/*
+ Compresses the source buffer into the destination buffer. The level
+ parameter has the same meaning as in deflateInit. sourceLen is the byte
+ length of the source buffer. Upon entry, destLen is the total size of the
+ destination buffer, which must be at least the value returned by
+ compressBound(sourceLen). Upon exit, destLen is the actual size of the
+ compressed buffer.
+
+ compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ Z_STREAM_ERROR if the level parameter is invalid.
+*/
+
+ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen));
+/*
+ compressBound() returns an upper bound on the compressed size after
+ compress() or compress2() on sourceLen bytes. It would be used before
+ a compress() or compress2() call to allocate the destination buffer.
+*/
+
+ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen));
+/*
+ Decompresses the source buffer into the destination buffer. sourceLen is
+ the byte length of the source buffer. Upon entry, destLen is the total
+ size of the destination buffer, which must be large enough to hold the
+ entire uncompressed data. (The size of the uncompressed data must have
+ been saved previously by the compressor and transmitted to the decompressor
+ by some mechanism outside the scope of this compression library.)
+ Upon exit, destLen is the actual size of the uncompressed data.
+ This function can be used to decompress a whole file at once if the
+ input file is mmap'ed.
+
+ uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_BUF_ERROR if there was not enough room in the output
+ buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete.
+*/
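+
+/*
+   A minimal usage sketch of the utility functions above: a compress/
+   uncompress round trip. src, srcLen and the use of malloc() are
+   placeholders supplied only for illustration.
+
+     uLong clen = compressBound(srcLen);
+     uLong ulen = srcLen;
+     Bytef *cbuf = (Bytef *)malloc(clen);
+     Bytef *ubuf = (Bytef *)malloc(srcLen);
+
+     if (compress(cbuf, &clen, src, srcLen) != Z_OK)
+         return -1;
+     if (uncompress(ubuf, &ulen, cbuf, clen) != Z_OK)
+         return -1;
+     ... ulen now equals srcLen and ubuf matches src ...
+*/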
+
+
+typedef voidp gzFile;
+
+ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode));
+/*
+ Opens a gzip (.gz) file for reading or writing. The mode parameter
+ is as in fopen ("rb" or "wb") but can also include a compression level
+ ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for
+ Huffman only compression as in "wb1h", or 'R' for run-length encoding
+ as in "wb1R". (See the description of deflateInit2 for more information
+ about the strategy parameter.)
+
+ gzopen can be used to read a file which is not in gzip format; in this
+ case gzread will directly read from the file without decompression.
+
+ gzopen returns NULL if the file could not be opened or if there was
+ insufficient memory to allocate the (de)compression state; errno
+ can be checked to distinguish the two cases (if errno is zero, the
+ zlib error is Z_MEM_ERROR). */
+
+ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode));
+/*
+ gzdopen() associates a gzFile with the file descriptor fd. File
+ descriptors are obtained from calls like open, dup, creat, pipe or
+ fileno (if the file has been previously opened with fopen).
+ The mode parameter is as in gzopen.
+ The next call of gzclose on the returned gzFile will also close the
+ file descriptor fd, just like fclose(fdopen(fd, mode)) closes the file
+ descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode).
+ gzdopen returns NULL if there was insufficient memory to allocate
+ the (de)compression state.
+*/
+
+ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy));
+/*
+ Dynamically update the compression level or strategy. See the description
+ of deflateInit2 for the meaning of these parameters.
+ gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not
+ opened for writing.
+*/
+
+ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len));
+/*
+ Reads the given number of uncompressed bytes from the compressed file.
+ If the input file was not in gzip format, gzread copies the given number
+ of bytes into the buffer.
+ gzread returns the number of uncompressed bytes actually read (0 for
+ end of file, -1 for error). */
+
+ZEXTERN int ZEXPORT gzwrite OF((gzFile file,
+ voidpc buf, unsigned len));
+/*
+ Writes the given number of uncompressed bytes into the compressed file.
+ gzwrite returns the number of uncompressed bytes actually written
+ (0 in case of error).
+*/
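+
+/*
+   A minimal usage sketch of the above: writing a buffer to a .gz file at
+   compression level 9 and reading it back. path, buf and len are
+   placeholders supplied only for illustration.
+
+     gzFile gz = gzopen(path, "wb9");
+
+     if (gz == NULL)
+         return -1;
+     if (gzwrite(gz, buf, len) != (int)len)
+         return -1;
+     (void) gzclose(gz);
+
+     gz = gzopen(path, "rb");
+     if (gz != NULL) {
+         int n = gzread(gz, buf, len);
+         (void) gzclose(gz);
+     }
+*/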
+
+ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...));
+/*
+ Converts, formats, and writes the args to the compressed file under
+ control of the format string, as in fprintf. gzprintf returns the number of
+ uncompressed bytes actually written (0 in case of error). The number of
+ uncompressed bytes written is limited to 4095. The caller should assure that
+ this limit is not exceeded. If it is exceeded, then gzprintf() will
+ return an error (0) with nothing written. In this case, there may also be a
+ buffer overflow with unpredictable consequences, which is possible only if
+ zlib was compiled with the insecure functions sprintf() or vsprintf()
+ because the secure snprintf() or vsnprintf() functions were not available.
+*/
+
+ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s));
+/*
+ Writes the given null-terminated string to the compressed file, excluding
+ the terminating null character.
+ gzputs returns the number of characters written, or -1 in case of error.
+*/
+
+ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len));
+/*
+ Reads bytes from the compressed file until len-1 characters are read, or
+ a newline character is read and transferred to buf, or an end-of-file
+ condition is encountered. The string is then terminated with a null
+ character.
+ gzgets returns buf, or Z_NULL in case of error.
+*/
+
+ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c));
+/*
+ Writes c, converted to an unsigned char, into the compressed file.
+ gzputc returns the value that was written, or -1 in case of error.
+*/
+
+ZEXTERN int ZEXPORT gzgetc OF((gzFile file));
+/*
+ Reads one byte from the compressed file. gzgetc returns this byte
+ or -1 in case of end of file or error.
+*/
+
+ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file));
+/*
+ Push one character back onto the stream to be read again later.
+ Only one character of push-back is allowed. gzungetc() returns the
+ character pushed, or -1 on failure. gzungetc() will fail if a
+ character has been pushed but not read yet, or if c is -1. The pushed
+ character will be discarded if the stream is repositioned with gzseek()
+ or gzrewind().
+*/
+
+ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush));
+/*
+ Flushes all pending output into the compressed file. The parameter
+ flush is as in the deflate() function. The return value is the zlib
+ error number (see function gzerror below). gzflush returns Z_OK if
+ the flush parameter is Z_FINISH and all output could be flushed.
+ gzflush should be called only when strictly necessary because it can
+ degrade compression.
+*/
+
+ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file,
+ z_off_t offset, int whence));
+/*
+ Sets the starting position for the next gzread or gzwrite on the
+ given compressed file. The offset represents a number of bytes in the
+ uncompressed data stream. The whence parameter is defined as in lseek(2);
+ the value SEEK_END is not supported.
+ If the file is opened for reading, this function is emulated but can be
+ extremely slow. If the file is opened for writing, only forward seeks are
+ supported; gzseek then compresses a sequence of zeroes up to the new
+ starting position.
+
+ gzseek returns the resulting offset location as measured in bytes from
+ the beginning of the uncompressed stream, or -1 in case of error, in
+ particular if the file is opened for writing and the new starting position
+ would be before the current position.
+*/
+
+ZEXTERN int ZEXPORT gzrewind OF((gzFile file));
+/*
+ Rewinds the given file. This function is supported only for reading.
+
+ gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET)
+*/
+
+ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file));
+/*
+ Returns the starting position for the next gzread or gzwrite on the
+ given compressed file. This position represents a number of bytes in the
+ uncompressed data stream.
+
+ gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR)
+*/
+
+ZEXTERN int ZEXPORT gzeof OF((gzFile file));
+/*
+ Returns 1 when EOF has previously been detected reading the given
+ input stream, otherwise zero.
+*/
+
+ZEXTERN int ZEXPORT gzdirect OF((gzFile file));
+/*
+ Returns 1 if file is being read directly without decompression, otherwise
+ zero.
+*/
+
+ZEXTERN int ZEXPORT gzclose OF((gzFile file));
+/*
+ Flushes all pending output if necessary, closes the compressed file
+ and deallocates all the (de)compression state. The return value is the zlib
+ error number (see function gzerror below).
+*/
+
+ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum));
+/*
+ Returns the error message for the last error which occurred on the
+ given compressed file. errnum is set to zlib error number. If an
+ error occurred in the file system and not in the compression library,
+ errnum is set to Z_ERRNO and the application may consult errno
+ to get the exact error code.
+*/
+
+ZEXTERN void ZEXPORT gzclearerr OF((gzFile file));
+/*
+ Clears the error and end-of-file flags for file. This is analogous to the
+ clearerr() function in stdio. This is useful for continuing to read a gzip
+ file that is being written concurrently.
+*/
+
+ /* checksum functions */
+
+/*
+ These functions are not related to compression but are exported
+ anyway because they might be useful in applications using the
+ compression library.
+*/
+
+ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
+/*
+ Update a running Adler-32 checksum with the bytes buf[0..len-1] and
+ return the updated checksum. If buf is NULL, this function returns
+ the required initial value for the checksum.
+ An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
+ much faster. Usage example:
+
+ uLong adler = adler32(0L, Z_NULL, 0);
+
+ while (read_buffer(buffer, length) != EOF) {
+ adler = adler32(adler, buffer, length);
+ }
+ if (adler != original_adler) error();
+*/
+
+ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
+ z_off_t len2));
+/*
+ Combine two Adler-32 checksums into one. For two sequences of bytes, seq1
+ and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
+ each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of
+ seq1 and seq2 concatenated, requiring only adler1, adler2, and len2.
+*/
+
+ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
+/*
+ Update a running CRC-32 with the bytes buf[0..len-1] and return the
+ updated CRC-32. If buf is NULL, this function returns the required initial
+ value for the crc. Pre- and post-conditioning (one's complement) is
+ performed within this function so it shouldn't be done by the application.
+ Usage example:
+
+ uLong crc = crc32(0L, Z_NULL, 0);
+
+ while (read_buffer(buffer, length) != EOF) {
+ crc = crc32(crc, buffer, length);
+ }
+ if (crc != original_crc) error();
+*/
+
+ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2));
+
+/*
+ Combine two CRC-32 check values into one. For two sequences of bytes,
+ seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
+ calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32
+ check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and
+ len2.
+*/
+
+
+ /* various hacks, don't look :) */
+
+/* deflateInit and inflateInit are macros to allow checking the zlib version
+ * and the compiler's view of z_stream:
+ */
+ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method,
+ int windowBits, int memLevel,
+ int strategy, const char *version,
+ int stream_size));
+ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits,
+ unsigned char FAR *window,
+ const char *version,
+ int stream_size));
+#define deflateInit(strm, level) \
+ deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream))
+#define inflateInit(strm) \
+ inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream))
+#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
+ deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
+ (strategy), ZLIB_VERSION, sizeof(z_stream))
+#define inflateInit2(strm, windowBits) \
+ inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream))
+#define inflateBackInit(strm, windowBits, window) \
+ inflateBackInit_((strm), (windowBits), (window), \
+ ZLIB_VERSION, sizeof(z_stream))
+
+
+#if !defined(_ZUTIL_H) && !defined(NO_DUMMY_DECL)
+ struct internal_state {int dummy;}; /* hack for buggy compilers */
+#endif
+
+ZEXTERN const char * ZEXPORT zError OF((int));
+ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z));
+ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZLIB_H */
diff --git a/sys/contrib/opensolaris/uts/common/zmod/zmod.c b/sys/contrib/opensolaris/uts/common/zmod/zmod.c
new file mode 100644
index 000000000000..2627239c4b47
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/zmod.c
@@ -0,0 +1,109 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/zmod.h>
+
+#include "zlib.h"
+
+/*
+ * Uncompress the buffer 'src' into the buffer 'dst'. The caller must store
+ * the expected decompressed data size externally so it can be passed in.
+ * The resulting decompressed size is then returned through dstlen. This
+ * function returns Z_OK on success, or another error code on failure.
+ */
+int
+z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
+{
+ z_stream zs;
+ int err;
+
+ bzero(&zs, sizeof (zs));
+ zs.next_in = (uchar_t *)src;
+ zs.avail_in = srclen;
+ zs.next_out = dst;
+ zs.avail_out = *dstlen;
+
+ if ((err = inflateInit(&zs)) != Z_OK)
+ return (err);
+
+ if ((err = inflate(&zs, Z_FINISH)) != Z_STREAM_END) {
+ (void) inflateEnd(&zs);
+ return (err == Z_OK ? Z_BUF_ERROR : err);
+ }
+
+ *dstlen = zs.total_out;
+ return (inflateEnd(&zs));
+}
+
+int
+z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
+ int level)
+{
+
+ z_stream zs;
+ int err;
+
+ bzero(&zs, sizeof (zs));
+ zs.next_in = (uchar_t *)src;
+ zs.avail_in = srclen;
+ zs.next_out = dst;
+ zs.avail_out = *dstlen;
+
+ if ((err = deflateInit(&zs, level)) != Z_OK)
+ return (err);
+
+ if ((err = deflate(&zs, Z_FINISH)) != Z_STREAM_END) {
+ (void) deflateEnd(&zs);
+ return (err == Z_OK ? Z_BUF_ERROR : err);
+ }
+
+ *dstlen = zs.total_out;
+ return (deflateEnd(&zs));
+}
+
+int
+z_compress(void *dst, size_t *dstlen, const void *src, size_t srclen)
+{
+ return (z_compress_level(dst, dstlen, src, srclen,
+ Z_DEFAULT_COMPRESSION));
+}
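+
+/*
+ * A minimal usage sketch (an illustration only, not part of this file's
+ * interface): a round trip through z_compress() and z_uncompress(). The
+ * caller must record the original length out of band, since z_uncompress()
+ * needs the expected decompressed size passed in through dstlen. The
+ * buffers, sizes and the EIO return value below are placeholders.
+ *
+ *	size_t clen = cbuf_size;
+ *	size_t ulen = src_len;
+ *
+ *	if (z_compress(cbuf, &clen, src, src_len) != Z_OK)
+ *		return (EIO);
+ *	if (z_uncompress(ubuf, &ulen, cbuf, clen) != Z_OK)
+ *		return (EIO);
+ */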
+
+/*
+ * Convert a zlib error code into a string error message.
+ */
+const char *
+z_strerror(int err)
+{
+ int i = Z_NEED_DICT - err;
+
+ if (i < 0 || i > Z_NEED_DICT - Z_VERSION_ERROR)
+ return ("unknown error");
+
+ return (zError(err));
+}
diff --git a/sys/contrib/opensolaris/uts/common/zmod/zmod_subr.c b/sys/contrib/opensolaris/uts/common/zmod/zmod_subr.c
new file mode 100644
index 000000000000..0542712c37f5
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/zmod_subr.c
@@ -0,0 +1,84 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/systm.h>
+#include <sys/cmn_err.h>
+#include <sys/kobj.h>
+
+struct zchdr {
+ uint_t zch_magic;
+ uint_t zch_size;
+};
+
+#define ZCH_MAGIC 0x3cc13cc1
+
+/*ARGSUSED*/
+void *
+zcalloc(void *opaque, uint_t items, uint_t size)
+{
+ size_t nbytes = sizeof (struct zchdr) + items * size;
+ struct zchdr *z = kobj_zalloc(nbytes, KM_NOWAIT|KM_TMP);
+
+ if (z == NULL)
+ return (NULL);
+
+ z->zch_magic = ZCH_MAGIC;
+ z->zch_size = nbytes;
+
+ return (z + 1);
+}
+
+/*ARGSUSED*/
+void
+zcfree(void *opaque, void *ptr)
+{
+ struct zchdr *z = ((struct zchdr *)ptr) - 1;
+
+ if (z->zch_magic != ZCH_MAGIC)
+ panic("zcfree region corrupt: hdr=%p ptr=%p", (void *)z, ptr);
+
+ kobj_free(z, z->zch_size);
+}
+
+void
+zmemcpy(void *dest, const void *source, uint_t len)
+{
+ bcopy(source, dest, len);
+}
+
+int
+zmemcmp(const void *s1, const void *s2, uint_t len)
+{
+ return (bcmp(s1, s2, len));
+}
+
+void
+zmemzero(void *dest, uint_t len)
+{
+ bzero(dest, len);
+}
diff --git a/sys/contrib/opensolaris/uts/common/zmod/zutil.c b/sys/contrib/opensolaris/uts/common/zmod/zutil.c
new file mode 100644
index 000000000000..7d46e30b3edf
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/zutil.c
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* zutil.c -- target dependent utility functions for the compression library
+ * Copyright (C) 1995-2005 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "zutil.h"
+
+#ifndef NO_DUMMY_DECL
+struct internal_state {int dummy;}; /* for buggy compilers */
+#endif
+
+const char * const z_errmsg[10] = {
+"need dictionary", /* Z_NEED_DICT 2 */
+"stream end", /* Z_STREAM_END 1 */
+"", /* Z_OK 0 */
+"file error", /* Z_ERRNO (-1) */
+"stream error", /* Z_STREAM_ERROR (-2) */
+"data error", /* Z_DATA_ERROR (-3) */
+"insufficient memory", /* Z_MEM_ERROR (-4) */
+"buffer error", /* Z_BUF_ERROR (-5) */
+"incompatible version",/* Z_VERSION_ERROR (-6) */
+""};
+
+
+const char * ZEXPORT zlibVersion()
+{
+ return ZLIB_VERSION;
+}
+
+uLong ZEXPORT zlibCompileFlags()
+{
+ uLong flags;
+
+ flags = 0;
+ switch (sizeof(uInt)) {
+ case 2: break;
+ case 4: flags += 1; break;
+ case 8: flags += 2; break;
+ default: flags += 3;
+ }
+ switch (sizeof(uLong)) {
+ case 2: break;
+ case 4: flags += 1 << 2; break;
+ case 8: flags += 2 << 2; break;
+ default: flags += 3 << 2;
+ }
+ switch (sizeof(voidpf)) {
+ case 2: break;
+ case 4: flags += 1 << 4; break;
+ case 8: flags += 2 << 4; break;
+ default: flags += 3 << 4;
+ }
+ switch (sizeof(z_off_t)) {
+ case 2: break;
+ case 4: flags += 1 << 6; break;
+ case 8: flags += 2 << 6; break;
+ default: flags += 3 << 6;
+ }
+#ifdef DEBUG
+ flags += 1 << 8;
+#endif
+#if defined(ASMV) || defined(ASMINF)
+ flags += 1 << 9;
+#endif
+#ifdef ZLIB_WINAPI
+ flags += 1 << 10;
+#endif
+#ifdef BUILDFIXED
+ flags += 1 << 12;
+#endif
+#ifdef DYNAMIC_CRC_TABLE
+ flags += 1 << 13;
+#endif
+#ifdef NO_GZCOMPRESS
+ flags += 1L << 16;
+#endif
+#ifdef NO_GZIP
+ flags += 1L << 17;
+#endif
+#ifdef PKZIP_BUG_WORKAROUND
+ flags += 1L << 20;
+#endif
+#ifdef FASTEST
+ flags += 1L << 21;
+#endif
+#ifdef STDC
+# ifdef NO_vsnprintf
+ flags += 1L << 25;
+# ifdef HAS_vsprintf_void
+ flags += 1L << 26;
+# endif
+# else
+# ifdef HAS_vsnprintf_void
+ flags += 1L << 26;
+# endif
+# endif
+#else
+ flags += 1L << 24;
+# ifdef NO_snprintf
+ flags += 1L << 25;
+# ifdef HAS_sprintf_void
+ flags += 1L << 26;
+# endif
+# else
+# ifdef HAS_snprintf_void
+ flags += 1L << 26;
+# endif
+# endif
+#endif
+ return flags;
+}
+
+#ifdef DEBUG
+
+# ifndef verbose
+# define verbose 0
+# endif
+int z_verbose = verbose;
+
+void z_error (m)
+ char *m;
+{
+ fprintf(stderr, "%s\n", m);
+ exit(1);
+}
+#endif
+
+/* exported to allow conversion of error code to string for compress() and
+ * uncompress()
+ */
+const char * ZEXPORT zError(err)
+ int err;
+{
+ return ERR_MSG(err);
+}
+
+#if defined(_WIN32_WCE)
+ /* The Microsoft C Run-Time Library for Windows CE doesn't have
+ * errno. We define it as a global variable to simplify porting.
+ * Its value is always 0 and should not be used.
+ */
+ int errno = 0;
+#endif
+
+#define HAVE_MEMCPY
+#ifndef HAVE_MEMCPY
+
+void zmemcpy(dest, source, len)
+ Bytef* dest;
+ const Bytef* source;
+ uInt len;
+{
+ if (len == 0) return;
+ do {
+ *dest++ = *source++; /* ??? to be unrolled */
+ } while (--len != 0);
+}
+
+int zmemcmp(s1, s2, len)
+ const Bytef* s1;
+ const Bytef* s2;
+ uInt len;
+{
+ uInt j;
+
+ for (j = 0; j < len; j++) {
+ if (s1[j] != s2[j]) return 2*(s1[j] > s2[j])-1;
+ }
+ return 0;
+}
+
+void zmemzero(dest, len)
+ Bytef* dest;
+ uInt len;
+{
+ if (len == 0) return;
+ do {
+ *dest++ = 0; /* ??? to be unrolled */
+ } while (--len != 0);
+}
+#endif
+
+
+#ifdef SYS16BIT
+
+#ifdef __TURBOC__
+/* Turbo C in 16-bit mode */
+
+# define MY_ZCALLOC
+
+/* Turbo C malloc() does not allow dynamic allocation of 64K bytes
+ * and farmalloc(64K) returns a pointer with an offset of 8, so we
+ * must fix the pointer. Warning: the pointer must be put back to its
+ * original form in order to free it, use zcfree().
+ */
+
+#define MAX_PTR 10
+/* 10*64K = 640K */
+
+local int next_ptr = 0;
+
+typedef struct ptr_table_s {
+ voidpf org_ptr;
+ voidpf new_ptr;
+} ptr_table;
+
+local ptr_table table[MAX_PTR];
+/* This table is used to remember the original form of pointers
+ * to large buffers (64K). Such pointers are normalized with a zero offset.
+ * Since MSDOS is not a preemptive multitasking OS, this table is not
+ * protected from concurrent access. This hack doesn't work anyway on
+ * a protected system like OS/2. Use Microsoft C instead.
+ */
+
+voidpf zcalloc (voidpf opaque, unsigned items, unsigned size)
+{
+ voidpf buf = opaque; /* just to make some compilers happy */
+ ulg bsize = (ulg)items*size;
+
+ /* If we allocate less than 65520 bytes, we assume that farmalloc
+ * will return a usable pointer which doesn't have to be normalized.
+ */
+ if (bsize < 65520L) {
+ buf = farmalloc(bsize);
+ if (*(ush*)&buf != 0) return buf;
+ } else {
+ buf = farmalloc(bsize + 16L);
+ }
+ if (buf == NULL || next_ptr >= MAX_PTR) return NULL;
+ table[next_ptr].org_ptr = buf;
+
+ /* Normalize the pointer to seg:0 */
+ *((ush*)&buf+1) += ((ush)((uch*)buf-0) + 15) >> 4;
+ *(ush*)&buf = 0;
+ table[next_ptr++].new_ptr = buf;
+ return buf;
+}
+
+void zcfree (voidpf opaque, voidpf ptr)
+{
+ int n;
+ if (*(ush*)&ptr != 0) { /* object < 64K */
+ farfree(ptr);
+ return;
+ }
+ /* Find the original pointer */
+ for (n = 0; n < next_ptr; n++) {
+ if (ptr != table[n].new_ptr) continue;
+
+ farfree(table[n].org_ptr);
+ while (++n < next_ptr) {
+ table[n-1] = table[n];
+ }
+ next_ptr--;
+ return;
+ }
+ ptr = opaque; /* just to make some compilers happy */
+ Assert(0, "zcfree: ptr not found");
+}
+
+#endif /* __TURBOC__ */
+
+
+#ifdef M_I86
+/* Microsoft C in 16-bit mode */
+
+# define MY_ZCALLOC
+
+#if (!defined(_MSC_VER) || (_MSC_VER <= 600))
+# define _halloc halloc
+# define _hfree hfree
+#endif
+
+voidpf zcalloc (voidpf opaque, unsigned items, unsigned size)
+{
+ if (opaque) opaque = 0; /* to make compiler happy */
+ return _halloc((long)items, size);
+}
+
+void zcfree (voidpf opaque, voidpf ptr)
+{
+ if (opaque) opaque = 0; /* to make compiler happy */
+ _hfree(ptr);
+}
+
+#endif /* M_I86 */
+
+#endif /* SYS16BIT */
+
+
+#ifndef MY_ZCALLOC /* Any system without a special alloc function */
+
+#ifndef STDC
+extern voidp malloc OF((uInt size));
+extern voidp calloc OF((uInt items, uInt size));
+extern void free OF((voidpf ptr));
+#endif
+
+voidpf zcalloc (opaque, items, size)
+ voidpf opaque;
+ unsigned items;
+ unsigned size;
+{
+ if (opaque) items += size - size; /* make compiler happy */
+ return sizeof(uInt) > 2 ? (voidpf)malloc(items * size) :
+ (voidpf)calloc(items, size);
+}
+
+void zcfree (opaque, ptr)
+ voidpf opaque;
+ voidpf ptr;
+{
+ free(ptr);
+ if (opaque) return; /* make compiler happy */
+}
+
+#endif /* MY_ZCALLOC */
diff --git a/sys/contrib/opensolaris/uts/common/zmod/zutil.h b/sys/contrib/opensolaris/uts/common/zmod/zutil.h
new file mode 100644
index 000000000000..1d02c1d09783
--- /dev/null
+++ b/sys/contrib/opensolaris/uts/common/zmod/zutil.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* zutil.h -- internal interface and configuration of the compression library
+ * Copyright (C) 1995-2005 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+ part of the implementation of the compression library and is
+ subject to change. Applications should only use zlib.h.
+ */
+
+#ifndef _ZUTIL_H
+#define _ZUTIL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#define ZLIB_INTERNAL
+#include "zlib.h"
+
+#ifdef STDC
+# ifndef _WIN32_WCE
+# include <stddef.h>
+# endif
+# include <string.h>
+# include <stdlib.h>
+#endif
+#ifdef NO_ERRNO_H
+# ifdef _WIN32_WCE
+ /* The Microsoft C Run-Time Library for Windows CE doesn't have
+ * errno. We define it as a global variable to simplify porting.
+ * Its value is always 0 and should not be used. We rename it to
+ * avoid conflict with other libraries that use the same workaround.
+ */
+# define errno z_errno
+# endif
+ extern int errno;
+#else
+# ifndef _WIN32_WCE
+# include <sys/errno.h>
+# endif
+#endif
+
+#ifndef local
+# define local static
+#endif
+/* compile with -Dlocal if your debugger can't find static symbols */
+
+typedef unsigned char uch;
+typedef uch FAR uchf;
+typedef unsigned short ush;
+typedef ush FAR ushf;
+typedef unsigned long ulg;
+
+extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
+/* (size given to avoid silly warnings with Visual C++) */
+
+#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)]
+
+#define ERR_RETURN(strm,err) \
+ return (strm->msg = (char*)ERR_MSG(err), (err))
+/* To be used only when the state is known to be valid */
+
+ /* common constants */
+
+#ifndef DEF_WBITS
+# define DEF_WBITS MAX_WBITS
+#endif
+/* default windowBits for decompression. MAX_WBITS is for compression only */
+
+#if MAX_MEM_LEVEL >= 8
+# define DEF_MEM_LEVEL 8
+#else
+# define DEF_MEM_LEVEL MAX_MEM_LEVEL
+#endif
+/* default memLevel */
+
+#define STORED_BLOCK 0
+#define STATIC_TREES 1
+#define DYN_TREES 2
+/* The three kinds of block type */
+
+#define MIN_MATCH 3
+#define MAX_MATCH 258
+/* The minimum and maximum match lengths */
+
+#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */
+
+ /* target dependencies */
+
+#if defined(MSDOS) || (defined(WINDOWS) && !defined(WIN32))
+# define OS_CODE 0x00
+# if defined(__TURBOC__) || defined(__BORLANDC__)
+# if(__STDC__ == 1) && (defined(__LARGE__) || defined(__COMPACT__))
+ /* Allow compilation with ANSI keywords only enabled */
+ void _Cdecl farfree( void *block );
+ void *_Cdecl farmalloc( unsigned long nbytes );
+# else
+# include <alloc.h>
+# endif
+# else /* MSC or DJGPP */
+# include <malloc.h>
+# endif
+#endif
+
+#ifdef AMIGA
+# define OS_CODE 0x01
+#endif
+
+#if defined(VAXC) || defined(VMS)
+# define OS_CODE 0x02
+# define F_OPEN(name, mode) \
+ fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512")
+#endif
+
+#if defined(ATARI) || defined(atarist)
+# define OS_CODE 0x05
+#endif
+
+#ifdef OS2
+# define OS_CODE 0x06
+# ifdef M_I86
+ #include <malloc.h>
+# endif
+#endif
+
+#if defined(MACOS) || defined(TARGET_OS_MAC)
+# define OS_CODE 0x07
+# if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os
+# include <unix.h> /* for fdopen */
+# else
+# ifndef fdopen
+# define fdopen(fd,mode) NULL /* No fdopen() */
+# endif
+# endif
+#endif
+
+#ifdef TOPS20
+# define OS_CODE 0x0a
+#endif
+
+#ifdef WIN32
+# ifndef __CYGWIN__ /* Cygwin is Unix, not Win32 */
+# define OS_CODE 0x0b
+# endif
+#endif
+
+#ifdef __50SERIES /* Prime/PRIMOS */
+# define OS_CODE 0x0f
+#endif
+
+#if defined(_BEOS_) || defined(RISCOS)
+# define fdopen(fd,mode) NULL /* No fdopen() */
+#endif
+
+#if (defined(_MSC_VER) && (_MSC_VER > 600))
+# if defined(_WIN32_WCE)
+# define fdopen(fd,mode) NULL /* No fdopen() */
+# ifndef _PTRDIFF_T_DEFINED
+ typedef int ptrdiff_t;
+# define _PTRDIFF_T_DEFINED
+# endif
+# else
+# define fdopen(fd,type) _fdopen(fd,type)
+# endif
+#endif
+
+ /* common defaults */
+
+#ifndef OS_CODE
+# define OS_CODE 0x03 /* assume Unix */
+#endif
+
+#ifndef F_OPEN
+# define F_OPEN(name, mode) fopen((name), (mode))
+#endif
+
+ /* functions */
+
+#if defined(STDC99) || (defined(__TURBOC__) && __TURBOC__ >= 0x550)
+# ifndef HAVE_VSNPRINTF
+# define HAVE_VSNPRINTF
+# endif
+#endif
+#if defined(__CYGWIN__)
+# ifndef HAVE_VSNPRINTF
+# define HAVE_VSNPRINTF
+# endif
+#endif
+#ifndef HAVE_VSNPRINTF
+# ifdef MSDOS
+ /* vsnprintf may exist on some MS-DOS compilers (DJGPP?),
+ but for now we just assume it doesn't. */
+# define NO_vsnprintf
+# endif
+# ifdef __TURBOC__
+# define NO_vsnprintf
+# endif
+# ifdef WIN32
+ /* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
+# if !defined(vsnprintf) && !defined(NO_vsnprintf)
+# define vsnprintf _vsnprintf
+# endif
+# endif
+# ifdef __SASC
+# define NO_vsnprintf
+# endif
+#endif
+#ifdef VMS
+# define NO_vsnprintf
+#endif
+
+#if defined(pyr)
+# define NO_MEMCPY
+#endif
+#if defined(SMALL_MEDIUM) && !defined(_MSC_VER) && !defined(__SC__)
+ /* Use our own functions for small and medium model with MSC <= 5.0.
+ * You may have to use the same strategy for Borland C (untested).
+ * The __SC__ check is for Symantec.
+ */
+# define NO_MEMCPY
+#endif
+#if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY)
+# define HAVE_MEMCPY
+#endif
+#ifdef HAVE_MEMCPY
+# ifdef SMALL_MEDIUM /* MSDOS small or medium model */
+# define zmemcpy _fmemcpy
+# define zmemcmp _fmemcmp
+# define zmemzero(dest, len) _fmemset(dest, 0, len)
+# else
+# define zmemcpy memcpy
+# define zmemcmp memcmp
+# define zmemzero(dest, len) memset(dest, 0, len)
+# endif
+#else
+ extern void zmemcpy OF((void* dest, const void* source, uInt len));
+ extern int zmemcmp OF((const void* s1, const void* s2, uInt len));
+ extern void zmemzero OF((void* dest, uInt len));
+#endif
+
+/* Diagnostic functions */
+#ifdef DEBUG
+# include <stdio.h>
+ extern int z_verbose;
+ extern void z_error OF((char *m));
+# define Assert(cond,msg) {if(!(cond)) z_error(msg);}
+# define Trace(x) {if (z_verbose>=0) fprintf x ;}
+# define Tracev(x) {if (z_verbose>0) fprintf x ;}
+# define Tracevv(x) {if (z_verbose>1) fprintf x ;}
+# define Tracec(c,x) {if (z_verbose>0 && (c)) fprintf x ;}
+# define Tracecv(c,x) {if (z_verbose>1 && (c)) fprintf x ;}
+#else
+# define Assert(cond,msg)
+# define Trace(x)
+# define Tracev(x)
+# define Tracevv(x)
+# define Tracec(c,x)
+# define Tracecv(c,x)
+#endif
+
+
+voidpf zcalloc OF((voidpf opaque, unsigned items, unsigned size));
+void zcfree OF((voidpf opaque, voidpf ptr));
+
+#define ZALLOC(strm, items, size) \
+ (*((strm)->zalloc))((strm)->opaque, (items), (size))
+#define ZFREE(strm, addr) (*((strm)->zfree))((strm)->opaque, (voidpf)(addr))
+#define TRY_FREE(s, p) {if (p) ZFREE(s, p);}
+
+#endif /* _ZUTIL_H */