1 files changed, 1120 insertions, 0 deletions
diff --git a/usr.sbin/makefs/zfs/fs.c b/usr.sbin/makefs/zfs/fs.c
new file mode 100644
index 000000000000..75f6e30e1500
--- /dev/null
+++ b/usr.sbin/makefs/zfs/fs.c
@@ -0,0 +1,1120 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/stat.h>
+
+#include <assert.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <util.h>
+
+#include "makefs.h"
+#include "zfs.h"
+
+typedef struct {
+	const char	*name;
+	unsigned int	id;
+	uint16_t	size;
+	sa_bswap_type_t	bs;
+} zfs_sattr_t;
+
+typedef struct zfs_fs {
+	zfs_objset_t	*os;
+
+	/* Offset table for system attributes, indexed by a zpl_attr_t. */
+	uint16_t	*saoffs;
+	size_t		sacnt;
+	const zfs_sattr_t *satab;
+} zfs_fs_t;
+
+/*
+ * The order of the attributes doesn't matter, this is simply the one hard-coded
+ * by OpenZFS, based on a zdb dump of the SA_REGISTRY table.
+ */
+typedef enum zpl_attr {
+	ZPL_ATIME,
+	ZPL_MTIME,
+	ZPL_CTIME,
+	ZPL_CRTIME,
+	ZPL_GEN,
+	ZPL_MODE,
+	ZPL_SIZE,
+	ZPL_PARENT,
+	ZPL_LINKS,
+	ZPL_XATTR,
+	ZPL_RDEV,
+	ZPL_FLAGS,
+	ZPL_UID,
+	ZPL_GID,
+	ZPL_PAD,
+	ZPL_ZNODE_ACL,
+	ZPL_DACL_COUNT,
+	ZPL_SYMLINK,
+	ZPL_SCANSTAMP,
+	ZPL_DACL_ACES,
+	ZPL_DXATTR,
+	ZPL_PROJID,
+} zpl_attr_t;
+
+/*
+ * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t.
+ */
+static const zfs_sattr_t zpl_attrs[] = {
+#define	_ZPL_ATTR(n, s, b)	{ .name = #n, .id = n, .size = s, .bs = b }
+	_ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY),
+	_ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY),
+	_ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL),
+	_ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY),
+	_ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY),
+#undef ZPL_ATTR
+};
+
+/*
+ * This layout matches that of a filesystem created using OpenZFS on FreeBSD.
+ * It need not match in general, but FreeBSD's loader doesn't bother parsing the
+ * layout and just hard-codes attribute offsets.
+ */
+static const sa_attr_type_t zpl_attr_layout[] = {
+	ZPL_MODE,
+	ZPL_SIZE,
+	ZPL_GEN,
+	ZPL_UID,
+	ZPL_GID,
+	ZPL_PARENT,
+	ZPL_FLAGS,
+	ZPL_ATIME,
+	ZPL_MTIME,
+	ZPL_CTIME,
+	ZPL_CRTIME,
+	ZPL_LINKS,
+	ZPL_DACL_COUNT,
+	ZPL_DACL_ACES,
+	ZPL_SYMLINK,
+};
+
+/*
+ * Keys for the ZPL attribute tables in the SA layout ZAP.  The first two
+ * indices are reserved for legacy attribute encoding.
+ */
+#define	SA_LAYOUT_INDEX_DEFAULT	2
+#define	SA_LAYOUT_INDEX_SYMLINK	3
+
+struct fs_populate_dir {
+	SLIST_ENTRY(fs_populate_dir) next;
+	int			dirfd;
+	uint64_t		objid;
+	zfs_zap_t		*zap;
+};
+
+struct fs_populate_arg {
+	zfs_opt_t	*zfs;
+	zfs_fs_t	*fs;			/* owning filesystem */
+	uint64_t	rootdirid;		/* root directory dnode ID */
+	int		rootdirfd;		/* root directory fd */
+	SLIST_HEAD(, fs_populate_dir) dirs;	/* stack of directories */
+};
+
+static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int);
+
+static void
+eclose(int fd)
+{
+	if (close(fd) != 0)
+		err(1, "close");
+}
+
+static bool
+fsnode_isroot(const fsnode *cur)
+{
+	return (strcmp(cur->name, ".") == 0);
+}
+
+static bool
+fsnode_valid(const fsnode *cur)
+{
+	return (cur->type == S_IFREG || cur->type == S_IFDIR ||
+	    cur->type == S_IFLNK);
+}
+
+/*
+ * Visit each node in a directory hierarchy, in pre-order depth-first order.
+ */
+static void
+fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg)
+{
+	assert(root->type == S_IFDIR);
+
+	for (fsnode *cur = root; cur != NULL; cur = cur->next) {
+		if (!fsnode_valid(cur)) {
+			warnx("skipping unhandled %s %s/%s",
+			    inode_type(cur->type), cur->path, cur->name);
+			continue;
+		}
+		if (cb(cur, arg) == 0)
+			continue;
+		if (cur->type == S_IFDIR && cur->child != NULL)
+			fsnode_foreach(cur->child, cb, arg);
+	}
+}
+
+static void
+fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid)
+{
+	struct fs_populate_dir *dir;
+	uint64_t type;
+
+	switch (cur->type) {
+	case S_IFREG:
+		type = DT_REG;
+		break;
+	case S_IFDIR:
+		type = DT_DIR;
+		break;
+	case S_IFLNK:
+		type = DT_LNK;
+		break;
+	default:
+		assert(0);
+	}
+
+	dir = SLIST_FIRST(&arg->dirs);
+	zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid));
+}
+
+static void
+fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind,
+    size_t *szp)
+{
+	assert(ind < fs->sacnt);
+	assert(fs->saoffs[ind] != 0xffff);
+
+	memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size);
+	*szp += fs->satab[ind].size;
+}
+
+static void
+fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val,
+    size_t valsz, size_t varoff, uint16_t ind, size_t *szp)
+{
+	assert(ind < fs->sacnt);
+	assert(fs->saoffs[ind] != 0xffff);
+	assert(fs->satab[ind].size == 0);
+
+	memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz);
+	*szp += valsz;
+}
+
+/*
+ * Derive the relative fd/path combo needed to access a file.  Ideally we'd
+ * always be able to use relative lookups (i.e., use the *at() system calls),
+ * since they require less path translation and are more amenable to sandboxing,
+ * but the handling of multiple staging directories makes that difficult.  To
+ * make matters worse, we have no choice but to use relative lookups when
+ * dealing with an mtree manifest, so both mechanisms are implemented.
+ */
+static void
+fs_populate_path(const fsnode *cur, struct fs_populate_arg *arg,
+    char *path, size_t sz, int *dirfdp)
+{
+	if (cur->contents != NULL) {
+		size_t n;
+
+		*dirfdp = AT_FDCWD;
+		n = strlcpy(path, cur->contents, sz);
+		assert(n < sz);
+	} else if (cur->root == NULL) {
+		size_t n;
+
+		*dirfdp = SLIST_FIRST(&arg->dirs)->dirfd;
+		n = strlcpy(path, cur->name, sz);
+		assert(n < sz);
+	} else {
+		int n;
+
+		*dirfdp = AT_FDCWD;
+		n = snprintf(path, sz, "%s/%s/%s",
+		    cur->root, cur->path, cur->name);
+		assert(n >= 0);
+		assert((size_t)n < sz);
+	}
+}
+
+static int
+fs_open(const fsnode *cur, struct fs_populate_arg *arg, int flags)
+{
+	char path[PATH_MAX];
+	int fd;
+
+	fs_populate_path(cur, arg, path, sizeof(path), &fd);
+
+	fd = openat(fd, path, flags);
+	if (fd < 0)
+		err(1, "openat(%s)", path);
+	return (fd);
+}
+
+static int
+fs_open_can_fail(const fsnode *cur, struct fs_populate_arg *arg, int flags)
+{
+	int fd;
+	char path[PATH_MAX];
+
+	fs_populate_path(cur, arg, path, sizeof(path), &fd);
+
+	return (openat(fd, path, flags));
+}
+
+static void
+fs_readlink(const fsnode *cur, struct fs_populate_arg *arg,
+    char *buf, size_t bufsz)
+{
+	char path[PATH_MAX];
+	int fd;
+
+	if (cur->symlink != NULL) {
+		size_t n;
+
+		n = strlcpy(buf, cur->symlink, bufsz);
+		assert(n < bufsz);
+	} else {
+		ssize_t n;
+
+		fs_populate_path(cur, arg, path, sizeof(path), &fd);
+
+		n = readlinkat(fd, path, buf, bufsz - 1);
+		if (n == -1)
+			err(1, "readlinkat(%s)", cur->name);
+		buf[n] = '\0';
+	}
+}
+
+static void
+fs_populate_time(zfs_fs_t *fs, char *attrbuf, struct timespec *ts,
+    uint16_t ind, size_t *szp)
+{
+	uint64_t timebuf[2];
+
+	assert(ind < fs->sacnt);
+	assert(fs->saoffs[ind] != 0xffff);
+	assert(fs->satab[ind].size == sizeof(timebuf));
+
+	timebuf[0] = ts->tv_sec;
+	timebuf[1] = ts->tv_nsec;
+	fs_populate_attr(fs, attrbuf, timebuf, ind, szp);
+}
+
+static void
+fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur,
+    dnode_phys_t *dnode)
+{
+	char target[PATH_MAX];
+	zfs_fs_t *fs;
+	zfs_ace_hdr_t aces[3];
+	struct stat *sb;
+	sa_hdr_phys_t *sahdr;
+	uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid;
+	char *attrbuf;
+	size_t bonussz, hdrsz;
+	int layout;
+
+	assert(dnode->dn_bonustype == DMU_OT_SA);
+	assert(dnode->dn_nblkptr == 1);
+
+	fs = arg->fs;
+	sb = &cur->inode->st;
+
+	switch (cur->type) {
+	case S_IFREG:
+		layout = SA_LAYOUT_INDEX_DEFAULT;
+		links = cur->inode->nlink;
+		objsize = sb->st_size;
+		parent = SLIST_FIRST(&arg->dirs)->objid;
+		break;
+	case S_IFDIR:
+		layout = SA_LAYOUT_INDEX_DEFAULT;
+		links = 1; /* .. */
+		objsize = 1; /* .. */
+
+		if ((cur->inode->flags & FI_ROOT) == 0 ) {
+			/*
+			 * The size of a ZPL directory is the number of entries
+			 * (including "." and ".."), and the link count is the
+			 * number of entries which are directories
+			 * (including "." and "..").
+			 */
+			for (fsnode *c =
+			    fsnode_isroot(cur) ? cur->next : cur->child;
+			    c != NULL; c = c->next) {
+				switch (c->type) {
+				case S_IFDIR:
+					links++;
+					/* FALLTHROUGH */
+				case S_IFREG:
+				case S_IFLNK:
+					objsize++;
+					break;
+				}
+			}
+		} else {
+			/*
+			 * Root directory children do belong to
+			 * different dataset and this directory is
+			 * empty in the current objset.
+			 */
+			links++;        /* . */
+			objsize++;      /* . */
+		}
+
+		/* The root directory is its own parent. */
+		parent = SLIST_EMPTY(&arg->dirs) ?
+		    arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid;
+		break;
+	case S_IFLNK:
+		fs_readlink(cur, arg, target, sizeof(target));
+
+		layout = SA_LAYOUT_INDEX_SYMLINK;
+		links = 1;
+		objsize = strlen(target);
+		parent = SLIST_FIRST(&arg->dirs)->objid;
+		break;
+	default:
+		assert(0);
+	}
+
+	daclcount = nitems(aces);
+	flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_ARCHIVE |
+	    ZFS_AV_MODIFIED;
+	gen = 1;
+	gid = sb->st_gid;
+	mode = sb->st_mode;
+	uid = sb->st_uid;
+
+	memset(aces, 0, sizeof(aces));
+	aces[0].z_flags = ACE_OWNER;
+	aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+	aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER |
+	    ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL |
+	    ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
+	if ((mode & S_IRUSR) != 0)
+		aces[0].z_access_mask |= ACE_READ_DATA;
+	if ((mode & S_IWUSR) != 0)
+		aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
+	if ((mode & S_IXUSR) != 0)
+		aces[0].z_access_mask |= ACE_EXECUTE;
+
+	aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
+	aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+	aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
+	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
+	if ((mode & S_IRGRP) != 0)
+		aces[1].z_access_mask |= ACE_READ_DATA;
+	if ((mode & S_IWGRP) != 0)
+		aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
+	if ((mode & S_IXGRP) != 0)
+		aces[1].z_access_mask |= ACE_EXECUTE;
+
+	aces[2].z_flags = ACE_EVERYONE;
+	aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+	aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
+	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
+	if ((mode & S_IROTH) != 0)
+		aces[2].z_access_mask |= ACE_READ_DATA;
+	if ((mode & S_IWOTH) != 0)
+		aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
+	if ((mode & S_IXOTH) != 0)
+		aces[2].z_access_mask |= ACE_EXECUTE;
+
+	switch (layout) {
+	case SA_LAYOUT_INDEX_DEFAULT:
+		/* At most one variable-length attribute. */
+		hdrsz = sizeof(uint64_t);
+		break;
+	case SA_LAYOUT_INDEX_SYMLINK:
+		/* At most five variable-length attributes. */
+		hdrsz = sizeof(uint64_t) * 2;
+		break;
+	default:
+		assert(0);
+	}
+
+	sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode);
+	sahdr->sa_magic = SA_MAGIC;
+	SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz);
+
+	bonussz = SA_HDR_SIZE(sahdr);
+	attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr);
+
+	fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz);
+	fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz);
+	fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz);
+	fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz);
+	fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz);
+	fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz);
+	fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz);
+	fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz);
+	fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz);
+
+	/*
+	 * We deliberately set atime = mtime here to ensure that images are
+	 * reproducible.
+	 */
+	fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz);
+	fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz);
+	fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz);
+#ifdef __linux__
+	/* Linux has no st_birthtim; approximate with st_ctim */
+	fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CRTIME, &bonussz);
+#else
+	fs_populate_time(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz);
+#endif
+
+	fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0,
+	    ZPL_DACL_ACES, &bonussz);
+	sahdr->sa_lengths[0] = sizeof(aces);
+
+	if (cur->type == S_IFLNK) {
+		assert(layout == SA_LAYOUT_INDEX_SYMLINK);
+		/* Need to use a spill block pointer if the target is long. */
+		assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN);
+		fs_populate_varszattr(fs, attrbuf, target, objsize,
+		    sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz);
+		sahdr->sa_lengths[1] = (uint16_t)objsize;
+	}
+
+	dnode->dn_bonuslen = bonussz;
+}
+
+static void
+fs_populate_file(fsnode *cur, struct fs_populate_arg *arg)
+{
+	struct dnode_cursor *c;
+	dnode_phys_t *dnode;
+	zfs_opt_t *zfs;
+	char *buf;
+	uint64_t dnid;
+	ssize_t n;
+	size_t bufsz;
+	off_t nbytes, reqbytes, size;
+	int fd;
+
+	assert(cur->type == S_IFREG);
+	assert((cur->inode->flags & FI_ROOT) == 0);
+
+	zfs = arg->zfs;
+
+	assert(cur->inode->ino != 0);
+	if ((cur->inode->flags & FI_ALLOCATED) != 0) {
+		/*
+		 * This is a hard link of an existing file.
+		 *
+		 * XXX-MJ need to check whether it crosses datasets, add a test
+		 * case for that
+		 */
+		fs_populate_dirent(arg, cur, cur->inode->ino);
+		return;
+	}
+
+	dnode = objset_dnode_bonus_alloc(arg->fs->os,
+	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
+	cur->inode->ino = dnid;
+	cur->inode->flags |= FI_ALLOCATED;
+
+	fd = fs_open(cur, arg, O_RDONLY);
+
+	buf = zfs->filebuf;
+	bufsz = sizeof(zfs->filebuf);
+	size = cur->inode->st.st_size;
+	c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0);
+	for (off_t foff = 0; foff < size; foff += nbytes) {
+		off_t loc, sofar;
+
+		/*
+		 * Fill up our buffer, handling partial reads.
+		 */
+		sofar = 0;
+		nbytes = MIN(size - foff, (off_t)bufsz);
+		do {
+			n = read(fd, buf + sofar, nbytes);
+			if (n < 0)
+				err(1, "reading from '%s'", cur->name);
+			if (n == 0)
+				errx(1, "unexpected EOF reading '%s'",
+				    cur->name);
+			sofar += n;
+		} while (sofar < nbytes);
+
+		if (nbytes < (off_t)bufsz)
+			memset(buf + nbytes, 0, bufsz - nbytes);
+
+		reqbytes = foff == 0 ? nbytes : MAXBLOCKSIZE;
+		loc = objset_space_alloc(zfs, arg->fs->os, &reqbytes);
+		vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, reqbytes, loc,
+		    dnode_cursor_next(zfs, c, foff));
+	}
+	eclose(fd);
+	dnode_cursor_finish(zfs, c);
+
+	fs_populate_sattrs(arg, cur, dnode);
+	fs_populate_dirent(arg, cur, dnid);
+}
+
+static void
+fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg)
+{
+	dnode_phys_t *dnode;
+	zfs_objset_t *os;
+	uint64_t dnid;
+	int dirfd;
+
+	assert(cur->type == S_IFDIR);
+	assert((cur->inode->flags & FI_ALLOCATED) == 0);
+
+	os = arg->fs->os;
+
+	dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS,
+	    DMU_OT_SA, 0, &dnid);
+
+	/*
+	 * Add an entry to the parent directory and open this directory.
+	 */
+	if (!SLIST_EMPTY(&arg->dirs)) {
+		fs_populate_dirent(arg, cur, dnid);
+		/*
+		 * We only need the directory fd if we're finding files in
+		 * it.  If it's just there for other directories or
+		 * files using contents= we don't need to succeed here.
+		 */
+		dirfd = fs_open_can_fail(cur, arg, O_DIRECTORY | O_RDONLY);
+	} else {
+		arg->rootdirid = dnid;
+		dirfd = arg->rootdirfd;
+		arg->rootdirfd = -1;
+	}
+
+	/*
+	 * Set ZPL attributes.
+	 */
+	fs_populate_sattrs(arg, cur, dnode);
+
+	/*
+	 * If this is a root directory, then its children belong to a different
+	 * dataset and this directory remains empty in the current objset.
+	 */
+	if ((cur->inode->flags & FI_ROOT) == 0) {
+		struct fs_populate_dir *dir;
+
+		dir = ecalloc(1, sizeof(*dir));
+		dir->dirfd = dirfd;
+		dir->objid = dnid;
+		dir->zap = zap_alloc(os, dnode);
+		SLIST_INSERT_HEAD(&arg->dirs, dir, next);
+	} else {
+		zap_write(arg->zfs, zap_alloc(os, dnode));
+		fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd);
+	}
+}
+
+static void
+fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg)
+{
+	dnode_phys_t *dnode;
+	uint64_t dnid;
+
+	assert(cur->type == S_IFLNK);
+	assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0);
+
+	dnode = objset_dnode_bonus_alloc(arg->fs->os,
+	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
+
+	fs_populate_dirent(arg, cur, dnid);
+
+	fs_populate_sattrs(arg, cur, dnode);
+}
+
+static fsnode *
+fsnode_next(fsnode *cur)
+{
+	for (cur = cur->next; cur != NULL; cur = cur->next) {
+		if (fsnode_valid(cur))
+			return (cur);
+	}
+	return (NULL);
+}
+
+static int
+fs_foreach_populate(fsnode *cur, void *_arg)
+{
+	struct fs_populate_arg *arg;
+	struct fs_populate_dir *dir;
+	int ret;
+
+	arg = _arg;
+	switch (cur->type) {
+	case S_IFREG:
+		fs_populate_file(cur, arg);
+		break;
+	case S_IFDIR:
+		if (fsnode_isroot(cur))
+			break;
+		fs_populate_dir(cur, arg);
+		break;
+	case S_IFLNK:
+		fs_populate_symlink(cur, arg);
+		break;
+	default:
+		assert(0);
+	}
+
+	ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1;
+
+	if (fsnode_next(cur) == NULL &&
+	    (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) {
+		/*
+		 * We reached a terminal node in a subtree.  Walk back up and
+		 * write out directories.  We're done once we hit the root of a
+		 * dataset or find a level where we're not on the edge of the
+		 * tree.
+		 */
+		do {
+			dir = SLIST_FIRST(&arg->dirs);
+			SLIST_REMOVE_HEAD(&arg->dirs, next);
+			zap_write(arg->zfs, dir->zap);
+			if (dir->dirfd != -1)
+				eclose(dir->dirfd);
+			free(dir);
+			cur = cur->parent;
+		} while (cur != NULL && fsnode_next(cur) == NULL &&
+		    (cur->inode->flags & FI_ROOT) == 0);
+	}
+
+	return (ret);
+}
+
+static void
+fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index,
+    const sa_attr_type_t layout[], size_t sacnt)
+{
+	char ti[16];
+
+	assert(sizeof(layout[0]) == 2);
+
+	(void)snprintf(ti, sizeof(ti), "%u", index);
+	zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt,
+	    (const uint8_t *)layout);
+}
+
+/*
+ * Initialize system attribute tables.
+ *
+ * There are two elements to this.  First, we write the zpl_attrs[] and
+ * zpl_attr_layout[] tables to disk.  Then we create a lookup table which
+ * allows us to set file attributes quickly.
+ */
+static uint64_t
+fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs)
+{
+	zfs_zap_t *sazap, *salzap, *sarzap;
+	zfs_objset_t *os;
+	dnode_phys_t *saobj, *salobj, *sarobj;
+	uint64_t saobjid, salobjid, sarobjid;
+	uint16_t offset;
+
+	os = fs->os;
+
+	/*
+	 * The on-disk tables are stored in two ZAP objects, the registry object
+	 * and the layout object.  Individual attributes are described by
+	 * entries in the registry object; for example, the value for the
+	 * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute.
+	 * The attributes of a file are ordered according to one of the layouts
+	 * defined in the layout object.  The master node object is simply used
+	 * to locate the registry and layout objects.
+	 */
+	saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid);
+	salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid);
+	sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid);
+
+	sarzap = zap_alloc(os, sarobj);
+	for (size_t i = 0; i < nitems(zpl_attrs); i++) {
+		const zfs_sattr_t *sa;
+		uint64_t attr;
+
+		attr = 0;
+		sa = &zpl_attrs[i];
+		SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs);
+		zap_add_uint64(sarzap, sa->name, attr);
+	}
+	zap_write(zfs, sarzap);
+
+	/*
+	 * Layouts are arrays of indices into the registry.  We define two
+	 * layouts for use by the ZPL, one for non-symlinks and one for
+	 * symlinks.  They are identical except that the symlink layout includes
+	 * ZPL_SYMLINK as its final attribute.
+	 */
+	salzap = zap_alloc(os, salobj);
+	assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK);
+	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT,
+	    zpl_attr_layout, nitems(zpl_attr_layout) - 1);
+	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK,
+	    zpl_attr_layout, nitems(zpl_attr_layout));
+	zap_write(zfs, salzap);
+
+	sazap = zap_alloc(os, saobj);
+	zap_add_uint64(sazap, SA_LAYOUTS, salobjid);
+	zap_add_uint64(sazap, SA_REGISTRY, sarobjid);
+	zap_write(zfs, sazap);
+
+	/* Sanity check. */
+	for (size_t i = 0; i < nitems(zpl_attrs); i++)
+		assert(i == zpl_attrs[i].id);
+
+	/*
+	 * Build the offset table used when setting file attributes.  File
+	 * attributes are stored in the object's bonus buffer; this table
+	 * provides the buffer offset of attributes referenced by the layout
+	 * table.
+	 */
+	fs->sacnt = nitems(zpl_attrs);
+	fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs));
+	for (size_t i = 0; i < fs->sacnt; i++)
+		fs->saoffs[i] = 0xffff;
+	offset = 0;
+	for (size_t i = 0; i < nitems(zpl_attr_layout); i++) {
+		uint16_t size;
+
+		assert(zpl_attr_layout[i] < fs->sacnt);
+
+		fs->saoffs[zpl_attr_layout[i]] = offset;
+		size = zpl_attrs[zpl_attr_layout[i]].size;
+		offset += size;
+	}
+	fs->satab = zpl_attrs;
+
+	return (saobjid);
+}
+
+static void
+fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg)
+{
+	char *mountpoint, *origmountpoint, *name, *next;
+	fsnode *cur, *root;
+	uint64_t canmount;
+
+	if (!dsl_dir_has_dataset(dsldir))
+		return;
+
+	if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0)
+		return;
+	mountpoint = dsl_dir_get_mountpoint(zfs, dsldir);
+	if (mountpoint == NULL)
+		return;
+
+	/*
+	 * If we were asked to specify a bootfs, set it here.
+	 */
+	if (zfs->bootfs != NULL && strcmp(zfs->bootfs,
+	    dsl_dir_fullname(dsldir)) == 0) {
+		zap_add_uint64(zfs->poolprops, "bootfs",
+		    dsl_dir_dataset_id(dsldir));
+	}
+
+	origmountpoint = mountpoint;
+
+	/*
+	 * Figure out which fsnode corresponds to our mountpoint.
+	 */
+	root = arg;
+	cur = root;
+	if (strcmp(mountpoint, zfs->rootpath) != 0) {
+		mountpoint += strlen(zfs->rootpath);
+
+		/*
+		 * Look up the directory in the staged tree.  For example, if
+		 * the dataset's mount point is /foo/bar/baz, we'll search the
+		 * root directory for "foo", search "foo" for "baz", and so on.
+		 * Each intermediate name must refer to a directory; the final
+		 * component need not exist.
+		 */
+		cur = root;
+		for (next = name = mountpoint; next != NULL;) {
+			for (; *next == '/'; next++)
+				;
+			name = strsep(&next, "/");
+
+			for (; cur != NULL && strcmp(cur->name, name) != 0;
+			    cur = cur->next)
+				;
+			if (cur == NULL) {
+				if (next == NULL)
+					break;
+				errx(1, "missing mountpoint directory for `%s'",
+				    dsl_dir_fullname(dsldir));
+			}
+			if (cur->type != S_IFDIR) {
+				errx(1,
+				    "mountpoint for `%s' is not a directory",
+				    dsl_dir_fullname(dsldir));
+			}
+			if (next != NULL)
+				cur = cur->child;
+		}
+	}
+
+	if (cur != NULL) {
+		assert(cur->type == S_IFDIR);
+
+		/*
+		 * Multiple datasets shouldn't share a mountpoint.  It's
+		 * technically allowed, but it's not clear what makefs should do
+		 * in that case.
+		 */
+		assert((cur->inode->flags & FI_ROOT) == 0);
+		if (cur != root)
+			cur->inode->flags |= FI_ROOT;
+		assert(cur->inode->param == NULL);
+		cur->inode->param = dsldir;
+	}
+
+	free(origmountpoint);
+}
+
+static int
+fs_foreach_mark(fsnode *cur, void *arg)
+{
+	uint64_t *countp;
+
+	countp = arg;
+	if (cur->type == S_IFDIR && fsnode_isroot(cur))
+		return (1);
+
+	if (cur->inode->ino == 0) {
+		cur->inode->ino = ++(*countp);
+		cur->inode->nlink = 1;
+	} else {
+		cur->inode->nlink++;
+	}
+
+	return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1);
+}
+
+/*
+ * Create a filesystem dataset.  More specifically:
+ * - create an object set for the dataset,
+ * - add required metadata (SA tables, property definitions, etc.) to that
+ *   object set,
+ * - optionally populate the object set with file objects, using "root" as the
+ *   root directory.
+ *
+ * "dirfd" is a directory descriptor for the directory referenced by "root".  It
+ * is closed before returning.
+ */
+static void
+fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd)
+{
+	struct fs_populate_arg arg;
+	zfs_fs_t fs;
+	zfs_zap_t *masterzap;
+	zfs_objset_t *os;
+	dnode_phys_t *deleteq, *masterobj;
+	uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid;
+	bool fakedroot;
+
+	/*
+	 * This dataset's mountpoint doesn't exist in the staging tree, or the
+	 * dataset doesn't have a mountpoint at all.  In either case we still
+	 * need a root directory.  Fake up a root fsnode to handle this case.
+	 */
+	fakedroot = root == NULL;
+	if (fakedroot) {
+		struct stat *stp;
+
+		assert(dirfd == -1);
+
+		root = ecalloc(1, sizeof(*root));
+		root->inode = ecalloc(1, sizeof(*root->inode));
+		root->name = estrdup(".");
+		root->type = S_IFDIR;
+
+		stp = &root->inode->st;
+		stp->st_uid = 0;
+		stp->st_gid = 0;
+		stp->st_mode = S_IFDIR | 0755;
+	}
+	assert(root->type == S_IFDIR);
+	assert(fsnode_isroot(root));
+
+	/*
+	 * Initialize the object set for this dataset.
+	 */
+	os = objset_alloc(zfs, DMU_OST_ZFS);
+	masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid);
+	assert(moid == MASTER_NODE_OBJ);
+
+	memset(&fs, 0, sizeof(fs));
+	fs.os = os;
+
+	/*
+	 * Create the ZAP SA layout now since filesystem object dnodes will
+	 * refer to those attributes.
+	 */
+	saobjid = fs_set_zpl_attrs(zfs, &fs);
+
+	/*
+	 * Make a pass over the staged directory to detect hard links and assign
+	 * virtual dnode numbers.
+	 */
+	dnodecount = 1; /* root directory */
+	fsnode_foreach(root, fs_foreach_mark, &dnodecount);
+
+	/*
+	 * Make a second pass to populate the dataset with files from the
+	 * staged directory.  Most of our runtime is spent here.
+	 */
+	arg.rootdirfd = dirfd;
+	arg.zfs = zfs;
+	arg.fs = &fs;
+	SLIST_INIT(&arg.dirs);
+	fs_populate_dir(root, &arg);
+	assert(!SLIST_EMPTY(&arg.dirs));
+	fsnode_foreach(root, fs_foreach_populate, &arg);
+	assert(SLIST_EMPTY(&arg.dirs));
+	rootdirid = arg.rootdirid;
+
+	/*
+	 * Create an empty delete queue.  We don't do anything with it, but
+	 * OpenZFS will refuse to mount filesystems that don't have one.
+	 */
+	deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid);
+	zap_write(zfs, zap_alloc(os, deleteq));
+
+	/*
+	 * Populate and write the master node object.  This is a ZAP object
+	 * containing various dataset properties and the object IDs of the root
+	 * directory and delete queue.
+	 */
+	masterzap = zap_alloc(os, masterobj);
+	zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid);
+	zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid);
+	zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid);
+	zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */);
+	zap_add_uint64(masterzap, "normalization", 0 /* off */);
+	zap_add_uint64(masterzap, "utf8only", 0 /* off */);
+	zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */);
+	zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */);
+	zap_write(zfs, masterzap);
+
+	/*
+	 * All finished with this object set, we may as well write it now.
+	 * The DSL layer will sum up the bytes consumed by each dataset using
+	 * information stored in the object set, so it can't be freed just yet.
+	 */
+	dsl_dir_dataset_write(zfs, os, dsldir);
+
+	if (fakedroot) {
+		free(root->inode);
+		free(root->name);
+		free(root);
+	}
+	free(fs.saoffs);
+}
+
+/*
+ * Create an object set for each DSL directory which has a dataset and doesn't
+ * already have an object set.
+ */
+static void
+fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused)
+{
+	if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir))
+		fs_build_one(zfs, dsldir, NULL, -1);
+}
+
+/*
+ * Create our datasets and populate them with files.
+ */
+void
+fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root)
+{
+	/*
+	 * Run through our datasets and find the root fsnode for each one.  Each
+	 * root fsnode is flagged so that we can figure out which dataset it
+	 * belongs to.
+	 */
+	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root);
+
+	/*
+	 * Did we find our boot filesystem?
+	 */
+	if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs"))
+		errx(1, "no mounted dataset matches bootfs property `%s'",
+		    zfs->bootfs);
+
+	/*
+	 * Traverse the file hierarchy starting from the root fsnode.  One
+	 * dataset, not necessarily the root dataset, must "own" the root
+	 * directory by having its mountpoint be equal to the root path.
+	 *
+	 * As roots of other datasets are encountered during the traversal,
+	 * fs_build_one() recursively creates the corresponding object sets and
+	 * populates them.  Once this function has returned, all datasets will
+	 * have been fully populated.
+	 */
+	fs_build_one(zfs, root->inode->param, root, dirfd);
+
+	/*
+	 * Now create object sets for datasets whose mountpoints weren't found
+	 * in the staging directory, either because there is no mountpoint, or
+	 * because the mountpoint doesn't correspond to an existing directory.
+	 */
+	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL);
+}