diff options
Diffstat (limited to 'usr.sbin/makefs/zfs/fs.c')
-rw-r--r-- | usr.sbin/makefs/zfs/fs.c | 1120 |
1 files changed, 1120 insertions, 0 deletions
diff --git a/usr.sbin/makefs/zfs/fs.c b/usr.sbin/makefs/zfs/fs.c new file mode 100644 index 000000000000..75f6e30e1500 --- /dev/null +++ b/usr.sbin/makefs/zfs/fs.c @@ -0,0 +1,1120 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/stat.h> + +#include <assert.h> +#include <dirent.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <util.h> + +#include "makefs.h" +#include "zfs.h" + +typedef struct { + const char *name; + unsigned int id; + uint16_t size; + sa_bswap_type_t bs; +} zfs_sattr_t; + +typedef struct zfs_fs { + zfs_objset_t *os; + + /* Offset table for system attributes, indexed by a zpl_attr_t. */ + uint16_t *saoffs; + size_t sacnt; + const zfs_sattr_t *satab; +} zfs_fs_t; + +/* + * The order of the attributes doesn't matter, this is simply the one hard-coded + * by OpenZFS, based on a zdb dump of the SA_REGISTRY table. + */ +typedef enum zpl_attr { + ZPL_ATIME, + ZPL_MTIME, + ZPL_CTIME, + ZPL_CRTIME, + ZPL_GEN, + ZPL_MODE, + ZPL_SIZE, + ZPL_PARENT, + ZPL_LINKS, + ZPL_XATTR, + ZPL_RDEV, + ZPL_FLAGS, + ZPL_UID, + ZPL_GID, + ZPL_PAD, + ZPL_ZNODE_ACL, + ZPL_DACL_COUNT, + ZPL_SYMLINK, + ZPL_SCANSTAMP, + ZPL_DACL_ACES, + ZPL_DXATTR, + ZPL_PROJID, +} zpl_attr_t; + +/* + * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t. + */ +static const zfs_sattr_t zpl_attrs[] = { +#define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b } + _ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY), + _ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY), + _ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL), + _ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY), + _ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY), +#undef ZPL_ATTR +}; + +/* + * This layout matches that of a filesystem created using OpenZFS on FreeBSD. + * It need not match in general, but FreeBSD's loader doesn't bother parsing the + * layout and just hard-codes attribute offsets. + */ +static const sa_attr_type_t zpl_attr_layout[] = { + ZPL_MODE, + ZPL_SIZE, + ZPL_GEN, + ZPL_UID, + ZPL_GID, + ZPL_PARENT, + ZPL_FLAGS, + ZPL_ATIME, + ZPL_MTIME, + ZPL_CTIME, + ZPL_CRTIME, + ZPL_LINKS, + ZPL_DACL_COUNT, + ZPL_DACL_ACES, + ZPL_SYMLINK, +}; + +/* + * Keys for the ZPL attribute tables in the SA layout ZAP. The first two + * indices are reserved for legacy attribute encoding. + */ +#define SA_LAYOUT_INDEX_DEFAULT 2 +#define SA_LAYOUT_INDEX_SYMLINK 3 + +struct fs_populate_dir { + SLIST_ENTRY(fs_populate_dir) next; + int dirfd; + uint64_t objid; + zfs_zap_t *zap; +}; + +struct fs_populate_arg { + zfs_opt_t *zfs; + zfs_fs_t *fs; /* owning filesystem */ + uint64_t rootdirid; /* root directory dnode ID */ + int rootdirfd; /* root directory fd */ + SLIST_HEAD(, fs_populate_dir) dirs; /* stack of directories */ +}; + +static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int); + +static void +eclose(int fd) +{ + if (close(fd) != 0) + err(1, "close"); +} + +static bool +fsnode_isroot(const fsnode *cur) +{ + return (strcmp(cur->name, ".") == 0); +} + +static bool +fsnode_valid(const fsnode *cur) +{ + return (cur->type == S_IFREG || cur->type == S_IFDIR || + cur->type == S_IFLNK); +} + +/* + * Visit each node in a directory hierarchy, in pre-order depth-first order. + */ +static void +fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg) +{ + assert(root->type == S_IFDIR); + + for (fsnode *cur = root; cur != NULL; cur = cur->next) { + if (!fsnode_valid(cur)) { + warnx("skipping unhandled %s %s/%s", + inode_type(cur->type), cur->path, cur->name); + continue; + } + if (cb(cur, arg) == 0) + continue; + if (cur->type == S_IFDIR && cur->child != NULL) + fsnode_foreach(cur->child, cb, arg); + } +} + +static void +fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid) +{ + struct fs_populate_dir *dir; + uint64_t type; + + switch (cur->type) { + case S_IFREG: + type = DT_REG; + break; + case S_IFDIR: + type = DT_DIR; + break; + case S_IFLNK: + type = DT_LNK; + break; + default: + assert(0); + } + + dir = SLIST_FIRST(&arg->dirs); + zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid)); +} + +static void +fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind, + size_t *szp) +{ + assert(ind < fs->sacnt); + assert(fs->saoffs[ind] != 0xffff); + + memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size); + *szp += fs->satab[ind].size; +} + +static void +fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val, + size_t valsz, size_t varoff, uint16_t ind, size_t *szp) +{ + assert(ind < fs->sacnt); + assert(fs->saoffs[ind] != 0xffff); + assert(fs->satab[ind].size == 0); + + memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz); + *szp += valsz; +} + +/* + * Derive the relative fd/path combo needed to access a file. Ideally we'd + * always be able to use relative lookups (i.e., use the *at() system calls), + * since they require less path translation and are more amenable to sandboxing, + * but the handling of multiple staging directories makes that difficult. To + * make matters worse, we have no choice but to use relative lookups when + * dealing with an mtree manifest, so both mechanisms are implemented. + */ +static void +fs_populate_path(const fsnode *cur, struct fs_populate_arg *arg, + char *path, size_t sz, int *dirfdp) +{ + if (cur->contents != NULL) { + size_t n; + + *dirfdp = AT_FDCWD; + n = strlcpy(path, cur->contents, sz); + assert(n < sz); + } else if (cur->root == NULL) { + size_t n; + + *dirfdp = SLIST_FIRST(&arg->dirs)->dirfd; + n = strlcpy(path, cur->name, sz); + assert(n < sz); + } else { + int n; + + *dirfdp = AT_FDCWD; + n = snprintf(path, sz, "%s/%s/%s", + cur->root, cur->path, cur->name); + assert(n >= 0); + assert((size_t)n < sz); + } +} + +static int +fs_open(const fsnode *cur, struct fs_populate_arg *arg, int flags) +{ + char path[PATH_MAX]; + int fd; + + fs_populate_path(cur, arg, path, sizeof(path), &fd); + + fd = openat(fd, path, flags); + if (fd < 0) + err(1, "openat(%s)", path); + return (fd); +} + +static int +fs_open_can_fail(const fsnode *cur, struct fs_populate_arg *arg, int flags) +{ + int fd; + char path[PATH_MAX]; + + fs_populate_path(cur, arg, path, sizeof(path), &fd); + + return (openat(fd, path, flags)); +} + +static void +fs_readlink(const fsnode *cur, struct fs_populate_arg *arg, + char *buf, size_t bufsz) +{ + char path[PATH_MAX]; + int fd; + + if (cur->symlink != NULL) { + size_t n; + + n = strlcpy(buf, cur->symlink, bufsz); + assert(n < bufsz); + } else { + ssize_t n; + + fs_populate_path(cur, arg, path, sizeof(path), &fd); + + n = readlinkat(fd, path, buf, bufsz - 1); + if (n == -1) + err(1, "readlinkat(%s)", cur->name); + buf[n] = '\0'; + } +} + +static void +fs_populate_time(zfs_fs_t *fs, char *attrbuf, struct timespec *ts, + uint16_t ind, size_t *szp) +{ + uint64_t timebuf[2]; + + assert(ind < fs->sacnt); + assert(fs->saoffs[ind] != 0xffff); + assert(fs->satab[ind].size == sizeof(timebuf)); + + timebuf[0] = ts->tv_sec; + timebuf[1] = ts->tv_nsec; + fs_populate_attr(fs, attrbuf, timebuf, ind, szp); +} + +static void +fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur, + dnode_phys_t *dnode) +{ + char target[PATH_MAX]; + zfs_fs_t *fs; + zfs_ace_hdr_t aces[3]; + struct stat *sb; + sa_hdr_phys_t *sahdr; + uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid; + char *attrbuf; + size_t bonussz, hdrsz; + int layout; + + assert(dnode->dn_bonustype == DMU_OT_SA); + assert(dnode->dn_nblkptr == 1); + + fs = arg->fs; + sb = &cur->inode->st; + + switch (cur->type) { + case S_IFREG: + layout = SA_LAYOUT_INDEX_DEFAULT; + links = cur->inode->nlink; + objsize = sb->st_size; + parent = SLIST_FIRST(&arg->dirs)->objid; + break; + case S_IFDIR: + layout = SA_LAYOUT_INDEX_DEFAULT; + links = 1; /* .. */ + objsize = 1; /* .. */ + + if ((cur->inode->flags & FI_ROOT) == 0 ) { + /* + * The size of a ZPL directory is the number of entries + * (including "." and ".."), and the link count is the + * number of entries which are directories + * (including "." and ".."). + */ + for (fsnode *c = + fsnode_isroot(cur) ? cur->next : cur->child; + c != NULL; c = c->next) { + switch (c->type) { + case S_IFDIR: + links++; + /* FALLTHROUGH */ + case S_IFREG: + case S_IFLNK: + objsize++; + break; + } + } + } else { + /* + * Root directory children do belong to + * different dataset and this directory is + * empty in the current objset. + */ + links++; /* . */ + objsize++; /* . */ + } + + /* The root directory is its own parent. */ + parent = SLIST_EMPTY(&arg->dirs) ? + arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid; + break; + case S_IFLNK: + fs_readlink(cur, arg, target, sizeof(target)); + + layout = SA_LAYOUT_INDEX_SYMLINK; + links = 1; + objsize = strlen(target); + parent = SLIST_FIRST(&arg->dirs)->objid; + break; + default: + assert(0); + } + + daclcount = nitems(aces); + flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_ARCHIVE | + ZFS_AV_MODIFIED; + gen = 1; + gid = sb->st_gid; + mode = sb->st_mode; + uid = sb->st_uid; + + memset(aces, 0, sizeof(aces)); + aces[0].z_flags = ACE_OWNER; + aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER | + ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL | + ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; + if ((mode & S_IRUSR) != 0) + aces[0].z_access_mask |= ACE_READ_DATA; + if ((mode & S_IWUSR) != 0) + aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; + if ((mode & S_IXUSR) != 0) + aces[0].z_access_mask |= ACE_EXECUTE; + + aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP; + aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | + ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; + if ((mode & S_IRGRP) != 0) + aces[1].z_access_mask |= ACE_READ_DATA; + if ((mode & S_IWGRP) != 0) + aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; + if ((mode & S_IXGRP) != 0) + aces[1].z_access_mask |= ACE_EXECUTE; + + aces[2].z_flags = ACE_EVERYONE; + aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | + ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; + if ((mode & S_IROTH) != 0) + aces[2].z_access_mask |= ACE_READ_DATA; + if ((mode & S_IWOTH) != 0) + aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; + if ((mode & S_IXOTH) != 0) + aces[2].z_access_mask |= ACE_EXECUTE; + + switch (layout) { + case SA_LAYOUT_INDEX_DEFAULT: + /* At most one variable-length attribute. */ + hdrsz = sizeof(uint64_t); + break; + case SA_LAYOUT_INDEX_SYMLINK: + /* At most five variable-length attributes. */ + hdrsz = sizeof(uint64_t) * 2; + break; + default: + assert(0); + } + + sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode); + sahdr->sa_magic = SA_MAGIC; + SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz); + + bonussz = SA_HDR_SIZE(sahdr); + attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr); + + fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz); + fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz); + fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz); + fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz); + fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz); + fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz); + fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz); + fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz); + fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz); + + /* + * We deliberately set atime = mtime here to ensure that images are + * reproducible. + */ + fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz); + fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz); + fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz); +#ifdef __linux__ + /* Linux has no st_birthtim; approximate with st_ctim */ + fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CRTIME, &bonussz); +#else + fs_populate_time(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz); +#endif + + fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0, + ZPL_DACL_ACES, &bonussz); + sahdr->sa_lengths[0] = sizeof(aces); + + if (cur->type == S_IFLNK) { + assert(layout == SA_LAYOUT_INDEX_SYMLINK); + /* Need to use a spill block pointer if the target is long. */ + assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN); + fs_populate_varszattr(fs, attrbuf, target, objsize, + sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz); + sahdr->sa_lengths[1] = (uint16_t)objsize; + } + + dnode->dn_bonuslen = bonussz; +} + +static void +fs_populate_file(fsnode *cur, struct fs_populate_arg *arg) +{ + struct dnode_cursor *c; + dnode_phys_t *dnode; + zfs_opt_t *zfs; + char *buf; + uint64_t dnid; + ssize_t n; + size_t bufsz; + off_t nbytes, reqbytes, size; + int fd; + + assert(cur->type == S_IFREG); + assert((cur->inode->flags & FI_ROOT) == 0); + + zfs = arg->zfs; + + assert(cur->inode->ino != 0); + if ((cur->inode->flags & FI_ALLOCATED) != 0) { + /* + * This is a hard link of an existing file. + * + * XXX-MJ need to check whether it crosses datasets, add a test + * case for that + */ + fs_populate_dirent(arg, cur, cur->inode->ino); + return; + } + + dnode = objset_dnode_bonus_alloc(arg->fs->os, + DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); + cur->inode->ino = dnid; + cur->inode->flags |= FI_ALLOCATED; + + fd = fs_open(cur, arg, O_RDONLY); + + buf = zfs->filebuf; + bufsz = sizeof(zfs->filebuf); + size = cur->inode->st.st_size; + c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0); + for (off_t foff = 0; foff < size; foff += nbytes) { + off_t loc, sofar; + + /* + * Fill up our buffer, handling partial reads. + */ + sofar = 0; + nbytes = MIN(size - foff, (off_t)bufsz); + do { + n = read(fd, buf + sofar, nbytes); + if (n < 0) + err(1, "reading from '%s'", cur->name); + if (n == 0) + errx(1, "unexpected EOF reading '%s'", + cur->name); + sofar += n; + } while (sofar < nbytes); + + if (nbytes < (off_t)bufsz) + memset(buf + nbytes, 0, bufsz - nbytes); + + reqbytes = foff == 0 ? nbytes : MAXBLOCKSIZE; + loc = objset_space_alloc(zfs, arg->fs->os, &reqbytes); + vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, reqbytes, loc, + dnode_cursor_next(zfs, c, foff)); + } + eclose(fd); + dnode_cursor_finish(zfs, c); + + fs_populate_sattrs(arg, cur, dnode); + fs_populate_dirent(arg, cur, dnid); +} + +static void +fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg) +{ + dnode_phys_t *dnode; + zfs_objset_t *os; + uint64_t dnid; + int dirfd; + + assert(cur->type == S_IFDIR); + assert((cur->inode->flags & FI_ALLOCATED) == 0); + + os = arg->fs->os; + + dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS, + DMU_OT_SA, 0, &dnid); + + /* + * Add an entry to the parent directory and open this directory. + */ + if (!SLIST_EMPTY(&arg->dirs)) { + fs_populate_dirent(arg, cur, dnid); + /* + * We only need the directory fd if we're finding files in + * it. If it's just there for other directories or + * files using contents= we don't need to succeed here. + */ + dirfd = fs_open_can_fail(cur, arg, O_DIRECTORY | O_RDONLY); + } else { + arg->rootdirid = dnid; + dirfd = arg->rootdirfd; + arg->rootdirfd = -1; + } + + /* + * Set ZPL attributes. + */ + fs_populate_sattrs(arg, cur, dnode); + + /* + * If this is a root directory, then its children belong to a different + * dataset and this directory remains empty in the current objset. + */ + if ((cur->inode->flags & FI_ROOT) == 0) { + struct fs_populate_dir *dir; + + dir = ecalloc(1, sizeof(*dir)); + dir->dirfd = dirfd; + dir->objid = dnid; + dir->zap = zap_alloc(os, dnode); + SLIST_INSERT_HEAD(&arg->dirs, dir, next); + } else { + zap_write(arg->zfs, zap_alloc(os, dnode)); + fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd); + } +} + +static void +fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg) +{ + dnode_phys_t *dnode; + uint64_t dnid; + + assert(cur->type == S_IFLNK); + assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0); + + dnode = objset_dnode_bonus_alloc(arg->fs->os, + DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); + + fs_populate_dirent(arg, cur, dnid); + + fs_populate_sattrs(arg, cur, dnode); +} + +static fsnode * +fsnode_next(fsnode *cur) +{ + for (cur = cur->next; cur != NULL; cur = cur->next) { + if (fsnode_valid(cur)) + return (cur); + } + return (NULL); +} + +static int +fs_foreach_populate(fsnode *cur, void *_arg) +{ + struct fs_populate_arg *arg; + struct fs_populate_dir *dir; + int ret; + + arg = _arg; + switch (cur->type) { + case S_IFREG: + fs_populate_file(cur, arg); + break; + case S_IFDIR: + if (fsnode_isroot(cur)) + break; + fs_populate_dir(cur, arg); + break; + case S_IFLNK: + fs_populate_symlink(cur, arg); + break; + default: + assert(0); + } + + ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1; + + if (fsnode_next(cur) == NULL && + (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) { + /* + * We reached a terminal node in a subtree. Walk back up and + * write out directories. We're done once we hit the root of a + * dataset or find a level where we're not on the edge of the + * tree. + */ + do { + dir = SLIST_FIRST(&arg->dirs); + SLIST_REMOVE_HEAD(&arg->dirs, next); + zap_write(arg->zfs, dir->zap); + if (dir->dirfd != -1) + eclose(dir->dirfd); + free(dir); + cur = cur->parent; + } while (cur != NULL && fsnode_next(cur) == NULL && + (cur->inode->flags & FI_ROOT) == 0); + } + + return (ret); +} + +static void +fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index, + const sa_attr_type_t layout[], size_t sacnt) +{ + char ti[16]; + + assert(sizeof(layout[0]) == 2); + + (void)snprintf(ti, sizeof(ti), "%u", index); + zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt, + (const uint8_t *)layout); +} + +/* + * Initialize system attribute tables. + * + * There are two elements to this. First, we write the zpl_attrs[] and + * zpl_attr_layout[] tables to disk. Then we create a lookup table which + * allows us to set file attributes quickly. + */ +static uint64_t +fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs) +{ + zfs_zap_t *sazap, *salzap, *sarzap; + zfs_objset_t *os; + dnode_phys_t *saobj, *salobj, *sarobj; + uint64_t saobjid, salobjid, sarobjid; + uint16_t offset; + + os = fs->os; + + /* + * The on-disk tables are stored in two ZAP objects, the registry object + * and the layout object. Individual attributes are described by + * entries in the registry object; for example, the value for the + * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute. + * The attributes of a file are ordered according to one of the layouts + * defined in the layout object. The master node object is simply used + * to locate the registry and layout objects. + */ + saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid); + salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid); + sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid); + + sarzap = zap_alloc(os, sarobj); + for (size_t i = 0; i < nitems(zpl_attrs); i++) { + const zfs_sattr_t *sa; + uint64_t attr; + + attr = 0; + sa = &zpl_attrs[i]; + SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs); + zap_add_uint64(sarzap, sa->name, attr); + } + zap_write(zfs, sarzap); + + /* + * Layouts are arrays of indices into the registry. We define two + * layouts for use by the ZPL, one for non-symlinks and one for + * symlinks. They are identical except that the symlink layout includes + * ZPL_SYMLINK as its final attribute. + */ + salzap = zap_alloc(os, salobj); + assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK); + fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT, + zpl_attr_layout, nitems(zpl_attr_layout) - 1); + fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK, + zpl_attr_layout, nitems(zpl_attr_layout)); + zap_write(zfs, salzap); + + sazap = zap_alloc(os, saobj); + zap_add_uint64(sazap, SA_LAYOUTS, salobjid); + zap_add_uint64(sazap, SA_REGISTRY, sarobjid); + zap_write(zfs, sazap); + + /* Sanity check. */ + for (size_t i = 0; i < nitems(zpl_attrs); i++) + assert(i == zpl_attrs[i].id); + + /* + * Build the offset table used when setting file attributes. File + * attributes are stored in the object's bonus buffer; this table + * provides the buffer offset of attributes referenced by the layout + * table. + */ + fs->sacnt = nitems(zpl_attrs); + fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs)); + for (size_t i = 0; i < fs->sacnt; i++) + fs->saoffs[i] = 0xffff; + offset = 0; + for (size_t i = 0; i < nitems(zpl_attr_layout); i++) { + uint16_t size; + + assert(zpl_attr_layout[i] < fs->sacnt); + + fs->saoffs[zpl_attr_layout[i]] = offset; + size = zpl_attrs[zpl_attr_layout[i]].size; + offset += size; + } + fs->satab = zpl_attrs; + + return (saobjid); +} + +static void +fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg) +{ + char *mountpoint, *origmountpoint, *name, *next; + fsnode *cur, *root; + uint64_t canmount; + + if (!dsl_dir_has_dataset(dsldir)) + return; + + if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0) + return; + mountpoint = dsl_dir_get_mountpoint(zfs, dsldir); + if (mountpoint == NULL) + return; + + /* + * If we were asked to specify a bootfs, set it here. + */ + if (zfs->bootfs != NULL && strcmp(zfs->bootfs, + dsl_dir_fullname(dsldir)) == 0) { + zap_add_uint64(zfs->poolprops, "bootfs", + dsl_dir_dataset_id(dsldir)); + } + + origmountpoint = mountpoint; + + /* + * Figure out which fsnode corresponds to our mountpoint. + */ + root = arg; + cur = root; + if (strcmp(mountpoint, zfs->rootpath) != 0) { + mountpoint += strlen(zfs->rootpath); + + /* + * Look up the directory in the staged tree. For example, if + * the dataset's mount point is /foo/bar/baz, we'll search the + * root directory for "foo", search "foo" for "baz", and so on. + * Each intermediate name must refer to a directory; the final + * component need not exist. + */ + cur = root; + for (next = name = mountpoint; next != NULL;) { + for (; *next == '/'; next++) + ; + name = strsep(&next, "/"); + + for (; cur != NULL && strcmp(cur->name, name) != 0; + cur = cur->next) + ; + if (cur == NULL) { + if (next == NULL) + break; + errx(1, "missing mountpoint directory for `%s'", + dsl_dir_fullname(dsldir)); + } + if (cur->type != S_IFDIR) { + errx(1, + "mountpoint for `%s' is not a directory", + dsl_dir_fullname(dsldir)); + } + if (next != NULL) + cur = cur->child; + } + } + + if (cur != NULL) { + assert(cur->type == S_IFDIR); + + /* + * Multiple datasets shouldn't share a mountpoint. It's + * technically allowed, but it's not clear what makefs should do + * in that case. + */ + assert((cur->inode->flags & FI_ROOT) == 0); + if (cur != root) + cur->inode->flags |= FI_ROOT; + assert(cur->inode->param == NULL); + cur->inode->param = dsldir; + } + + free(origmountpoint); +} + +static int +fs_foreach_mark(fsnode *cur, void *arg) +{ + uint64_t *countp; + + countp = arg; + if (cur->type == S_IFDIR && fsnode_isroot(cur)) + return (1); + + if (cur->inode->ino == 0) { + cur->inode->ino = ++(*countp); + cur->inode->nlink = 1; + } else { + cur->inode->nlink++; + } + + return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1); +} + +/* + * Create a filesystem dataset. More specifically: + * - create an object set for the dataset, + * - add required metadata (SA tables, property definitions, etc.) to that + * object set, + * - optionally populate the object set with file objects, using "root" as the + * root directory. + * + * "dirfd" is a directory descriptor for the directory referenced by "root". It + * is closed before returning. + */ +static void +fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd) +{ + struct fs_populate_arg arg; + zfs_fs_t fs; + zfs_zap_t *masterzap; + zfs_objset_t *os; + dnode_phys_t *deleteq, *masterobj; + uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid; + bool fakedroot; + + /* + * This dataset's mountpoint doesn't exist in the staging tree, or the + * dataset doesn't have a mountpoint at all. In either case we still + * need a root directory. Fake up a root fsnode to handle this case. + */ + fakedroot = root == NULL; + if (fakedroot) { + struct stat *stp; + + assert(dirfd == -1); + + root = ecalloc(1, sizeof(*root)); + root->inode = ecalloc(1, sizeof(*root->inode)); + root->name = estrdup("."); + root->type = S_IFDIR; + + stp = &root->inode->st; + stp->st_uid = 0; + stp->st_gid = 0; + stp->st_mode = S_IFDIR | 0755; + } + assert(root->type == S_IFDIR); + assert(fsnode_isroot(root)); + + /* + * Initialize the object set for this dataset. + */ + os = objset_alloc(zfs, DMU_OST_ZFS); + masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid); + assert(moid == MASTER_NODE_OBJ); + + memset(&fs, 0, sizeof(fs)); + fs.os = os; + + /* + * Create the ZAP SA layout now since filesystem object dnodes will + * refer to those attributes. + */ + saobjid = fs_set_zpl_attrs(zfs, &fs); + + /* + * Make a pass over the staged directory to detect hard links and assign + * virtual dnode numbers. + */ + dnodecount = 1; /* root directory */ + fsnode_foreach(root, fs_foreach_mark, &dnodecount); + + /* + * Make a second pass to populate the dataset with files from the + * staged directory. Most of our runtime is spent here. + */ + arg.rootdirfd = dirfd; + arg.zfs = zfs; + arg.fs = &fs; + SLIST_INIT(&arg.dirs); + fs_populate_dir(root, &arg); + assert(!SLIST_EMPTY(&arg.dirs)); + fsnode_foreach(root, fs_foreach_populate, &arg); + assert(SLIST_EMPTY(&arg.dirs)); + rootdirid = arg.rootdirid; + + /* + * Create an empty delete queue. We don't do anything with it, but + * OpenZFS will refuse to mount filesystems that don't have one. + */ + deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid); + zap_write(zfs, zap_alloc(os, deleteq)); + + /* + * Populate and write the master node object. This is a ZAP object + * containing various dataset properties and the object IDs of the root + * directory and delete queue. + */ + masterzap = zap_alloc(os, masterobj); + zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid); + zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid); + zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid); + zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */); + zap_add_uint64(masterzap, "normalization", 0 /* off */); + zap_add_uint64(masterzap, "utf8only", 0 /* off */); + zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */); + zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */); + zap_write(zfs, masterzap); + + /* + * All finished with this object set, we may as well write it now. + * The DSL layer will sum up the bytes consumed by each dataset using + * information stored in the object set, so it can't be freed just yet. + */ + dsl_dir_dataset_write(zfs, os, dsldir); + + if (fakedroot) { + free(root->inode); + free(root->name); + free(root); + } + free(fs.saoffs); +} + +/* + * Create an object set for each DSL directory which has a dataset and doesn't + * already have an object set. + */ +static void +fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused) +{ + if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir)) + fs_build_one(zfs, dsldir, NULL, -1); +} + +/* + * Create our datasets and populate them with files. + */ +void +fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root) +{ + /* + * Run through our datasets and find the root fsnode for each one. Each + * root fsnode is flagged so that we can figure out which dataset it + * belongs to. + */ + dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root); + + /* + * Did we find our boot filesystem? + */ + if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs")) + errx(1, "no mounted dataset matches bootfs property `%s'", + zfs->bootfs); + + /* + * Traverse the file hierarchy starting from the root fsnode. One + * dataset, not necessarily the root dataset, must "own" the root + * directory by having its mountpoint be equal to the root path. + * + * As roots of other datasets are encountered during the traversal, + * fs_build_one() recursively creates the corresponding object sets and + * populates them. Once this function has returned, all datasets will + * have been fully populated. + */ + fs_build_one(zfs, root->inode->param, root, dirfd); + + /* + * Now create object sets for datasets whose mountpoints weren't found + * in the staging directory, either because there is no mountpoint, or + * because the mountpoint doesn't correspond to an existing directory. + */ + dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL); +} |