diff options
author | Mark Johnston <markj@FreeBSD.org> | 2022-08-05 17:07:54 +0000 |
---|---|---|
committer | Mark Johnston <markj@FreeBSD.org> | 2022-08-05 17:42:29 +0000 |
commit | 240afd8c1fcc8c5f29dbd4ff0c915795d414405d (patch) | |
tree | e9be60fe4e9d1090c12b76221d37e2f9adeec52a | |
parent | 3e1101f29b4ff39a67eb10a5b41b727d8702f0f5 (diff) | |
download | src-240afd8c1fcc.tar.gz src-240afd8c1fcc.zip |
makefs: Add ZFS support
This allows one to take a staged directory tree and create a file
consisting of a ZFS pool with one or more datasets that contain the
contents of the directory tree. This is useful for creating virtual
machine images without using the kernel to create a pool; "zpool create"
requires root privileges and currently is not permitted in jails.
makefs -t zfs also provides reproducible images by using a fixed seed
for pseudo-random number generation, used for generating GUIDs and hash
salts. makefs -t zfs requires relatively little by way of machine
resources.
The "zpool_reguid" rc.conf setting can be used to ask a FreeBSD guest to
generate a unique pool GUID upon first boot.
A small number of pool and dataset properties are supported. The pool
is backed by a single disk vdev. Data is always checksummed using
Fletcher-4, no redundant copies are made, and no compression is used.
The manual page documents supported pool and filesystem properties.
The implementation uses a few pieces of ZFS support from within the boot
loader, especially definitions for various on-disk structures, but is
otherwise standalone and in particular doesn't depend on OpenZFS.
This feature should be treated as experimental for now, i.e., important
data shouldn't be trusted to a makefs-created pool, and the command-line
interface is subject to change.
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D35248
-rw-r--r-- | usr.sbin/makefs/Makefile | 11 | ||||
-rw-r--r-- | usr.sbin/makefs/makefs.8 | 97 | ||||
-rw-r--r-- | usr.sbin/makefs/makefs.c | 3 | ||||
-rw-r--r-- | usr.sbin/makefs/makefs.h | 5 | ||||
-rw-r--r-- | usr.sbin/makefs/tests/Makefile | 1 | ||||
-rw-r--r-- | usr.sbin/makefs/tests/makefs_zfs_tests.sh | 634 | ||||
-rw-r--r-- | usr.sbin/makefs/zfs.c | 758 | ||||
-rw-r--r-- | usr.sbin/makefs/zfs/Makefile.inc | 12 | ||||
-rw-r--r-- | usr.sbin/makefs/zfs/dsl.c | 598 | ||||
-rw-r--r-- | usr.sbin/makefs/zfs/fs.c | 981 | ||||
-rw-r--r-- | usr.sbin/makefs/zfs/objset.c | 259 | ||||
-rw-r--r-- | usr.sbin/makefs/zfs/vdev.c | 435 | ||||
-rw-r--r-- | usr.sbin/makefs/zfs/zap.c | 551 | ||||
-rw-r--r-- | usr.sbin/makefs/zfs/zfs.h | 167 |
14 files changed, 4509 insertions, 3 deletions
diff --git a/usr.sbin/makefs/Makefile b/usr.sbin/makefs/Makefile index 3fea648f9383..fe472d7e7309 100644 --- a/usr.sbin/makefs/Makefile +++ b/usr.sbin/makefs/Makefile @@ -19,6 +19,17 @@ MAN= makefs.8 NO_WCAST_ALIGN= CSTD= c11 +.if ${MK_ZFS} != "no" +SRCS+= zfs.c +CFLAGS+=-I${SRCDIR}/zfs \ + -I${SRCTOP}/stand/libsa \ + -I${SRCTOP}/sys/cddl/boot + +CFLAGS+= -DHAVE_ZFS + +.include "${SRCDIR}/zfs/Makefile.inc" +.endif + .include "${SRCDIR}/cd9660/Makefile.inc" .include "${SRCDIR}/ffs/Makefile.inc" .include "${SRCDIR}/msdos/Makefile.inc" diff --git a/usr.sbin/makefs/makefs.8 b/usr.sbin/makefs/makefs.8 index fdf8d532b69f..464583eab3a1 100644 --- a/usr.sbin/makefs/makefs.8 +++ b/usr.sbin/makefs/makefs.8 @@ -35,7 +35,7 @@ .\" .\" $FreeBSD$ .\" -.Dd September 17, 2020 +.Dd August 5, 2022 .Dt MAKEFS 8 .Os .Sh NAME @@ -266,6 +266,8 @@ BSD fast file system (default). ISO 9660 file system. .It Sy msdos FAT12, FAT16, or FAT32 file system. +.It Sy zfs +ZFS pool containing one or more file systems. .El .It Fl x Exclude file system nodes not explicitly listed in the specfile. @@ -494,10 +496,97 @@ Volume ID. .It Cm volume_label Volume Label. .El +.Ss zfs-specific options +Note: ZFS support is currently considered experimental. +Do not use it for anything critical. +.Pp +The image created by +.Nm +contains a ZFS pool with a single vdev of type +.Ql disk . +The root dataset is always created implicitly and contains the entire input +directory tree unless additional datasets are specified using the options +described below. +.Pp +The arguments consist of a keyword, an equal sign +.Pq Ql = , +and a value. +The following keywords are supported: +.Pp +.Bl -tag -width omit-trailing-period -offset indent -compact +.It ashift +The base-2 logarithm of the minimum block size. +Typical values are 9 (512B blocks) and 12 (4KB blocks). +The default value is 12. +.It bootfs +The name of the bootable dataset for the pool. 
+Specifying this option causes the +.Ql bootfs +property to be set in the created pool. +.It mssize +The size of metaslabs in the created pool. +By default, +.Nm +allocates large (up to 512MB) metaslabs with the expectation that +the image will be auto-expanded upon first use. +This option allows the default heuristic to be overridden. +.It poolname +The name of the ZFS pool. +This option must be specified. +.It rootpath +An implicit path prefix added to dataset mountpoints. +By default it is +.Pa /<poolname> . +For creating bootable pools, the +.Va rootpath +should be set to +.Pa / . +At least one dataset must have a mountpoint equal to +.Va rootpath . +.It fs +Create an additional dataset. +This option may be specified multiple times. +The argument value must be of the form +.Ar <dataset>[;<prop1=v1>[;<prop2=v2>[;...]]] , +where +.Ar dataset +is the name of the dataset and must belong to the pool's namespace. +For example, with a pool name of +.Ql test +all dataset names must be prefixed by +.Ql test/ . +A dataset must exist at each level of the pool's namespace. +For example, to create +.Ql test/foo/bar , +.Ql test/foo +must be created as well. +.Pp +The dataset mountpoints determine how the datasets are populated with +files from the staged directory tree. +Conceptually, all datasets are mounted before any are populated with files. +The root of the staged directory tree is mapped to +.Va rootpath . +.Pp +Dataset properties, as described in +.Xr zfsprops 8 , +may be specified following the dataset name. +The following properties may be set for a dataset: +.Pp +.Bl -tag -compact -offset indent +.It atime +.It canmount +.It exec +.It mountpoint +.It setuid +.El +.El .Sh SEE ALSO .Xr mtree 5 , .Xr mtree 8 , -.Xr newfs 8 +.Xr newfs 8 , +.Xr zfsconcepts 8 , +.Xr zfsprops 8 , +.Xr zpoolprops 8 .Sh HISTORY The .Nm @@ -518,4 +607,6 @@ and first appeared in .An Ram Vedam (cd9660 support), .An Christos Zoulas -(msdos support). 
+(msdos support), +.An Mark Johnston +(zfs support). diff --git a/usr.sbin/makefs/makefs.c b/usr.sbin/makefs/makefs.c index 888a2b3edea7..2a50768d3152 100644 --- a/usr.sbin/makefs/makefs.c +++ b/usr.sbin/makefs/makefs.c @@ -77,6 +77,9 @@ static fstype_t fstypes[] = { ENTRY(cd9660), ENTRY(ffs), ENTRY(msdos), +#ifdef HAVE_ZFS + ENTRY(zfs), +#endif { .type = NULL }, }; diff --git a/usr.sbin/makefs/makefs.h b/usr.sbin/makefs/makefs.h index 68dc0362dd21..e88313e8366d 100644 --- a/usr.sbin/makefs/makefs.h +++ b/usr.sbin/makefs/makefs.h @@ -78,12 +78,14 @@ enum fi_flags { FI_SIZED = 1<<0, /* inode sized */ FI_ALLOCATED = 1<<1, /* fsinode->ino allocated */ FI_WRITTEN = 1<<2, /* inode written */ + FI_ROOT = 1<<3, /* root of a ZFS dataset */ }; typedef struct { uint32_t ino; /* inode number used on target fs */ uint32_t nlink; /* number of links to this entry */ enum fi_flags flags; /* flags used by fs specific code */ + void *param; /* for use by individual fs impls */ struct stat st; /* stat entry */ } fsinode; @@ -186,6 +188,9 @@ void fs ## _makefs(const char *, const char *, fsnode *, fsinfo_t *) DECLARE_FUN(cd9660); DECLARE_FUN(ffs); DECLARE_FUN(msdos); +#ifdef HAVE_ZFS +DECLARE_FUN(zfs); +#endif extern u_int debug; extern int dupsok; diff --git a/usr.sbin/makefs/tests/Makefile b/usr.sbin/makefs/tests/Makefile index 85e4b233aea7..c2c9f6bea5b6 100644 --- a/usr.sbin/makefs/tests/Makefile +++ b/usr.sbin/makefs/tests/Makefile @@ -2,6 +2,7 @@ ATF_TESTS_SH+= makefs_cd9660_tests ATF_TESTS_SH+= makefs_ffs_tests +ATF_TESTS_SH+= makefs_zfs_tests BINDIR= ${TESTSDIR} diff --git a/usr.sbin/makefs/tests/makefs_zfs_tests.sh b/usr.sbin/makefs/tests/makefs_zfs_tests.sh new file mode 100644 index 000000000000..8cd79966c49a --- /dev/null +++ b/usr.sbin/makefs/tests/makefs_zfs_tests.sh @@ -0,0 +1,634 @@ +#- +# SPDX-License-Identifier: BSD-2-Clause-FreeBSD +# +# Copyright (c) 2022 The FreeBSD Foundation +# +# This software was developed by Mark Johnston under sponsorship from +# the FreeBSD 
Foundation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# + +MAKEFS="makefs -t zfs -o nowarn=true" +ZFS_POOL_NAME="makefstest$$" +TEST_ZFS_POOL_NAME="$TMPDIR/poolname" + +. "$(dirname "$0")/makefs_tests_common.sh" + +common_cleanup() +{ + local pool md + + # Try to force a TXG, this can help catch bugs by triggering a panic. 
+ sync + + pool=$(cat $TEST_ZFS_POOL_NAME) + if zpool list "$pool" >/dev/null; then + zpool destroy "$pool" + fi + + md=$(cat $TEST_MD_DEVICE_FILE) + if [ -c /dev/"$md" ]; then + mdconfig -d -u "$md" + fi +} + +import_image() +{ + atf_check -e empty -o save:$TEST_MD_DEVICE_FILE -s exit:0 \ + mdconfig -a -f $TEST_IMAGE + atf_check zpool import -R $TEST_MOUNT_DIR $ZFS_POOL_NAME + echo "$ZFS_POOL_NAME" > $TEST_ZFS_POOL_NAME +} + +# +# Test autoexpansion of the vdev. +# +# The pool is initially 10GB, so we get 10GB minus one metaslab's worth of +# usable space for data. Then the pool is expanded to 50GB, and the amount of +# usable space is 50GB minus one metaslab. +# +atf_test_case autoexpand cleanup +autoexpand_body() +{ + local mssize poolsize poolsize1 newpoolsize + + create_test_inputs + + mssize=$((128 * 1024 * 1024)) + poolsize=$((10 * 1024 * 1024 * 1024)) + atf_check $MAKEFS -s $poolsize -o mssize=$mssize -o rootpath=/ \ + -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + newpoolsize=$((50 * 1024 * 1024 * 1024)) + truncate -s $newpoolsize $TEST_IMAGE + + import_image + + check_image_contents + + poolsize1=$(zpool list -Hp -o size $ZFS_POOL_NAME) + atf_check [ $((poolsize1 + $mssize)) -eq $poolsize ] + + atf_check zpool online -e $ZFS_POOL_NAME /dev/$(cat $TEST_MD_DEVICE_FILE) + + check_image_contents + + poolsize1=$(zpool list -Hp -o size $ZFS_POOL_NAME) + atf_check [ $((poolsize1 + $mssize)) -eq $newpoolsize ] +} +autoexpand_cleanup() +{ + common_cleanup +} + +# +# Test with some default layout defined by the common code. 
+# +atf_test_case basic cleanup +basic_body() +{ + create_test_inputs + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +basic_cleanup() +{ + common_cleanup +} + +atf_test_case dataset_removal cleanup +dataset_removal_body() +{ + create_test_dirs + + cd $TEST_INPUTS_DIR + mkdir dir + cd - + + atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}/dir \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + atf_check zfs destroy ${ZFS_POOL_NAME}/dir +} +dataset_removal_cleanup() +{ + common_cleanup +} + +# +# Make sure that we can create and remove an empty directory. +# +atf_test_case empty_dir cleanup +empty_dir_body() +{ + create_test_dirs + + cd $TEST_INPUTS_DIR + mkdir dir + cd - + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + atf_check rmdir ${TEST_MOUNT_DIR}/dir +} +empty_dir_cleanup() +{ + common_cleanup +} + +atf_test_case empty_fs cleanup +empty_fs_body() +{ + create_test_dirs + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +empty_fs_cleanup() +{ + common_cleanup +} + +atf_test_case file_sizes cleanup +file_sizes_body() +{ + local i + + create_test_dirs + cd $TEST_INPUTS_DIR + + i=1 + while [ $i -lt $((1 << 20)) ]; do + truncate -s $i ${i}.1 + truncate -s $(($i - 1)) ${i}.2 + truncate -s $(($i + 1)) ${i}.3 + i=$(($i << 1)) + done + + cd - + + # XXXMJ this creates sparse files, make sure makefs doesn't + # preserve the sparseness. 
+ # XXXMJ need to test with larger files (at least 128MB for L2 indirs) + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +file_sizes_cleanup() +{ + common_cleanup +} + +atf_test_case hard_links cleanup +hard_links_body() +{ + local f + + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir + echo "hello" > 1 + ln 1 2 + ln 1 dir/1 + + echo "goodbye" > dir/a + ln dir/a dir/b + ln dir/a a + + cd - + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + stat -f '%i' ${TEST_MOUNT_DIR}/1 > ./ino + stat -f '%l' ${TEST_MOUNT_DIR}/1 > ./nlink + for f in 1 2 dir/1; do + atf_check -o file:./nlink -e empty -s exit:0 \ + stat -f '%l' ${TEST_MOUNT_DIR}/${f} + atf_check -o file:./ino -e empty -s exit:0 \ + stat -f '%i' ${TEST_MOUNT_DIR}/${f} + atf_check cmp -s ${TEST_INPUTS_DIR}/1 ${TEST_MOUNT_DIR}/${f} + done + + stat -f '%i' ${TEST_MOUNT_DIR}/dir/a > ./ino + stat -f '%l' ${TEST_MOUNT_DIR}/dir/a > ./nlink + for f in dir/a dir/b a; do + atf_check -o file:./nlink -e empty -s exit:0 \ + stat -f '%l' ${TEST_MOUNT_DIR}/${f} + atf_check -o file:./ino -e empty -s exit:0 \ + stat -f '%i' ${TEST_MOUNT_DIR}/${f} + atf_check cmp -s ${TEST_INPUTS_DIR}/dir/a ${TEST_MOUNT_DIR}/${f} + done +} +hard_links_cleanup() +{ + common_cleanup +} + +# Allocate enough dnodes from an object set that the meta dnode needs to use +# indirect blocks. +atf_test_case indirect_dnode_array cleanup +indirect_dnode_array_body() +{ + local count i + + # How many dnodes do we need to allocate? Well, the data block size + # for meta dnodes is always 16KB, so with a dnode size of 512B we get + # 32 dnodes per direct block. The maximum indirect block size is 128KB + # and that can fit 1024 block pointers, so we need at least 32 * 1024 + # files to force the use of two levels of indirection. 
+ # + # Unfortunately that number of files makes the test run quite slowly, + # so we settle for a single indirect block for now... + count=$(jot -r 1 32 1024) + + create_test_dirs + cd $TEST_INPUTS_DIR + for i in $(seq 1 $count); do + touch $i + done + cd - + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +indirect_dnode_array_cleanup() +{ + common_cleanup +} + +# +# Create some files with long names, so as to test fat ZAP handling. +# +atf_test_case long_file_name cleanup +long_file_name_body() +{ + local dir i + + create_test_dirs + cd $TEST_INPUTS_DIR + + # micro ZAP keys can be at most 50 bytes. + for i in $(seq 1 60); do + touch $(jot -s '' $i 1 1) + done + dir=$(jot -s '' 61 1 1) + mkdir $dir + for i in $(seq 1 60); do + touch ${dir}/$(jot -s '' $i 1 1) + done + + cd - + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + # Add a directory entry in the hope that OpenZFS might catch a bug + # in makefs' fat ZAP encoding. + touch ${TEST_MOUNT_DIR}/foo +} +long_file_name_cleanup() +{ + common_cleanup +} + +# +# Exercise handling of multiple datasets. +# +atf_test_case multi_dataset_1 cleanup +multi_dataset_1_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir1 + echo a > dir1/a + mkdir dir2 + echo b > dir2/b + + cd - + + atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}/dir1 -o fs=${ZFS_POOL_NAME}/dir2 \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + # Make sure that we have three datasets with the expected mount points. 
+ atf_check -o inline:${ZFS_POOL_NAME}\\n -e empty -s exit:0 \ + zfs list -H -o name ${ZFS_POOL_NAME} + atf_check -o inline:${TEST_MOUNT_DIR}\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME} + + atf_check -o inline:${ZFS_POOL_NAME}/dir1\\n -e empty -s exit:0 \ + zfs list -H -o name ${ZFS_POOL_NAME}/dir1 + atf_check -o inline:${TEST_MOUNT_DIR}/dir1\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1 + + atf_check -o inline:${ZFS_POOL_NAME}/dir2\\n -e empty -s exit:0 \ + zfs list -H -o name ${ZFS_POOL_NAME}/dir2 + atf_check -o inline:${TEST_MOUNT_DIR}/dir2\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir2 +} +multi_dataset_1_cleanup() +{ + common_cleanup +} + +# +# Create a pool with two datasets, where the root dataset is mounted below +# the child dataset. +# +atf_test_case multi_dataset_2 cleanup +multi_dataset_2_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir1 + echo a > dir1/a + mkdir dir2 + echo b > dir2/b + + cd - + + atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}/dir1\;mountpoint=/ \ + -o fs=${ZFS_POOL_NAME}\;mountpoint=/dir1 \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +multi_dataset_2_cleanup() +{ + common_cleanup +} + +# +# Create a dataset with a non-existent mount point. +# +atf_test_case multi_dataset_3 cleanup +multi_dataset_3_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir1 + echo a > dir1/a + + cd - + + atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}/dir1 \ + -o fs=${ZFS_POOL_NAME}/dir2 \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + atf_check -o inline:${TEST_MOUNT_DIR}/dir2\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir2 + + # Mounting dir2 should have created a directory called dir2. Go + # back and create it in the staging tree before comparing. 
+ atf_check mkdir ${TEST_INPUTS_DIR}/dir2 + + check_image_contents +} +multi_dataset_3_cleanup() +{ + common_cleanup +} + +# +# Create an unmounted dataset. +# +atf_test_case multi_dataset_4 cleanup +multi_dataset_4_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir1 + echo a > dir1/a + + cd - + + atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}/dir1\;canmount=noauto\;mountpoint=none \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + atf_check -o inline:none\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1 + + check_image_contents + + atf_check zfs set mountpoint=/dir1 ${ZFS_POOL_NAME}/dir1 + atf_check zfs mount ${ZFS_POOL_NAME}/dir1 + atf_check -o inline:${TEST_MOUNT_DIR}/dir1\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1 + + # dir1/a should be part of the root dataset, not dir1. + atf_check -s not-exit:0 -e not-empty stat ${TEST_MOUNT_DIR}dir1/a +} +multi_dataset_4_cleanup() +{ + common_cleanup +} + +# +# Rudimentary test to verify that two ZFS images created using the same +# parameters and input hierarchy are byte-identical. In particular, makefs(1) +# does not preserve file access times. +# +atf_test_case reproducible cleanup +reproducible_body() +{ + create_test_inputs + + atf_check $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + ${TEST_IMAGE}.1 $TEST_INPUTS_DIR + + atf_check $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + ${TEST_IMAGE}.2 $TEST_INPUTS_DIR + + # XXX-MJ cmp(1) is really slow + atf_check cmp ${TEST_IMAGE}.1 ${TEST_IMAGE}.2 +} +reproducible_cleanup() +{ +} + +# +# Verify that we can take a snapshot of a generated dataset. 
+# +atf_test_case snapshot cleanup +snapshot_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir + echo "hello" > dir/hello + echo "goodbye" > goodbye + + cd - + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + atf_check zfs snapshot ${ZFS_POOL_NAME}@1 +} +snapshot_cleanup() +{ + common_cleanup +} + +# +# Check handling of symbolic links. +# +atf_test_case soft_links cleanup +soft_links_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir + ln -s a a + ln -s dir/../a a + ln -s dir/b b + echo 'c' > dir + ln -s dir/c c + # XXX-MJ overflows bonus buffer ln -s $(jot -s '' 320 1 1) 1 + + cd - + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +soft_links_cleanup() +{ + common_cleanup +} + +# +# Verify that we can set properties on the root dataset. +# +atf_test_case root_props cleanup +root_props_body() +{ + create_test_inputs + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}\;atime=off\;setuid=off \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + atf_check -o inline:off\\n -e empty -s exit:0 \ + zfs get -H -o value atime $ZFS_POOL_NAME + atf_check -o inline:local\\n -e empty -s exit:0 \ + zfs get -H -o source atime $ZFS_POOL_NAME + atf_check -o inline:off\\n -e empty -s exit:0 \ + zfs get -H -o value setuid $ZFS_POOL_NAME + atf_check -o inline:local\\n -e empty -s exit:0 \ + zfs get -H -o source setuid $ZFS_POOL_NAME +} +root_props_cleanup() +{ + common_cleanup +} + +atf_init_test_cases() +{ + atf_add_test_case autoexpand + atf_add_test_case basic + atf_add_test_case dataset_removal + atf_add_test_case empty_dir + atf_add_test_case empty_fs + atf_add_test_case file_sizes + atf_add_test_case hard_links + atf_add_test_case indirect_dnode_array + atf_add_test_case long_file_name + atf_add_test_case 
multi_dataset_1 + atf_add_test_case multi_dataset_2 + atf_add_test_case multi_dataset_3 + atf_add_test_case multi_dataset_4 + atf_add_test_case reproducible + atf_add_test_case snapshot + atf_add_test_case soft_links + atf_add_test_case root_props + + # XXXMJ tests: + # - test with different ashifts (at least, 9 and 12), different image sizes + # - create datasets in imported pool +} diff --git a/usr.sbin/makefs/zfs.c b/usr.sbin/makefs/zfs.c new file mode 100644 index 000000000000..08689a558870 --- /dev/null +++ b/usr.sbin/makefs/zfs.c @@ -0,0 +1,758 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/queue.h> + +#include <assert.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <util.h> + +#include "makefs.h" +#include "zfs.h" + +#define VDEV_LABEL_SPACE \ + ((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) +_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, ""); + +#define MINMSSIZE ((off_t)1 << 24) /* 16MB */ +#define DFLTMSSIZE ((off_t)1 << 29) /* 512MB */ +#define MAXMSSIZE ((off_t)1 << 34) /* 16GB */ + +#define INDIR_LEVELS 6 +/* Indirect blocks are always 128KB. 
*/ +#define BLKPTR_PER_INDIR (MAXBLOCKSIZE / sizeof(blkptr_t)) + +struct dnode_cursor { + char inddir[INDIR_LEVELS][MAXBLOCKSIZE]; + off_t indloc; + off_t indspace; + dnode_phys_t *dnode; + off_t dataoff; + off_t datablksz; +}; + +void +zfs_prep_opts(fsinfo_t *fsopts) +{ + zfs_opt_t *zfs = ecalloc(1, sizeof(*zfs)); + + const option_t zfs_options[] = { + { '\0', "bootfs", &zfs->bootfs, OPT_STRPTR, + 0, 0, "Bootable dataset" }, + { '\0', "mssize", &zfs->mssize, OPT_INT64, + MINMSSIZE, MAXMSSIZE, "Metaslab size" }, + { '\0', "poolname", &zfs->poolname, OPT_STRPTR, + 0, 0, "ZFS pool name" }, + { '\0', "rootpath", &zfs->rootpath, OPT_STRPTR, + 0, 0, "Prefix for all dataset mount points" }, + { '\0', "ashift", &zfs->ashift, OPT_INT32, + MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" }, + { '\0', "nowarn", &zfs->nowarn, OPT_BOOL, + 0, 0, "Suppress warning about experimental ZFS support" }, + { .name = NULL } + }; + + STAILQ_INIT(&zfs->datasetdescs); + + fsopts->fs_specific = zfs; + fsopts->fs_options = copy_opts(zfs_options); +} + +int +zfs_parse_opts(const char *option, fsinfo_t *fsopts) +{ + zfs_opt_t *zfs; + struct dataset_desc *dsdesc; + char buf[BUFSIZ], *opt, *val; + int rv; + + zfs = fsopts->fs_specific; + + opt = val = estrdup(option); + opt = strsep(&val, "="); + if (strcmp(opt, "fs") == 0) { + if (val == NULL) + errx(1, "invalid filesystem parameters `%s'", option); + + /* + * Dataset descriptions will be parsed later, in dsl_init(). + * Just stash them away for now. + */ + dsdesc = ecalloc(1, sizeof(*dsdesc)); + dsdesc->params = estrdup(val); + free(opt); + STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next); + return (1); + } + free(opt); + + rv = set_option(fsopts->fs_options, option, buf, sizeof(buf)); + return (rv == -1 ? 
0 : 1); +} + +static void +zfs_size_vdev(fsinfo_t *fsopts) +{ + zfs_opt_t *zfs; + off_t asize, mssize, vdevsize, vdevsize1; + + zfs = fsopts->fs_specific; + + assert(fsopts->maxsize != 0); + assert(zfs->ashift != 0); + + /* + * Figure out how big the vdev should be. + */ + vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift); + if (vdevsize < MINDEVSIZE) + errx(1, "maximum image size is too small"); + if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) { + errx(1, "image size bounds must be multiples of %d", + 1 << zfs->ashift); + } + asize = vdevsize - VDEV_LABEL_SPACE; + + /* + * Size metaslabs according to the following heuristic: + * - provide at least 8 metaslabs, + * - without using a metaslab size larger than 512MB. + * This approximates what OpenZFS does without being complicated. In + * practice we expect pools to be expanded upon first use, and OpenZFS + * does not resize metaslabs in that case, so there is no right answer + * here. In general we want to provide large metaslabs even if the + * image size is small, and 512MB is a reasonable size for pools up to + * several hundred gigabytes. + * + * The user may override this heuristic using the "-o mssize" option. + */ + mssize = zfs->mssize; + if (mssize == 0) { + mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE); + if (!powerof2(mssize)) + mssize = 1l << (flsll(mssize) - 1); + } + if (!powerof2(mssize)) + errx(1, "metaslab size must be a power of 2"); + + /* + * If we have some slop left over, try to cover it by resizing the vdev, + * subject to the maxsize and minsize parameters. 
+ */ + if (asize % mssize != 0) { + vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE; + if (vdevsize1 < fsopts->minsize) + vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE; + if (vdevsize1 <= fsopts->maxsize) + vdevsize = vdevsize1; + } + asize = vdevsize - VDEV_LABEL_SPACE; + + zfs->asize = asize; + zfs->vdevsize = vdevsize; + zfs->mssize = mssize; + zfs->msshift = flsll(mssize) - 1; + zfs->mscount = asize / mssize; +} + +/* + * Validate options and set some default values. + */ +static void +zfs_check_opts(fsinfo_t *fsopts) +{ + zfs_opt_t *zfs; + + zfs = fsopts->fs_specific; + + if (fsopts->offset != 0) + errx(1, "unhandled offset option"); + if (fsopts->maxsize == 0) + errx(1, "an image size must be specified"); + + if (zfs->poolname == NULL) + errx(1, "a pool name must be specified"); + + if (zfs->rootpath == NULL) + easprintf(&zfs->rootpath, "/%s", zfs->poolname); + if (zfs->rootpath[0] != '/') + errx(1, "mountpoint `%s' must be absolute", zfs->rootpath); + + if (zfs->ashift == 0) + zfs->ashift = 12; + + zfs_size_vdev(fsopts); +} + +void +zfs_cleanup_opts(fsinfo_t *fsopts) +{ + struct dataset_desc *d, *tmp; + zfs_opt_t *zfs; + + zfs = fsopts->fs_specific; + free(zfs->rootpath); + free(zfs->bootfs); + free(__DECONST(void *, zfs->poolname)); + STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) { + free(d->params); + free(d); + } + free(zfs); + free(fsopts->fs_options); +} + +static size_t +nvlist_size(const nvlist_t *nvl) +{ + return (sizeof(nvl->nv_header) + nvl->nv_size); +} + +static void +nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz) +{ + assert(sz >= nvlist_size(nvl)); + + memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header)); + memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size); +} + +static nvlist_t * +pool_config_nvcreate(zfs_opt_t *zfs) +{ + nvlist_t *featuresnv, *poolnv; + + poolnv = nvlist_create(NV_UNIQUE_NAME); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG); + nvlist_add_uint64(poolnv, 
ZPOOL_CONFIG_VERSION, SPA_VERSION); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED); + nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1); + + featuresnv = nvlist_create(NV_UNIQUE_NAME); + nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv); + nvlist_destroy(featuresnv); + + return (poolnv); +} + +static nvlist_t * +pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs) +{ + nvlist_t *diskvdevnv; + + assert(zfs->objarrid != 0); + + diskvdevnv = nvlist_create(NV_UNIQUE_NAME); + nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0); + nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null"); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY, + zfs->objarrid); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT, + zfs->msshift); + + return (diskvdevnv); +} + +static nvlist_t * +pool_root_vdev_config_nvcreate(zfs_opt_t *zfs) +{ + nvlist_t *diskvdevnv, *rootvdevnv; + + diskvdevnv = pool_disk_vdev_config_nvcreate(zfs); + rootvdevnv = nvlist_create(NV_UNIQUE_NAME); + + nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0); + nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid); + nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); + nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG); + nvlist_add_nvlist_array(rootvdevnv, 
ZPOOL_CONFIG_CHILDREN, &diskvdevnv, + 1); + nvlist_destroy(diskvdevnv); + + return (rootvdevnv); +} + +/* + * Create the pool's "config" object, which contains an nvlist describing pool + * parameters and the vdev topology. It is similar but not identical to the + * nvlist stored in vdev labels. The main difference is that vdev labels do not + * describe the full vdev tree and in particular do not contain the "root" + * meta-vdev. + */ +static void +pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + dnode_phys_t *dnode; + nvlist_t *poolconfig, *vdevconfig; + void *configbuf; + uint64_t dnid; + off_t configloc, configblksz; + int error; + + dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST, + DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid); + + poolconfig = pool_config_nvcreate(zfs); + + vdevconfig = pool_root_vdev_config_nvcreate(zfs); + nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); + nvlist_destroy(vdevconfig); + + error = nvlist_export(poolconfig); + if (error != 0) + errc(1, error, "nvlist_export"); + + configblksz = nvlist_size(poolconfig); + configloc = objset_space_alloc(zfs, zfs->mos, &configblksz); + configbuf = ecalloc(1, configblksz); + nvlist_copy(poolconfig, configbuf, configblksz); + + vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc); + + dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT; + dnode->dn_flags = DNODE_FLAG_USED_BYTES; + *(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig); + + zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid); + + nvlist_destroy(poolconfig); + free(configbuf); +} + +/* + * Add objects block pointer list objects, used for deferred frees. We don't do + * anything with them, but they need to be present or OpenZFS will refuse to + * import the pool. 
+ */ +static void +pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir) +{ + uint64_t dnid; + + (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, + BPOBJ_SIZE_V2, &dnid); + zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid); + + (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, + BPOBJ_SIZE_V2, &dnid); + zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid); +} + +/* + * Add required feature metadata objects. We don't know anything about ZFS + * features, so the objects are just empty ZAPs. + */ +static void +pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + dnode_phys_t *dnode; + uint64_t dnid; + + dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); + zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid); + zap_write(zfs, zap_alloc(zfs->mos, dnode)); + + dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); + zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid); + zap_write(zfs, zap_alloc(zfs->mos, dnode)); + + dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); + zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid); + zap_write(zfs, zap_alloc(zfs->mos, dnode)); +} + +static void +pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET, + dsl_dir_id(zfs->rootdsldir)); +} + +static void +pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + dnode_phys_t *dnode; + uint64_t id; + + dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id); + zap_add_uint64(objdir, DMU_POOL_PROPS, id); + + zfs->poolprops = zap_alloc(zfs->mos, dnode); +} + +/* + * Initialize the MOS object directory, the root of virtually all of the pool's + * data and metadata. 
+ */ +static void +pool_init_objdir(zfs_opt_t *zfs) +{ + zfs_zap_t *zap; + dnode_phys_t *objdir; + + objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT); + + zap = zap_alloc(zfs->mos, objdir); + pool_init_objdir_config(zfs, zap); + pool_init_objdir_bplists(zfs, zap); + pool_init_objdir_feature_maps(zfs, zap); + pool_init_objdir_dsl(zfs, zap); + pool_init_objdir_poolprops(zfs, zap); + zap_write(zfs, zap); +} + +/* + * Initialize the meta-object set (MOS) and immediately write out several + * special objects whose contents are already finalized, including the object + * directory. + * + * Once the MOS is finalized, it'll look roughly like this: + * + * object directory (ZAP) + * |-> vdev config object (nvlist) + * |-> features for read + * |-> features for write + * |-> feature descriptions + * |-> sync bplist + * |-> free bplist + * |-> pool properties + * L-> root DSL directory + * |-> DSL child directory (ZAP) + * | |-> $MOS (DSL dir) + * | | |-> child map + * | | L-> props (ZAP) + * | |-> $FREE (DSL dir) + * | | |-> child map + * | | L-> props (ZAP) + * | |-> $ORIGIN (DSL dir) + * | | |-> child map + * | | |-> dataset + * | | | L-> deadlist + * | | |-> snapshot + * | | | |-> deadlist + * | | | L-> snapshot names + * | | |-> props (ZAP) + * | | L-> clones (ZAP) + * | |-> dataset 1 (DSL dir) + * | | |-> DSL dataset + * | | | |-> snapshot names + * | | | L-> deadlist + * | | |-> child map + * | | | L-> ... + * | | L-> props + * | |-> dataset 2 + * | | L-> ... + * | |-> ... + * | L-> dataset n + * |-> DSL root dataset + * | |-> snapshot names + * | L-> deadlist + * L-> props (ZAP) + * space map object array + * |-> space map 1 + * |-> space map 2 + * |-> ... + * L-> space map n (zfs->mscount) + * + * The space map object array is pointed to by the "msarray" property in the + * pool configuration. 
+ */ +static void +pool_init(zfs_opt_t *zfs) +{ + uint64_t dnid; + + zfs->poolguid = ((uint64_t)random() << 32) | random(); + zfs->vdevguid = ((uint64_t)random() << 32) | random(); + + zfs->mos = objset_alloc(zfs, DMU_OST_META); + + (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid); + assert(dnid == DMU_POOL_DIRECTORY_OBJECT); + + (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid); + + dsl_init(zfs); + + pool_init_objdir(zfs); +} + +static void +pool_labels_write(zfs_opt_t *zfs) +{ + uberblock_t *ub; + vdev_label_t *label; + nvlist_t *poolconfig, *vdevconfig; + int error; + + label = ecalloc(1, sizeof(*label)); + + /* + * Assemble the vdev configuration and store it in the label. + */ + poolconfig = pool_config_nvcreate(zfs); + vdevconfig = pool_disk_vdev_config_nvcreate(zfs); + nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); + nvlist_destroy(vdevconfig); + + error = nvlist_export(poolconfig); + if (error != 0) + errc(1, error, "nvlist_export"); + nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist, + sizeof(label->vl_vdev_phys.vp_nvlist)); + nvlist_destroy(poolconfig); + + /* + * Fill out the uberblock. Just make each one the same. The embedded + * checksum is calculated in vdev_label_write(). + */ + for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock); + uoff += (1 << zfs->ashift)) { + ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff); + ub->ub_magic = UBERBLOCK_MAGIC; + ub->ub_version = SPA_VERSION; + ub->ub_txg = TXG; + ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid; + ub->ub_timestamp = 0; + + ub->ub_software_version = SPA_VERSION; + ub->ub_mmp_magic = MMP_MAGIC; + ub->ub_mmp_delay = 0; + ub->ub_mmp_config = 0; + ub->ub_checkpoint_txg = 0; + objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp); + } + + /* + * Write out four copies of the label: two at the beginning of the vdev + * and two at the end. 
+ */ + for (int i = 0; i < VDEV_LABELS; i++) + vdev_label_write(zfs, i, label); + + free(label); +} + +static void +pool_fini(zfs_opt_t *zfs) +{ + zap_write(zfs, zfs->poolprops); + dsl_write(zfs); + objset_write(zfs, zfs->mos); + pool_labels_write(zfs); +} + +struct dnode_cursor * +dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode, + off_t size, off_t blksz) +{ + struct dnode_cursor *c; + uint64_t nbppindir, indlevel, ndatablks, nindblks; + + assert(dnode->dn_nblkptr == 1); + assert(blksz <= MAXBLOCKSIZE); + + if (blksz == 0) { + /* Must be between 1<<ashift and 128KB. */ + blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift, + powerof2(size) ? size : (1ul << flsll(size)))); + } + assert(powerof2(blksz)); + + /* + * Do we need indirect blocks? Figure out how many levels are needed + * (indlevel == 1 means no indirect blocks) and how much space is needed + * (it has to be allocated up-front to break the dependency cycle + * described in objset_write()). + */ + ndatablks = size == 0 ? 0 : howmany(size, blksz); + nindblks = 0; + for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) { + nbppindir *= BLKPTR_PER_INDIR; + nindblks += howmany(ndatablks, indlevel * nbppindir); + } + assert(indlevel < INDIR_LEVELS); + + dnode->dn_nlevels = (uint8_t)indlevel; + dnode->dn_maxblkid = ndatablks > 0 ? 
ndatablks - 1 : 0; + dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT; + + c = ecalloc(1, sizeof(*c)); + if (nindblks > 0) { + c->indspace = nindblks * MAXBLOCKSIZE; + c->indloc = objset_space_alloc(zfs, os, &c->indspace); + } + c->dnode = dnode; + c->dataoff = 0; + c->datablksz = blksz; + + return (c); +} + +static void +_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels) +{ + blkptr_t *bp, *pbp; + void *buf; + uint64_t fill; + off_t blkid, blksz, loc; + + assert(levels > 0); + assert(levels <= c->dnode->dn_nlevels - 1); + + blksz = MAXBLOCKSIZE; + blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR; + for (int level = 1; level <= levels; level++) { + buf = c->inddir[level - 1]; + + if (level == c->dnode->dn_nlevels - 1) { + pbp = &c->dnode->dn_blkptr[0]; + } else { + uint64_t iblkid; + + iblkid = blkid & (BLKPTR_PER_INDIR - 1); + pbp = (blkptr_t *) + &c->inddir[level][iblkid * sizeof(blkptr_t)]; + } + + /* + * Space for indirect blocks is allocated up-front; see the + * comment in objset_write(). + */ + loc = c->indloc; + c->indloc += blksz; + assert(c->indspace >= blksz); + c->indspace -= blksz; + + bp = buf; + fill = 0; + for (size_t i = 0; i < BLKPTR_PER_INDIR; i++) + fill += BP_GET_FILL(&bp[i]); + + vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz, + loc, pbp); + memset(buf, 0, MAXBLOCKSIZE); + + blkid /= BLKPTR_PER_INDIR; + } +} + +blkptr_t * +dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off) +{ + off_t blkid, l1id; + int levels; + + if (c->dnode->dn_nlevels == 1) { + assert(off < MAXBLOCKSIZE); + return (&c->dnode->dn_blkptr[0]); + } + + assert(off % c->datablksz == 0); + + /* Do we need to flush any full indirect blocks? 
*/ + if (off > 0) { + blkid = off / c->datablksz; + for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) { + if (blkid % BLKPTR_PER_INDIR != 0) + break; + blkid /= BLKPTR_PER_INDIR; + } + if (levels > 0) + _dnode_cursor_flush(zfs, c, levels); + } + + c->dataoff = off; + l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1); + return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]); +} + +void +dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c) +{ + int levels; + + levels = c->dnode->dn_nlevels - 1; + if (levels > 0) + _dnode_cursor_flush(zfs, c, levels); + assert(c->indspace == 0); + free(c); +} + +void +zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts) +{ + zfs_opt_t *zfs; + int dirfd; + + zfs = fsopts->fs_specific; + + /* + * Use a fixed seed to provide reproducible pseudo-random numbers for + * on-disk structures when needed (e.g., GUIDs, ZAP hash salts). + */ + srandom(1729); + + zfs_check_opts(fsopts); + + if (!zfs->nowarn) { + fprintf(stderr, + "ZFS support is currently considered experimental. 
" + "Do not use it for anything critical.\n"); + } + + dirfd = open(dir, O_DIRECTORY | O_RDONLY); + if (dirfd < 0) + err(1, "open(%s)", dir); + + vdev_init(zfs, image); + pool_init(zfs); + fs_build(zfs, dirfd, root); + pool_fini(zfs); + vdev_fini(zfs); +} diff --git a/usr.sbin/makefs/zfs/Makefile.inc b/usr.sbin/makefs/zfs/Makefile.inc new file mode 100644 index 000000000000..bebe8c322035 --- /dev/null +++ b/usr.sbin/makefs/zfs/Makefile.inc @@ -0,0 +1,12 @@ +.PATH: ${SRCDIR}/zfs +.PATH: ${SRCTOP}/stand/libsa/zfs + +SRCS+= dsl.c \ + fs.c \ + objset.c \ + vdev.c \ + zap.c + +SRCS+= nvlist.c + +CFLAGS.nvlist.c+= -I${SRCTOP}/stand/libsa -Wno-cast-qual diff --git a/usr.sbin/makefs/zfs/dsl.c b/usr.sbin/makefs/zfs/dsl.c new file mode 100644 index 000000000000..5f473e557c02 --- /dev/null +++ b/usr.sbin/makefs/zfs/dsl.c @@ -0,0 +1,598 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <assert.h> +#include <string.h> + +#include <util.h> + +#include "makefs.h" +#include "zfs.h" + +typedef struct zfs_dsl_dataset { + zfs_objset_t *os; /* referenced objset, may be null */ + dsl_dataset_phys_t *phys; /* on-disk representation */ + uint64_t dsid; /* DSL dataset dnode */ + + struct zfs_dsl_dir *dir; /* containing parent */ +} zfs_dsl_dataset_t; + +typedef STAILQ_HEAD(zfs_dsl_dir_list, zfs_dsl_dir) zfs_dsl_dir_list_t; + +typedef struct zfs_dsl_dir { + char *fullname; /* full dataset name */ + char *name; /* basename(fullname) */ + dsl_dir_phys_t *phys; /* on-disk representation */ + nvlist_t *propsnv; /* properties saved in propszap */ + + zfs_dsl_dataset_t *headds; /* principal dataset, may be null */ + + uint64_t dirid; /* DSL directory dnode */ + zfs_zap_t *propszap; /* dataset properties */ + zfs_zap_t *childzap; /* child directories */ + + /* DSL directory tree linkage. 
*/ + struct zfs_dsl_dir *parent; + zfs_dsl_dir_list_t children; + STAILQ_ENTRY(zfs_dsl_dir) next; +} zfs_dsl_dir_t; + +static zfs_dsl_dir_t *dsl_dir_alloc(zfs_opt_t *zfs, const char *name); +static zfs_dsl_dataset_t *dsl_dataset_alloc(zfs_opt_t *zfs, zfs_dsl_dir_t *dir); + +static int +nvlist_find_string(nvlist_t *nvl, const char *key, char **retp) +{ + char *str; + int error, len; + + error = nvlist_find(nvl, key, DATA_TYPE_STRING, NULL, &str, &len); + if (error == 0) { + *retp = ecalloc(1, len + 1); + memcpy(*retp, str, len); + } + return (error); +} + +static int +nvlist_find_uint64(nvlist_t *nvl, const char *key, uint64_t *retp) +{ + return (nvlist_find(nvl, key, DATA_TYPE_UINT64, NULL, retp, NULL)); +} + +/* + * Return an allocated string containing the head dataset's mountpoint, + * including the root path prefix. + * + * If the dataset has a mountpoint property, it is returned. Otherwise we have + * to follow ZFS' inheritance rules. + */ +char * +dsl_dir_get_mountpoint(zfs_opt_t *zfs, zfs_dsl_dir_t *dir) +{ + zfs_dsl_dir_t *pdir; + char *mountpoint, *origmountpoint; + + if (nvlist_find_string(dir->propsnv, "mountpoint", &mountpoint) == 0) { + if (strcmp(mountpoint, "none") == 0) + return (NULL); + + /* + * nvlist_find_string() does not make a copy. + */ + mountpoint = estrdup(mountpoint); + } else { + /* + * If we don't have a mountpoint, it's inherited from one of our + * ancestors. Walk up the hierarchy until we find it, building + * up our mountpoint along the way. The mountpoint property is + * always set for the root dataset. + */ + for (pdir = dir->parent, mountpoint = estrdup(dir->name);;) { + origmountpoint = mountpoint; + + if (nvlist_find_string(pdir->propsnv, "mountpoint", + &mountpoint) == 0) { + easprintf(&mountpoint, "%s%s%s", mountpoint, + mountpoint[strlen(mountpoint) - 1] == '/' ? 
+ "" : "/", origmountpoint); + free(origmountpoint); + break; + } + + easprintf(&mountpoint, "%s/%s", pdir->name, + origmountpoint); + free(origmountpoint); + pdir = pdir->parent; + } + } + assert(mountpoint[0] == '/'); + assert(strstr(mountpoint, zfs->rootpath) == mountpoint); + + return (mountpoint); +} + +int +dsl_dir_get_canmount(zfs_dsl_dir_t *dir, uint64_t *canmountp) +{ + return (nvlist_find_uint64(dir->propsnv, "canmount", canmountp)); +} + +/* + * Handle dataset properties that we know about; stash them into an nvlist to be + * written later to the properties ZAP object. + * + * If the set of properties we handle grows too much, we should probably explore + * using libzfs to manage them. + */ +static void +dsl_dir_set_prop(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, const char *key, + const char *val) +{ + nvlist_t *nvl; + + nvl = dir->propsnv; + if (val == NULL || val[0] == '\0') + errx(1, "missing value for property `%s'", key); + if (nvpair_find(nvl, key) != NULL) + errx(1, "property `%s' already set", key); + + if (strcmp(key, "mountpoint") == 0) { + if (strcmp(val, "none") != 0) { + if (val[0] != '/') + errx(1, "mountpoint `%s' is not absolute", val); + if (strcmp(val, zfs->rootpath) != 0 && + strcmp(zfs->rootpath, "/") != 0 && + (strstr(val, zfs->rootpath) != val || + val[strlen(zfs->rootpath)] != '/')) { + errx(1, "mountpoint `%s' is not prefixed by " + "the root path `%s'", val, zfs->rootpath); + } + } + nvlist_add_string(nvl, key, val); + } else if (strcmp(key, "atime") == 0 || strcmp(key, "exec") == 0 || + strcmp(key, "setuid") == 0) { + if (strcmp(val, "on") == 0) + nvlist_add_uint64(nvl, key, 1); + else if (strcmp(val, "off") == 0) + nvlist_add_uint64(nvl, key, 0); + else + errx(1, "invalid value `%s' for %s", val, key); + } else if (strcmp(key, "canmount") == 0) { + if (strcmp(val, "noauto") == 0) + nvlist_add_uint64(nvl, key, 2); + else if (strcmp(val, "on") == 0) + nvlist_add_uint64(nvl, key, 1); + else if (strcmp(val, "off") == 0) + 
nvlist_add_uint64(nvl, key, 0); + else + errx(1, "invalid value `%s' for %s", val, key); + } else { + errx(1, "unknown property `%s'", key); + } +} + +static zfs_dsl_dir_t * +dsl_metadir_alloc(zfs_opt_t *zfs, const char *name) +{ + zfs_dsl_dir_t *dir; + char *path; + + easprintf(&path, "%s/%s", zfs->poolname, name); + dir = dsl_dir_alloc(zfs, path); + free(path); + return (dir); +} + +static void +dsl_origindir_init(zfs_opt_t *zfs) +{ + dnode_phys_t *clones; + uint64_t clonesid; + + zfs->origindsldir = dsl_metadir_alloc(zfs, "$ORIGIN"); + zfs->originds = dsl_dataset_alloc(zfs, zfs->origindsldir); + zfs->snapds = dsl_dataset_alloc(zfs, zfs->origindsldir); + + clones = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_CLONES, &clonesid); + zfs->cloneszap = zap_alloc(zfs->mos, clones); + zfs->origindsldir->phys->dd_clones = clonesid; +} + +void +dsl_init(zfs_opt_t *zfs) +{ + zfs_dsl_dir_t *dir; + struct dataset_desc *d; + const char *dspropdelim; + + dspropdelim = ";"; + + zfs->rootdsldir = dsl_dir_alloc(zfs, NULL); + + nvlist_add_uint64(zfs->rootdsldir->propsnv, "compression", + ZIO_COMPRESS_OFF); + + zfs->rootds = dsl_dataset_alloc(zfs, zfs->rootdsldir); + zfs->rootdsldir->headds = zfs->rootds; + + zfs->mosdsldir = dsl_metadir_alloc(zfs, "$MOS"); + zfs->freedsldir = dsl_metadir_alloc(zfs, "$FREE"); + dsl_origindir_init(zfs); + + /* + * Go through the list of user-specified datasets and create DSL objects + * for them. + */ + STAILQ_FOREACH(d, &zfs->datasetdescs, next) { + char *dsname, *next, *params, *param, *nextparam; + + params = d->params; + dsname = strsep(¶ms, dspropdelim); + + if (strcmp(dsname, zfs->poolname) == 0) { + /* + * This is the root dataset; it's already created, so + * we're just setting options. + */ + dir = zfs->rootdsldir; + } else { + /* + * This dataset must be a child of the root dataset. 
+ */ + if (strstr(dsname, zfs->poolname) != dsname || + (next = strchr(dsname, '/')) == NULL || + (size_t)(next - dsname) != strlen(zfs->poolname)) { + errx(1, "dataset `%s' must be a child of `%s'", + dsname, zfs->poolname); + } + dir = dsl_dir_alloc(zfs, dsname); + dir->headds = dsl_dataset_alloc(zfs, dir); + } + + for (nextparam = param = params; nextparam != NULL;) { + char *key, *val; + + param = strsep(&nextparam, dspropdelim); + + key = val = param; + key = strsep(&val, "="); + dsl_dir_set_prop(zfs, dir, key, val); + } + } + + /* + * Set the root dataset's mount point if the user didn't override the + * default. + */ + if (nvpair_find(zfs->rootdsldir->propsnv, "mountpoint") == NULL) { + nvlist_add_string(zfs->rootdsldir->propsnv, "mountpoint", + zfs->rootpath); + } +} + +uint64_t +dsl_dir_id(zfs_dsl_dir_t *dir) +{ + return (dir->dirid); +} + +uint64_t +dsl_dir_dataset_id(zfs_dsl_dir_t *dir) +{ + return (dir->headds->dsid); +} + +static void +dsl_dir_foreach_post(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, + void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg) +{ + zfs_dsl_dir_t *cdsldir; + + STAILQ_FOREACH(cdsldir, &dsldir->children, next) { + dsl_dir_foreach_post(zfs, cdsldir, cb, arg); + } + cb(zfs, dsldir, arg); +} + +/* + * Used when the caller doesn't care about the order one way or another. + */ +void +dsl_dir_foreach(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, + void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg) +{ + dsl_dir_foreach_post(zfs, dsldir, cb, arg); +} + +const char * +dsl_dir_fullname(const zfs_dsl_dir_t *dir) +{ + return (dir->fullname); +} + +/* + * Create a DSL directory, which is effectively an entry in the ZFS namespace. + * We always create a root DSL directory, whose name is the pool's name, and + * several metadata directories. + * + * Each directory has two ZAP objects, one pointing to child directories, and + * one for properties (which are inherited by children unless overridden). 
+ * Directories typically reference a DSL dataset, the "head dataset", which + * points to an object set. + */ +static zfs_dsl_dir_t * +dsl_dir_alloc(zfs_opt_t *zfs, const char *name) +{ + zfs_dsl_dir_list_t l, *lp; + zfs_dsl_dir_t *dir, *parent; + dnode_phys_t *dnode; + char *dirname, *nextdir, *origname; + uint64_t childid, propsid; + + dir = ecalloc(1, sizeof(*dir)); + + dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DSL_DIR, + DMU_OT_DSL_DIR, sizeof(dsl_dir_phys_t), &dir->dirid); + dir->phys = (dsl_dir_phys_t *)DN_BONUS(dnode); + + dnode = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_PROPS, &propsid); + dir->propszap = zap_alloc(zfs->mos, dnode); + + dnode = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DIR_CHILD_MAP, + &childid); + dir->childzap = zap_alloc(zfs->mos, dnode); + + dir->propsnv = nvlist_create(NV_UNIQUE_NAME); + STAILQ_INIT(&dir->children); + + dir->phys->dd_child_dir_zapobj = childid; + dir->phys->dd_props_zapobj = propsid; + + if (name == NULL) { + /* + * This is the root DSL directory. + */ + dir->name = estrdup(zfs->poolname); + dir->fullname = estrdup(zfs->poolname); + dir->parent = NULL; + dir->phys->dd_parent_obj = 0; + + assert(zfs->rootdsldir == NULL); + zfs->rootdsldir = dir; + return (dir); + } + + /* + * Insert the new directory into the hierarchy. Currently this must be + * done in order, e.g., when creating pool/a/b, pool/a must already + * exist. 
+ */ + STAILQ_INIT(&l); + STAILQ_INSERT_HEAD(&l, zfs->rootdsldir, next); + origname = dirname = nextdir = estrdup(name); + for (lp = &l;; lp = &parent->children) { + dirname = strsep(&nextdir, "/"); + if (nextdir == NULL) + break; + + STAILQ_FOREACH(parent, lp, next) { + if (strcmp(parent->name, dirname) == 0) + break; + } + if (parent == NULL) { + errx(1, "no parent at `%s' for filesystem `%s'", + dirname, name); + } + } + + dir->fullname = estrdup(name); + dir->name = estrdup(dirname); + free(origname); + STAILQ_INSERT_TAIL(lp, dir, next); + zap_add_uint64(parent->childzap, dir->name, dir->dirid); + + dir->parent = parent; + dir->phys->dd_parent_obj = parent->dirid; + return (dir); +} + +void +dsl_dir_size_set(zfs_dsl_dir_t *dir, uint64_t bytes) +{ + dir->phys->dd_used_bytes = bytes; + dir->phys->dd_compressed_bytes = bytes; + dir->phys->dd_uncompressed_bytes = bytes; +} + +/* + * Convert dataset properties into entries in the DSL directory's properties + * ZAP. + */ +static void +dsl_dir_finalize_props(zfs_dsl_dir_t *dir) +{ + for (nvp_header_t *nvh = NULL; + (nvh = nvlist_next_nvpair(dir->propsnv, nvh)) != NULL;) { + nv_string_t *nvname; + nv_pair_data_t *nvdata; + const char *name; + + nvname = (nv_string_t *)(nvh + 1); + nvdata = (nv_pair_data_t *)(&nvname->nv_data[0] + + NV_ALIGN4(nvname->nv_size)); + + name = nvstring_get(nvname); + switch (nvdata->nv_type) { + case DATA_TYPE_UINT64: { + uint64_t val; + + memcpy(&val, &nvdata->nv_data[0], sizeof(uint64_t)); + zap_add_uint64(dir->propszap, name, val); + break; + } + case DATA_TYPE_STRING: { + nv_string_t *nvstr; + + nvstr = (nv_string_t *)&nvdata->nv_data[0]; + zap_add_string(dir->propszap, name, + nvstring_get(nvstr)); + break; + } + default: + assert(0); + } + } +} + +static void +dsl_dir_finalize(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, void *arg __unused) +{ + char key[32]; + zfs_dsl_dir_t *cdir; + dnode_phys_t *snapnames; + zfs_dsl_dataset_t *headds; + zfs_objset_t *os; + uint64_t bytes, snapnamesid; + + 
dsl_dir_finalize_props(dir); + zap_write(zfs, dir->propszap); + zap_write(zfs, dir->childzap); + + headds = dir->headds; + if (headds == NULL) + return; + os = headds->os; + if (os == NULL) + return; + + snapnames = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DS_SNAP_MAP, + &snapnamesid); + zap_write(zfs, zap_alloc(zfs->mos, snapnames)); + + dir->phys->dd_head_dataset_obj = headds->dsid; + dir->phys->dd_clone_parent_obj = zfs->snapds->dsid; + headds->phys->ds_prev_snap_obj = zfs->snapds->dsid; + headds->phys->ds_snapnames_zapobj = snapnamesid; + objset_root_blkptr_copy(os, &headds->phys->ds_bp); + + zfs->snapds->phys->ds_num_children++; + snprintf(key, sizeof(key), "%jx", (uintmax_t)headds->dsid); + zap_add_uint64(zfs->cloneszap, key, headds->dsid); + + bytes = objset_space(os); + headds->phys->ds_used_bytes = bytes; + headds->phys->ds_uncompressed_bytes = bytes; + headds->phys->ds_compressed_bytes = bytes; + + STAILQ_FOREACH(cdir, &dir->children, next) + bytes += cdir->phys->dd_used_bytes; + dsl_dir_size_set(dir, bytes); +} + +void +dsl_write(zfs_opt_t *zfs) +{ + zfs_zap_t *snapnameszap; + dnode_phys_t *snapnames; + uint64_t snapmapid; + + /* + * Perform accounting, starting from the leaves of the DSL directory + * tree. Accounting for $MOS is done later, once we've finished + * allocating space. 
+ */ + dsl_dir_foreach_post(zfs, zfs->rootdsldir, dsl_dir_finalize, NULL); + + snapnames = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DS_SNAP_MAP, + &snapmapid); + snapnameszap = zap_alloc(zfs->mos, snapnames); + zap_add_uint64(snapnameszap, "$ORIGIN", zfs->snapds->dsid); + zap_write(zfs, snapnameszap); + + zfs->origindsldir->phys->dd_head_dataset_obj = zfs->originds->dsid; + zfs->originds->phys->ds_prev_snap_obj = zfs->snapds->dsid; + zfs->originds->phys->ds_snapnames_zapobj = snapmapid; + + zfs->snapds->phys->ds_next_snap_obj = zfs->originds->dsid; + assert(zfs->snapds->phys->ds_num_children > 0); + zfs->snapds->phys->ds_num_children++; + + zap_write(zfs, zfs->cloneszap); + + /* XXX-MJ dirs and datasets are leaked */ +} + +void +dsl_dir_dataset_write(zfs_opt_t *zfs, zfs_objset_t *os, zfs_dsl_dir_t *dir) +{ + dir->headds->os = os; + objset_write(zfs, os); +} + +bool +dsl_dir_has_dataset(zfs_dsl_dir_t *dir) +{ + return (dir->headds != NULL); +} + +bool +dsl_dir_dataset_has_objset(zfs_dsl_dir_t *dir) +{ + return (dsl_dir_has_dataset(dir) && dir->headds->os != NULL); +} + +static zfs_dsl_dataset_t * +dsl_dataset_alloc(zfs_opt_t *zfs, zfs_dsl_dir_t *dir) +{ + zfs_dsl_dataset_t *ds; + dnode_phys_t *dnode; + uint64_t deadlistid; + + ds = ecalloc(1, sizeof(*ds)); + + dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DSL_DATASET, + DMU_OT_DSL_DATASET, sizeof(dsl_dataset_phys_t), &ds->dsid); + ds->phys = (dsl_dataset_phys_t *)DN_BONUS(dnode); + + dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DEADLIST, + DMU_OT_DEADLIST_HDR, sizeof(dsl_deadlist_phys_t), &deadlistid); + zap_write(zfs, zap_alloc(zfs->mos, dnode)); + + ds->phys->ds_dir_obj = dir->dirid; + ds->phys->ds_deadlist_obj = deadlistid; + ds->phys->ds_creation_txg = TXG - 1; + if (ds != zfs->snapds) + ds->phys->ds_prev_snap_txg = TXG - 1; + ds->phys->ds_guid = ((uint64_t)random() << 32) | random(); + ds->dir = dir; + + return (ds); +} diff --git a/usr.sbin/makefs/zfs/fs.c b/usr.sbin/makefs/zfs/fs.c new file mode 
100644 index 000000000000..15025ec5447d --- /dev/null +++ b/usr.sbin/makefs/zfs/fs.c @@ -0,0 +1,981 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/dirent.h> +#include <sys/stat.h> + +#include <assert.h> +#include <fcntl.h> +#include <string.h> +#include <unistd.h> + +#include <util.h> + +#include "makefs.h" +#include "zfs.h" + +typedef struct { + const char *name; + unsigned int id; + uint16_t size; + sa_bswap_type_t bs; +} zfs_sattr_t; + +typedef struct zfs_fs { + zfs_objset_t *os; + + /* Offset table for system attributes, indexed by a zpl_attr_t. */ + uint16_t *saoffs; + size_t sacnt; + const zfs_sattr_t *satab; +} zfs_fs_t; + +/* + * The order of the attributes doesn't matter, this is simply the one hard-coded + * by OpenZFS, based on a zdb dump of the SA_REGISTRY table. + */ +typedef enum zpl_attr { + ZPL_ATIME, + ZPL_MTIME, + ZPL_CTIME, + ZPL_CRTIME, + ZPL_GEN, + ZPL_MODE, + ZPL_SIZE, + ZPL_PARENT, + ZPL_LINKS, + ZPL_XATTR, + ZPL_RDEV, + ZPL_FLAGS, + ZPL_UID, + ZPL_GID, + ZPL_PAD, + ZPL_ZNODE_ACL, + ZPL_DACL_COUNT, + ZPL_SYMLINK, + ZPL_SCANSTAMP, + ZPL_DACL_ACES, + ZPL_DXATTR, + ZPL_PROJID, +} zpl_attr_t; + +/* + * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t. 
+ */ +static const zfs_sattr_t zpl_attrs[] = { +#define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b } + _ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY), + _ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY), + _ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL), + _ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY), + _ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY), +#undef ZPL_ATTR +}; + +/* + * This layout matches that of a filesystem created using OpenZFS on FreeBSD. + * It need not match in general, but FreeBSD's loader doesn't bother parsing the + * layout and just hard-codes attribute offsets. + */ +static const sa_attr_type_t zpl_attr_layout[] = { + ZPL_MODE, + ZPL_SIZE, + ZPL_GEN, + ZPL_UID, + ZPL_GID, + ZPL_PARENT, + ZPL_FLAGS, + ZPL_ATIME, + ZPL_MTIME, + ZPL_CTIME, + ZPL_CRTIME, + ZPL_LINKS, + ZPL_DACL_COUNT, + ZPL_DACL_ACES, + ZPL_SYMLINK, +}; + +/* + * Keys for the ZPL attribute tables in the SA layout ZAP. 
The first two + * indices are reserved for legacy attribute encoding. + */ +#define SA_LAYOUT_INDEX_DEFAULT 2 +#define SA_LAYOUT_INDEX_SYMLINK 3 + +struct fs_populate_dir { + SLIST_ENTRY(fs_populate_dir) next; + int dirfd; + uint64_t objid; + zfs_zap_t *zap; +}; + +struct fs_populate_arg { + zfs_opt_t *zfs; + zfs_fs_t *fs; /* owning filesystem */ + int dirfd; /* current directory fd */ + uint64_t rootdirid; /* root directory dnode ID */ + SLIST_HEAD(, fs_populate_dir) dirs; /* stack of directories */ +}; + +static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int); + +static bool +fsnode_isroot(const fsnode *cur) +{ + return (strcmp(cur->name, ".") == 0); +} + +/* + * Visit each node in a directory hierarchy, in pre-order depth-first order. + */ +static void +fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg) +{ + assert(root->type == S_IFDIR); + + for (fsnode *cur = root; cur != NULL; cur = cur->next) { + assert(cur->type == S_IFREG || cur->type == S_IFDIR || + cur->type == S_IFLNK); + + if (cb(cur, arg) == 0) + continue; + if (cur->type == S_IFDIR && cur->child != NULL) + fsnode_foreach(cur->child, cb, arg); + } +} + +static void +fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid) +{ + struct fs_populate_dir *dir; + uint64_t type; + + switch (cur->type) { + case S_IFREG: + type = DT_REG; + break; + case S_IFDIR: + type = DT_DIR; + break; + case S_IFLNK: + type = DT_LNK; + break; + default: + assert(0); + } + + dir = SLIST_FIRST(&arg->dirs); + zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid)); +} + +static void +fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind, + size_t *szp) +{ + assert(ind < fs->sacnt); + assert(fs->saoffs[ind] != 0xffff); + + memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size); + *szp += fs->satab[ind].size; +} + +static void +fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val, + size_t valsz, size_t varoff, uint16_t ind, 
size_t *szp) +{ + assert(ind < fs->sacnt); + assert(fs->saoffs[ind] != 0xffff); + assert(fs->satab[ind].size == 0); + + memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz); + *szp += valsz; +} + +static void +fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur, + dnode_phys_t *dnode) +{ + char target[PATH_MAX]; + zfs_fs_t *fs; + zfs_ace_hdr_t aces[3]; + struct stat *sb; + sa_hdr_phys_t *sahdr; + uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid; + char *attrbuf; + size_t bonussz, hdrsz; + int layout; + + assert(dnode->dn_bonustype == DMU_OT_SA); + assert(dnode->dn_nblkptr == 1); + + fs = arg->fs; + sb = &cur->inode->st; + + switch (cur->type) { + case S_IFREG: + layout = SA_LAYOUT_INDEX_DEFAULT; + links = cur->inode->nlink; + objsize = sb->st_size; + parent = SLIST_FIRST(&arg->dirs)->objid; + break; + case S_IFDIR: + layout = SA_LAYOUT_INDEX_DEFAULT; + links = 1; /* .. */ + objsize = 1; /* .. */ + + /* + * The size of a ZPL directory is the number of entries + * (including "." and ".."), and the link count is the number of + * entries which are directories (including "." and ".."). + */ + for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child; + c != NULL; c = c->next) { + if (c->type == S_IFDIR) + links++; + objsize++; + } + + /* The root directory is its own parent. */ + parent = SLIST_EMPTY(&arg->dirs) ? 
+ arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid; + break; + case S_IFLNK: { + ssize_t n; + + if ((n = readlinkat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, + target, sizeof(target) - 1)) == -1) + err(1, "readlinkat(%s)", cur->name); + target[n] = '\0'; + + layout = SA_LAYOUT_INDEX_SYMLINK; + links = 1; + objsize = strlen(target); + parent = SLIST_FIRST(&arg->dirs)->objid; + break; + } + default: + assert(0); + } + + daclcount = nitems(aces); + flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_NO_EXECS_DENIED | + ZFS_ARCHIVE | ZFS_AV_MODIFIED; /* XXX-MJ */ + gen = 1; + gid = sb->st_gid; + mode = sb->st_mode; + uid = sb->st_uid; + + memset(aces, 0, sizeof(aces)); + aces[0].z_flags = ACE_OWNER; + aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER | + ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL | + ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; + if ((mode & S_IRUSR) != 0) + aces[0].z_access_mask |= ACE_READ_DATA; + if ((mode & S_IWUSR) != 0) + aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; + if ((mode & S_IXUSR) != 0) + aces[0].z_access_mask |= ACE_EXECUTE; + + aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP; + aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | + ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; + if ((mode & S_IRGRP) != 0) + aces[1].z_access_mask |= ACE_READ_DATA; + if ((mode & S_IWGRP) != 0) + aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; + if ((mode & S_IXGRP) != 0) + aces[1].z_access_mask |= ACE_EXECUTE; + + aces[2].z_flags = ACE_EVERYONE; + aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | + ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; + if ((mode & S_IROTH) != 0) + aces[2].z_access_mask |= ACE_READ_DATA; + if ((mode & S_IWOTH) != 0) + aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; + if ((mode & S_IXOTH) != 0) + 
aces[2].z_access_mask |= ACE_EXECUTE; + + switch (layout) { + case SA_LAYOUT_INDEX_DEFAULT: + /* At most one variable-length attribute. */ + hdrsz = sizeof(uint64_t); + break; + case SA_LAYOUT_INDEX_SYMLINK: + /* At most five variable-length attributes. */ + hdrsz = sizeof(uint64_t) * 2; + break; + default: + assert(0); + } + + sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode); + sahdr->sa_magic = SA_MAGIC; + SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz); + + bonussz = SA_HDR_SIZE(sahdr); + attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr); + + fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz); + fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz); + fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz); + fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz); + fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz); + fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz); + fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz); + fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz); + fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz); + + /* + * We deliberately set atime = mtime here to ensure that images are + * reproducible. + */ + assert(sizeof(sb->st_mtim) == fs->satab[ZPL_ATIME].size); + fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz); + assert(sizeof(sb->st_ctim) == fs->satab[ZPL_CTIME].size); + fs_populate_attr(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz); + assert(sizeof(sb->st_mtim) == fs->satab[ZPL_MTIME].size); + fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz); + assert(sizeof(sb->st_birthtim) == fs->satab[ZPL_CRTIME].size); + fs_populate_attr(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz); + + fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0, + ZPL_DACL_ACES, &bonussz); + sahdr->sa_lengths[0] = sizeof(aces); + + if (cur->type == S_IFLNK) { + assert(layout == SA_LAYOUT_INDEX_SYMLINK); + /* Need to use a spill block pointer if the target is long. 
*/ + assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN); + fs_populate_varszattr(fs, attrbuf, target, objsize, + sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz); + sahdr->sa_lengths[1] = (uint16_t)objsize; + } + + dnode->dn_bonuslen = bonussz; +} + +static void +fs_populate_file(fsnode *cur, struct fs_populate_arg *arg) +{ + struct dnode_cursor *c; + dnode_phys_t *dnode; + zfs_opt_t *zfs; + char *buf; + uint64_t dnid; + ssize_t n; + size_t bufsz; + off_t size, target; + int fd; + + assert(cur->type == S_IFREG); + assert((cur->inode->flags & FI_ROOT) == 0); + + zfs = arg->zfs; + + assert(cur->inode->ino != 0); + if ((cur->inode->flags & FI_ALLOCATED) != 0) { + /* + * This is a hard link of an existing file. + * + * XXX-MJ need to check whether it crosses datasets, add a test + * case for that + */ + fs_populate_dirent(arg, cur, cur->inode->ino); + return; + } + + dnode = objset_dnode_bonus_alloc(arg->fs->os, + DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); + cur->inode->ino = dnid; + cur->inode->flags |= FI_ALLOCATED; + + fd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, O_RDONLY); + if (fd == -1) + err(1, "openat(%s)", cur->name); + + buf = zfs->filebuf; + bufsz = sizeof(zfs->filebuf); + size = cur->inode->st.st_size; + c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0); + for (off_t foff = 0; foff < size; foff += target) { + off_t loc, sofar; + + /* + * Fill up our buffer, handling partial reads. + * + * It might be profitable to use copy_file_range(2) here. 
+ */ + sofar = 0; + target = MIN(size - foff, (off_t)bufsz); + do { + n = read(fd, buf + sofar, target); + if (n < 0) + err(1, "reading from '%s'", cur->name); + if (n == 0) + errx(1, "unexpected EOF reading '%s'", + cur->name); + sofar += n; + } while (sofar < target); + + if (target < (off_t)bufsz) + memset(buf + target, 0, bufsz - target); + + loc = objset_space_alloc(zfs, arg->fs->os, &target); + vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, target, loc, + dnode_cursor_next(zfs, c, foff)); + } + if (close(fd) != 0) + err(1, "close"); + dnode_cursor_finish(zfs, c); + + fs_populate_sattrs(arg, cur, dnode); + fs_populate_dirent(arg, cur, dnid); +} + +static void +fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg) +{ + dnode_phys_t *dnode; + zfs_objset_t *os; + uint64_t dnid; + int dirfd; + + assert(cur->type == S_IFDIR); + assert((cur->inode->flags & FI_ALLOCATED) == 0); + + os = arg->fs->os; + + dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS, + DMU_OT_SA, 0, &dnid); + + /* + * Add an entry to the parent directory and open this directory. + */ + if (!SLIST_EMPTY(&arg->dirs)) { + fs_populate_dirent(arg, cur, dnid); + dirfd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, + O_DIRECTORY); + if (dirfd < 0) + err(1, "open(%s)", cur->name); + } else { + arg->rootdirid = dnid; + dirfd = arg->dirfd; + } + + /* + * Set ZPL attributes. + */ + fs_populate_sattrs(arg, cur, dnode); + + /* + * If this is a root directory, then its children belong to a different + * dataset and this directory remains empty in the current objset. 
+ */ + if ((cur->inode->flags & FI_ROOT) == 0) { + struct fs_populate_dir *dir; + + dir = ecalloc(1, sizeof(*dir)); + dir->dirfd = dirfd; + dir->objid = dnid; + dir->zap = zap_alloc(os, dnode); + SLIST_INSERT_HEAD(&arg->dirs, dir, next); + } else { + zap_write(arg->zfs, zap_alloc(os, dnode)); + fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd); + } +} + +static void +fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg) +{ + dnode_phys_t *dnode; + uint64_t dnid; + + assert(cur->type == S_IFLNK); + assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0); + + dnode = objset_dnode_bonus_alloc(arg->fs->os, + DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); + + fs_populate_dirent(arg, cur, dnid); + + fs_populate_sattrs(arg, cur, dnode); +} + +static int +fs_foreach_populate(fsnode *cur, void *_arg) +{ + struct fs_populate_arg *arg; + struct fs_populate_dir *dir; + int ret; + + arg = _arg; + switch (cur->type) { + case S_IFREG: + fs_populate_file(cur, arg); + break; + case S_IFDIR: + if (fsnode_isroot(cur)) + break; + fs_populate_dir(cur, arg); + break; + case S_IFLNK: + fs_populate_symlink(cur, arg); + break; + default: + assert(0); + } + + ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1; + + if (cur->next == NULL && + (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) { + /* + * We reached a terminal node in a subtree. Walk back up and + * write out directories. We're done once we hit the root of a + * dataset or find a level where we're not on the edge of the + * tree. 
+ */ + do { + dir = SLIST_FIRST(&arg->dirs); + SLIST_REMOVE_HEAD(&arg->dirs, next); + zap_write(arg->zfs, dir->zap); + if (dir->dirfd != -1 && close(dir->dirfd) != 0) + err(1, "close"); + free(dir); + cur = cur->parent; + } while (cur != NULL && cur->next == NULL && + (cur->inode->flags & FI_ROOT) == 0); + } + + return (ret); +} + +static void +fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index, + const sa_attr_type_t layout[], size_t sacnt) +{ + char ti[16]; + + assert(sizeof(layout[0]) == 2); + + snprintf(ti, sizeof(ti), "%u", index); + zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt, + (const uint8_t *)layout); +} + +/* + * Initialize system attribute tables. + * + * There are two elements to this. First, we write the zpl_attrs[] and + * zpl_attr_layout[] tables to disk. Then we create a lookup table which + * allows us to set file attributes quickly. + */ +static uint64_t +fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs) +{ + zfs_zap_t *sazap, *salzap, *sarzap; + zfs_objset_t *os; + dnode_phys_t *saobj, *salobj, *sarobj; + uint64_t saobjid, salobjid, sarobjid; + uint16_t offset; + + os = fs->os; + + /* + * The on-disk tables are stored in two ZAP objects, the registry object + * and the layout object. Individual attributes are described by + * entries in the registry object; for example, the value for the + * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute. + * The attributes of a file are ordered according to one of the layouts + * defined in the layout object. The master node object is simply used + * to locate the registry and layout objects. 
+ */ + saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid); + salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid); + sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid); + + sarzap = zap_alloc(os, sarobj); + for (size_t i = 0; i < nitems(zpl_attrs); i++) { + const zfs_sattr_t *sa; + uint64_t attr; + + attr = 0; + sa = &zpl_attrs[i]; + SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs); + zap_add_uint64(sarzap, sa->name, attr); + } + zap_write(zfs, sarzap); + + /* + * Layouts are arrays of indices into the registry. We define two + * layouts for use by the ZPL, one for non-symlinks and one for + * symlinks. They are identical except that the symlink layout includes + * ZPL_SYMLINK as its final attribute. + */ + salzap = zap_alloc(os, salobj); + assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK); + fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT, + zpl_attr_layout, nitems(zpl_attr_layout) - 1); + fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK, + zpl_attr_layout, nitems(zpl_attr_layout)); + zap_write(zfs, salzap); + + sazap = zap_alloc(os, saobj); + zap_add_uint64(sazap, SA_LAYOUTS, salobjid); + zap_add_uint64(sazap, SA_REGISTRY, sarobjid); + zap_write(zfs, sazap); + + /* Sanity check. */ + for (size_t i = 0; i < nitems(zpl_attrs); i++) + assert(i == zpl_attrs[i].id); + + /* + * Build the offset table used when setting file attributes. File + * attributes are stored in the object's bonus buffer; this table + * provides the buffer offset of attributes referenced by the layout + * table. 
+ */ + fs->sacnt = nitems(zpl_attrs); + fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs)); + for (size_t i = 0; i < fs->sacnt; i++) + fs->saoffs[i] = 0xffff; + offset = 0; + for (size_t i = 0; i < nitems(zpl_attr_layout); i++) { + uint16_t size; + + assert(zpl_attr_layout[i] < fs->sacnt); + + fs->saoffs[zpl_attr_layout[i]] = offset; + size = zpl_attrs[zpl_attr_layout[i]].size; + offset += size; + } + fs->satab = zpl_attrs; + + return (saobjid); +} + +static void +fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg) +{ + char *mountpoint, *origmountpoint, *name, *next; + fsnode *cur, *root; + uint64_t canmount; + + if (!dsl_dir_has_dataset(dsldir)) + return; + + mountpoint = dsl_dir_get_mountpoint(zfs, dsldir); + if (mountpoint == NULL) + return; + if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0) + return; + + /* + * If we were asked to specify a bootfs, set it here. + */ + if (zfs->bootfs != NULL && strcmp(zfs->bootfs, + dsl_dir_fullname(dsldir)) == 0) { + zap_add_uint64(zfs->poolprops, "bootfs", + dsl_dir_dataset_id(dsldir)); + } + + origmountpoint = mountpoint; + + /* + * Figure out which fsnode corresponds to our mountpoint. + */ + root = arg; + cur = root; + if (strcmp(mountpoint, zfs->rootpath) != 0) { + mountpoint += strlen(zfs->rootpath); + + /* + * Look up the directory in the staged tree. For example, if + * the dataset's mount point is /foo/bar/baz, we'll search the + * root directory for "foo", search "foo" for "baz", and so on. + * Each intermediate name must refer to a directory; the final + * component need not exist. 
+ */ + cur = root; + for (next = name = mountpoint; next != NULL;) { + for (; *next == '/'; next++) + ; + name = strsep(&next, "/"); + + for (; cur != NULL && strcmp(cur->name, name) != 0; + cur = cur->next) + ; + if (cur == NULL) { + if (next == NULL) + break; + errx(1, "missing mountpoint directory for `%s'", + dsl_dir_fullname(dsldir)); + } + if (cur->type != S_IFDIR) { + errx(1, + "mountpoint for `%s' is not a directory", + dsl_dir_fullname(dsldir)); + } + if (next != NULL) + cur = cur->child; + } + } + + if (cur != NULL) { + assert(cur->type == S_IFDIR); + + /* + * Multiple datasets shouldn't share a mountpoint. It's + * technically allowed, but it's not clear what makefs should do + * in that case. + */ + assert((cur->inode->flags & FI_ROOT) == 0); + if (cur != root) + cur->inode->flags |= FI_ROOT; + assert(cur->inode->param == NULL); + cur->inode->param = dsldir; + } + + free(origmountpoint); +} + +static int +fs_foreach_mark(fsnode *cur, void *arg) +{ + uint64_t *countp; + + countp = arg; + if (cur->type == S_IFDIR && fsnode_isroot(cur)) + return (1); + + if (cur->inode->ino == 0) { + cur->inode->ino = ++(*countp); + cur->inode->nlink = 1; + } else { + cur->inode->nlink++; + } + + return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1); +} + +/* + * Create a filesystem dataset. More specifically: + * - create an object set for the dataset, + * - add required metadata (SA tables, property definitions, etc.) to that + * object set, + * - optionally populate the object set with file objects, using "root" as the + * root directory. + * + * "dirfd" is a directory descriptor for the directory referenced by "root". It + * is closed before returning. 
+ */ +static void +fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd) +{ + struct fs_populate_arg arg; + zfs_fs_t fs; + zfs_zap_t *masterzap; + zfs_objset_t *os; + dnode_phys_t *deleteq, *masterobj; + uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid; + bool fakedroot; + + /* + * This dataset's mountpoint doesn't exist in the staging tree, or the + * dataset doesn't have a mountpoint at all. In either case we still + * need a root directory. Fake up a root fsnode to handle this case. + */ + fakedroot = root == NULL; + if (fakedroot) { + struct stat *stp; + + assert(dirfd == -1); + + root = ecalloc(1, sizeof(*root)); + root->inode = ecalloc(1, sizeof(*root->inode)); + root->name = estrdup("."); + root->type = S_IFDIR; + + stp = &root->inode->st; + stp->st_uid = 0; + stp->st_gid = 0; + stp->st_mode = S_IFDIR | 0755; + } + assert(root->type == S_IFDIR); + assert(fsnode_isroot(root)); + + /* + * Initialize the object set for this dataset. + */ + os = objset_alloc(zfs, DMU_OST_ZFS); + masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid); + assert(moid == MASTER_NODE_OBJ); + + memset(&fs, 0, sizeof(fs)); + fs.os = os; + + /* + * Create the ZAP SA layout now since filesystem object dnodes will + * refer to those attributes. + */ + saobjid = fs_set_zpl_attrs(zfs, &fs); + + /* + * Make a pass over the staged directory to detect hard links and assign + * virtual dnode numbers. + */ + dnodecount = 1; /* root directory */ + fsnode_foreach(root, fs_foreach_mark, &dnodecount); + + /* + * Make a second pass to populate the dataset with files from the + * staged directory. Most of our runtime is spent here. + */ + arg.dirfd = dirfd; + arg.zfs = zfs; + arg.fs = &fs; + SLIST_INIT(&arg.dirs); + fs_populate_dir(root, &arg); + assert(!SLIST_EMPTY(&arg.dirs)); + fsnode_foreach(root, fs_foreach_populate, &arg); + assert(SLIST_EMPTY(&arg.dirs)); + rootdirid = arg.rootdirid; + + /* + * Create an empty delete queue. 
We don't do anything with it, but + * OpenZFS will refuse to mount filesystems that don't have one. + */ + deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid); + zap_write(zfs, zap_alloc(os, deleteq)); + + /* + * Populate and write the master node object. This is a ZAP object + * containing various dataset properties and the object IDs of the root + * directory and delete queue. + */ + masterzap = zap_alloc(os, masterobj); + zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid); + zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid); + zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid); + zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */); + zap_add_uint64(masterzap, "normalization", 0 /* off */); + zap_add_uint64(masterzap, "utf8only", 0 /* off */); + zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */); + zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */); + zap_write(zfs, masterzap); + + /* + * All finished with this object set, we may as well write it now. + * The DSL layer will sum up the bytes consumed by each dataset using + * information stored in the object set, so it can't be freed just yet. + */ + dsl_dir_dataset_write(zfs, os, dsldir); + + if (fakedroot) { + free(root->inode); + free(root->name); + free(root); + } + free(fs.saoffs); +} + +/* + * Create an object set for each DSL directory which has a dataset and doesn't + * already have an object set. + */ +static void +fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused) +{ + if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir)) + fs_build_one(zfs, dsldir, NULL, -1); +} + +/* + * Create our datasets and populate them with files. + */ +void +fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root) +{ + /* + * Run through our datasets and find the root fsnode for each one. Each + * root fsnode is flagged so that we can figure out which dataset it + * belongs to. 
+ */ + dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root); + + /* + * Did we find our boot filesystem? + */ + if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs")) + errx(1, "no mounted dataset matches bootfs property `%s'", + zfs->bootfs); + + /* + * Traverse the file hierarchy starting from the root fsnode. One + * dataset, not necessarily the root dataset, must "own" the root + * directory by having its mountpoint be equal to the root path. + * + * As roots of other datasets are encountered during the traversal, + * fs_build_one() recursively creates the corresponding object sets and + * populates them. Once this function has returned, all datasets will + * have been fully populated. + */ + fs_build_one(zfs, root->inode->param, root, dirfd); + + /* + * Now create object sets for datasets whose mountpoints weren't found + * in the staging directory, either because there is no mountpoint, or + * because the mountpoint doesn't correspond to an existing directory. + */ + dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL); +} diff --git a/usr.sbin/makefs/zfs/objset.c b/usr.sbin/makefs/zfs/objset.c new file mode 100644 index 000000000000..fdb17167a607 --- /dev/null +++ b/usr.sbin/makefs/zfs/objset.c @@ -0,0 +1,259 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <assert.h> +#include <string.h> + +#include <util.h> + +#include "zfs.h" + +#define DNODES_PER_CHUNK (MAXBLOCKSIZE / sizeof(dnode_phys_t)) + +struct objset_dnode_chunk { + dnode_phys_t buf[DNODES_PER_CHUNK]; + unsigned int nextfree; + STAILQ_ENTRY(objset_dnode_chunk) next; +}; + +typedef struct zfs_objset { + /* Physical object set. */ + objset_phys_t *phys; + off_t osloc; + off_t osblksz; + blkptr_t osbp; /* set in objset_write() */ + + /* Accounting. */ + off_t space; /* bytes allocated to this objset */ + + /* dnode allocator. 
*/ + uint64_t dnodecount; + STAILQ_HEAD(, objset_dnode_chunk) dnodechunks; +} zfs_objset_t; + +static void +dnode_init(dnode_phys_t *dnode, uint8_t type, uint8_t bonustype, + uint16_t bonuslen) +{ + dnode->dn_indblkshift = MAXBLOCKSHIFT; + dnode->dn_type = type; + dnode->dn_bonustype = bonustype; + dnode->dn_bonuslen = bonuslen; + dnode->dn_checksum = ZIO_CHECKSUM_FLETCHER_4; + dnode->dn_nlevels = 1; + dnode->dn_nblkptr = 1; + dnode->dn_flags = DNODE_FLAG_USED_BYTES; +} + +zfs_objset_t * +objset_alloc(zfs_opt_t *zfs, uint64_t type) +{ + struct objset_dnode_chunk *chunk; + zfs_objset_t *os; + + os = ecalloc(1, sizeof(*os)); + os->osblksz = sizeof(objset_phys_t); + os->osloc = objset_space_alloc(zfs, os, &os->osblksz); + + /* + * Object ID zero is always reserved for the meta dnode, which is + * embedded in the objset itself. + */ + STAILQ_INIT(&os->dnodechunks); + chunk = ecalloc(1, sizeof(*chunk)); + chunk->nextfree = 1; + STAILQ_INSERT_HEAD(&os->dnodechunks, chunk, next); + os->dnodecount = 1; + + os->phys = ecalloc(1, os->osblksz); + os->phys->os_type = type; + + dnode_init(&os->phys->os_meta_dnode, DMU_OT_DNODE, DMU_OT_NONE, 0); + os->phys->os_meta_dnode.dn_datablkszsec = + DNODE_BLOCK_SIZE >> MINBLOCKSHIFT; + + return (os); +} + +/* + * Write the dnode array and physical object set to disk. + */ +static void +_objset_write(zfs_opt_t *zfs, zfs_objset_t *os, struct dnode_cursor *c, + off_t loc) +{ + struct objset_dnode_chunk *chunk, *tmp; + unsigned int total; + + /* + * Write out the dnode array, i.e., the meta-dnode. For some reason its + * data blocks must be 16KB in size no matter how large the array is. 
+ */ + total = 0; + STAILQ_FOREACH_SAFE(chunk, &os->dnodechunks, next, tmp) { + unsigned int i; + + assert(chunk->nextfree <= os->dnodecount); + assert(chunk->nextfree <= DNODES_PER_CHUNK); + + for (i = 0; i < chunk->nextfree; i += DNODES_PER_BLOCK) { + blkptr_t *bp; + uint64_t fill; + + if (chunk->nextfree - i < DNODES_PER_BLOCK) + fill = DNODES_PER_BLOCK - (chunk->nextfree - i); + else + fill = 0; + bp = dnode_cursor_next(zfs, c, + (total + i) * sizeof(dnode_phys_t)); + vdev_pwrite_dnode_indir(zfs, &os->phys->os_meta_dnode, + 0, fill, chunk->buf + i, DNODE_BLOCK_SIZE, loc, bp); + loc += DNODE_BLOCK_SIZE; + } + total += i; + + free(chunk); + } + dnode_cursor_finish(zfs, c); + STAILQ_INIT(&os->dnodechunks); + + /* + * Write the object set itself. The saved block pointer will be copied + * into the referencing DSL dataset or the uberblocks. + */ + vdev_pwrite_data(zfs, DMU_OT_OBJSET, ZIO_CHECKSUM_FLETCHER_4, 0, 1, + os->phys, os->osblksz, os->osloc, &os->osbp); +} + +void +objset_write(zfs_opt_t *zfs, zfs_objset_t *os) +{ + struct dnode_cursor *c; + off_t dnodeloc, dnodesz; + uint64_t dnodecount; + + /* + * There is a chicken-and-egg problem here when writing the MOS: we + * cannot write space maps before we're finished allocating space from + * the vdev, and we can't write the MOS without having allocated space + * for indirect dnode blocks. Thus, rather than lazily allocating + * indirect blocks for the meta-dnode (which would be simpler), they are + * allocated up-front and before writing space maps. + */ + dnodecount = os->dnodecount; + if (os == zfs->mos) + dnodecount += zfs->mscount; + dnodesz = dnodecount * sizeof(dnode_phys_t); + c = dnode_cursor_init(zfs, os, &os->phys->os_meta_dnode, dnodesz, + DNODE_BLOCK_SIZE); + dnodesz = roundup2(dnodesz, DNODE_BLOCK_SIZE); + dnodeloc = objset_space_alloc(zfs, os, &dnodesz); + + if (os == zfs->mos) { + vdev_spacemap_write(zfs); + + /* + * We've finished allocating space, account for it in $MOS. 
+ */ + dsl_dir_size_set(zfs->mosdsldir, os->space); + } + _objset_write(zfs, os, c, dnodeloc); +} + +dnode_phys_t * +objset_dnode_bonus_alloc(zfs_objset_t *os, uint8_t type, uint8_t bonustype, + uint16_t bonuslen, uint64_t *idp) +{ + struct objset_dnode_chunk *chunk; + dnode_phys_t *dnode; + + assert(bonuslen <= DN_OLD_MAX_BONUSLEN); + assert(!STAILQ_EMPTY(&os->dnodechunks)); + + chunk = STAILQ_LAST(&os->dnodechunks, objset_dnode_chunk, next); + if (chunk->nextfree == DNODES_PER_CHUNK) { + chunk = ecalloc(1, sizeof(*chunk)); + STAILQ_INSERT_TAIL(&os->dnodechunks, chunk, next); + } + *idp = os->dnodecount++; + dnode = &chunk->buf[chunk->nextfree++]; + dnode_init(dnode, type, bonustype, bonuslen); + dnode->dn_datablkszsec = os->osblksz >> MINBLOCKSHIFT; + return (dnode); +} + +dnode_phys_t * +objset_dnode_alloc(zfs_objset_t *os, uint8_t type, uint64_t *idp) +{ + return (objset_dnode_bonus_alloc(os, type, DMU_OT_NONE, 0, idp)); +} + +/* + * Look up a physical dnode by ID. This is not used often so a linear search is + * fine. 
+ */ +dnode_phys_t * +objset_dnode_lookup(zfs_objset_t *os, uint64_t id) +{ + struct objset_dnode_chunk *chunk; + + assert(id > 0); + assert(id < os->dnodecount); + + STAILQ_FOREACH(chunk, &os->dnodechunks, next) { + if (id < DNODES_PER_CHUNK) + return (&chunk->buf[id]); + id -= DNODES_PER_CHUNK; + } + assert(0); + return (NULL); +} + +off_t +objset_space_alloc(zfs_opt_t *zfs, zfs_objset_t *os, off_t *lenp) +{ + off_t loc; + + loc = vdev_space_alloc(zfs, lenp); + os->space += *lenp; + return (loc); +} + +uint64_t +objset_space(const zfs_objset_t *os) +{ + return (os->space); +} + +void +objset_root_blkptr_copy(const zfs_objset_t *os, blkptr_t *bp) +{ + memcpy(bp, &os->osbp, sizeof(blkptr_t)); +} diff --git a/usr.sbin/makefs/zfs/vdev.c b/usr.sbin/makefs/zfs/vdev.c new file mode 100644 index 000000000000..1709a828b7c5 --- /dev/null +++ b/usr.sbin/makefs/zfs/vdev.c @@ -0,0 +1,435 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <assert.h> +#include <fcntl.h> +#include <string.h> +#include <unistd.h> + +#include <util.h> + +#include "zfs.h" + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#include "zfs/fletcher.c" +#include "zfs/sha256.c" +#pragma clang diagnostic pop + +static void +blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level, + uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum) +{ + dva_t *dva; + + assert(powerof2(size)); + + BP_ZERO(bp); + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + BP_SET_CHECKSUM(bp, cksumt); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + BP_SET_BIRTH(bp, TXG, TXG); + BP_SET_LEVEL(bp, level); + BP_SET_FILL(bp, fill); + BP_SET_TYPE(bp, dntype); + + dva = BP_IDENTITY(bp); + DVA_SET_VDEV(dva, 0); + DVA_SET_OFFSET(dva, off); + DVA_SET_ASIZE(dva, size); + memcpy(&bp->blk_cksum, cksum, sizeof(*cksum)); +} + +/* + * Write a block of data to the vdev. The offset is always relative to the end + * of the second leading vdev label. + * + * Consumers should generally use the helpers below, which provide block + * pointers and update dnode accounting, rather than calling this function + * directly. 
+ */ +static void +vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off) +{ + ssize_t n; + + assert(off >= 0 && off < zfs->asize); + assert(powerof2(len)); + assert((off_t)len > 0 && off + (off_t)len > off && + off + (off_t)len < zfs->asize); + if (zfs->spacemap != NULL) { + /* + * Verify that the blocks being written were in fact allocated. + * + * The space map isn't available once the on-disk space map is + * finalized, so this check doesn't quite catch everything. + */ + assert(bit_ntest(zfs->spacemap, off >> zfs->ashift, + (off + len - 1) >> zfs->ashift, 1)); + } + + off += VDEV_LABEL_START_SIZE; + for (size_t sofar = 0; sofar < len; sofar += n) { + n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar, + off + sofar); + if (n < 0) + err(1, "pwrite"); + assert(n > 0); + } +} + +void +vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype, + uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc, + blkptr_t *bp) +{ + zio_cksum_t cksum; + + assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4); + + fletcher_4_native(data, sz, NULL, &cksum); + blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum); + vdev_pwrite(zfs, data, sz, loc); +} + +void +vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level, + uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp) +{ + vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill, + data, sz, loc, bp); + + assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0); + dnode->dn_used += sz; +} + +void +vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data, + off_t sz, off_t loc) +{ + vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc, + &dnode->dn_blkptr[0]); +} + +static void +vdev_label_set_checksum(void *buf, off_t off, off_t size) +{ + zio_cksum_t cksum; + zio_eck_t *eck; + + assert(size > 0 && (size_t)size >= sizeof(zio_eck_t)); + + eck = (zio_eck_t *)((char *)buf + size) - 1; + eck->zec_magic = 
ZEC_MAGIC; + ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0); + zio_checksum_SHA256(buf, size, NULL, &cksum); + eck->zec_cksum = cksum; +} + +/* + * Set embedded checksums and write the label at the specified index. + */ +void +vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp) +{ + vdev_label_t *label; + ssize_t n; + off_t blksz, loff; + + assert(ind >= 0 && ind < VDEV_LABELS); + + /* + * Make a copy since we have to modify the label to set checksums. + */ + label = ecalloc(1, sizeof(*label)); + memcpy(label, labelp, sizeof(*label)); + + if (ind < 2) + loff = ind * sizeof(*label); + else + loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label); + + /* + * Set the verifier checksum for the boot block. We don't use it, but + * the FreeBSD loader reads it and will complain if the checksum isn't + * valid. + */ + vdev_label_set_checksum(&label->vl_be, + loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be)); + + /* + * Set the verifier checksum for the label. + */ + vdev_label_set_checksum(&label->vl_vdev_phys, + loff + __offsetof(vdev_label_t, vl_vdev_phys), + sizeof(label->vl_vdev_phys)); + + /* + * Set the verifier checksum for the uberblocks. There is one uberblock + * per sector; for example, with an ashift of 12 we end up with + * 128KB/4KB=32 copies of the uberblock in the ring. + */ + blksz = 1 << zfs->ashift; + assert(sizeof(label->vl_uberblock) % blksz == 0); + for (size_t roff = 0; roff < sizeof(label->vl_uberblock); + roff += blksz) { + vdev_label_set_checksum(&label->vl_uberblock[0] + roff, + loff + __offsetof(vdev_label_t, vl_uberblock) + roff, + blksz); + } + + n = pwrite(zfs->fd, label, sizeof(*label), loff); + if (n < 0) + err(1, "writing vdev label"); + assert(n == sizeof(*label)); + + free(label); +} + +/* + * Find a chunk of contiguous free space of length *lenp, according to the + * following rules: + * 1. 
If the length is less than or equal to 128KB, the returned run's length
 *    will be the smallest power of 2 equal to or larger than the length.
 * 2. If the length is larger than 128KB, the returned run's length will be
 *    the smallest multiple of 128KB that is equal to or larger than the
 *    length.
 * 3. The returned run's length will be size-aligned up to 128KB.
 *
 * The (possibly rounded-up) length is written back to *lenp and the byte offset
 * of the run is returned.  The program exits if no suitable run is found.
 *
 * XXX-MJ the third rule isn't actually required, so this can just be a dumb
 * bump allocator.  Maybe there's some benefit to keeping large blocks aligned,
 * so let's keep it for now and hope we don't get too much fragmentation.
 * Alternately we could try to allocate all blocks of a certain size from the
 * same metaslab.
 */
off_t
vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp)
{
	off_t len;
	int align, loc, minblksz, nbits;

	minblksz = 1 << zfs->ashift;
	len = roundup2(*lenp, minblksz);

	assert(len != 0);
	assert(len / minblksz <= INT_MAX);

	if (len < MAXBLOCKSIZE) {
		/* Round up to a power of 2 if the length isn't one already. */
		if ((len & (len - 1)) != 0)
			len = (off_t)1 << flsll(len);
		align = len / minblksz;
	} else {
		len = roundup2(len, MAXBLOCKSIZE);
		align = MAXBLOCKSIZE / minblksz;
	}

	/*
	 * Search for a free run of nbits bits.  If the run found isn't suitably
	 * aligned, resume the search from the next aligned bit.
	 */
	for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) {
		bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits,
		    &loc);
		if (loc == -1) {
			errx(1, "failed to find %ju bytes of space",
			    (uintmax_t)len);
		}
		if ((loc & (align - 1)) == 0)
			break;
	}
	assert(loc + nbits > loc);
	/* Mark the run as allocated. */
	bit_nset(zfs->spacemap, loc, loc + nbits - 1);
	*lenp = len;

	return ((off_t)loc << zfs->ashift);
}

/*
 * Allocate the in-memory allocation bitmap, with one bit per ashift-sized block
 * of the allocatable region, rounded down to a whole number of metaslabs.
 */
static void
vdev_spacemap_init(zfs_opt_t *zfs)
{
	uint64_t nbits;

	assert(powerof2(zfs->mssize));

	nbits = rounddown2(zfs->asize, zfs->mssize) >> zfs->ashift;
	if (nbits > INT_MAX) {
		/*
		 * The bitmap is indexed by an int, so with the smallest block
		 * size of 512B, the limit on the image size is about 1TB
		 * (INT_MAX blocks).  That should be enough for anyone.
		 */
		errx(1, "image size is too large");
	}
	zfs->spacemapbits = (int)nbits;
	zfs->spacemap = bit_alloc(zfs->spacemapbits);
	if (zfs->spacemap == NULL)
		err(1, "bitstring allocation failed");
}

/*
 * Convert the in-memory allocation bitmap into on-disk space maps, one per
 * metaslab, and write them along with the array of space map object IDs.  The
 * in-memory bitmap is detached from "zfs" once all space has been reserved, so
 * no further allocations are possible after this function is called.
 */
void
vdev_spacemap_write(zfs_opt_t *zfs)
{
	dnode_phys_t *objarr;
	bitstr_t *spacemap;
	uint64_t *objarrblk;
	off_t smblksz, objarrblksz, objarrloc;

	/* Per-metaslab bookkeeping: the space map's dnode, ID and location. */
	struct {
		dnode_phys_t *dnode;
		uint64_t dnid;
		off_t loc;
	} *sma;

	/* The object array holds one 64-bit object ID per metaslab. */
	objarrblksz = sizeof(uint64_t) * zfs->mscount;
	assert(objarrblksz <= MAXBLOCKSIZE);
	objarrloc = objset_space_alloc(zfs, zfs->mos, &objarrblksz);
	objarrblk = ecalloc(1, objarrblksz);

	objarr = objset_dnode_lookup(zfs->mos, zfs->objarrid);
	objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT;

	/*
	 * Use the smallest block size for space maps.  The space allocation
	 * algorithm should aim to minimize the number of holes.
	 */
	smblksz = 1 << zfs->ashift;

	/*
	 * First allocate dnodes and space for all of our space maps.  No more
	 * space can be allocated from the vdev after this point.
	 */
	sma = ecalloc(zfs->mscount, sizeof(*sma));
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		sma[i].dnode = objset_dnode_bonus_alloc(zfs->mos,
		    DMU_OT_SPACE_MAP, DMU_OT_SPACE_MAP_HEADER,
		    sizeof(space_map_phys_t), &sma[i].dnid);
		sma[i].loc = objset_space_alloc(zfs, zfs->mos, &smblksz);
	}
	spacemap = zfs->spacemap;
	zfs->spacemap = NULL;

	/*
	 * Now that the set of allocated space is finalized, populate each space
	 * map and write it to the vdev.
	 */
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		space_map_phys_t *sm;
		uint64_t alloc, length, *smblk;
		int shift, startb, endb, srunb, erunb;

		/*
		 * We only allocate a single block for this space map, but
		 * OpenZFS assumes that a space map object with sufficient bonus
		 * space supports histograms.
		 */
		sma[i].dnode->dn_nblkptr = 3;
		sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT;

		smblk = ecalloc(1, smblksz);

		alloc = length = 0;
		/* Number of bitmap bits per metaslab, as a power of 2. */
		shift = zfs->msshift - zfs->ashift;
		/* Walk the bits [startb, endb) belonging to metaslab i. */
		for (srunb = startb = i * (1 << shift),
		    endb = (i + 1) * (1 << shift);
		    srunb < endb; srunb = erunb) {
			uint64_t runlen, runoff;

			/* Find a run of allocated space. */
			bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb);
			if (srunb == -1 || srunb >= endb)
				break;

			bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb);
			if (erunb == -1 || erunb > endb)
				erunb = endb;

			/*
			 * The space represented by [srunb, erunb) has been
			 * allocated.  Add a record to the space map to indicate
			 * this.  Run offsets are relative to the beginning of
			 * the metaslab.
			 */
			runlen = erunb - srunb;
			runoff = srunb - startb;

			/* Each record takes two 64-bit words. */
			assert(length * sizeof(uint64_t) < (uint64_t)smblksz);
			smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) |
			    SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0);
			smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) |
			    SM2_OFFSET_ENCODE(runoff);

			alloc += runlen << zfs->ashift;
			length += 2;
		}

		/* The space map header lives in the dnode's bonus buffer. */
		sm = DN_BONUS(sma[i].dnode);
		sm->smp_length = length * sizeof(uint64_t);
		sm->smp_alloc = alloc;

		vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz,
		    sma[i].loc);
		free(smblk);

		/* Record this space map in the space map object array. */
		objarrblk[i] = sma[i].dnid;
	}

	/*
	 * All of the space maps are written, now write the object array.
	 */
	vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc);
	free(objarrblk);

	assert(zfs->spacemap == NULL);
	free(spacemap);
	free(sma);
}

/*
 * Create (or truncate) the image file, extend it to the full vdev size and set
 * up the in-memory allocation bitmap.  Exits on failure.
 */
void
vdev_init(zfs_opt_t *zfs, const char *image)
{
	assert(zfs->ashift >= MINBLOCKSHIFT);

	zfs->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (zfs->fd == -1)
		err(1, "Can't open `%s' for writing", image);
	if (ftruncate(zfs->fd, zfs->vdevsize) != 0)
		err(1, "Failed to extend image file `%s'", image);

	vdev_spacemap_init(zfs);
}

/*
 * Close the image file.  The space map must already have been written out by
 * vdev_spacemap_write(), which clears zfs->spacemap.
 */
void
vdev_fini(zfs_opt_t *zfs)
{
	assert(zfs->spacemap == NULL);

	if (zfs->fd != -1) {
		if (close(zfs->fd) != 0)
			err(1, "close");
		zfs->fd = -1;
	}
}
diff --git a/usr.sbin/makefs/zfs/zap.c b/usr.sbin/makefs/zfs/zap.c
new file mode 100644
index 000000000000..398c0fbf029c
--- /dev/null
+++ b/usr.sbin/makefs/zfs/zap.c
@@ -0,0 +1,551 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/types.h> +#include <sys/endian.h> + +#include <assert.h> +#include <stddef.h> +#include <string.h> + +#include <util.h> + +#include "makefs.h" +#include "zfs.h" + +typedef struct zfs_zap_entry { + char *name; /* entry key, private copy */ + uint64_t hash; /* key hash */ + union { + uint8_t *valp; + uint16_t *val16p; + uint32_t *val32p; + uint64_t *val64p; + }; /* entry value, an integer array */ + uint64_t val64; /* embedded value for a common case */ + size_t intsz; /* array element size; 1, 2, 4 or 8 */ + size_t intcnt; /* array size */ + STAILQ_ENTRY(zfs_zap_entry) next; +} zfs_zap_entry_t; + +struct zfs_zap { + STAILQ_HEAD(, zfs_zap_entry) kvps; + uint64_t hashsalt; /* key hash input */ + unsigned long kvpcnt; /* number of key-value pairs */ + unsigned long chunks; /* count of chunks needed for fat ZAP */ + bool micro; /* can this be a micro ZAP? 
*/ + + dnode_phys_t *dnode; /* backpointer */ + zfs_objset_t *os; /* backpointer */ +}; + +static uint16_t +zap_entry_chunks(zfs_zap_entry_t *ent) +{ + return (1 + howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES) + + howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES)); +} + +static uint64_t +zap_hash(uint64_t salt, const char *name) +{ + static uint64_t crc64_table[256]; + const uint64_t crc64_poly = 0xC96C5795D7870F42UL; + const uint8_t *cp; + uint64_t crc; + uint8_t c; + + assert(salt != 0); + if (crc64_table[128] == 0) { + for (int i = 0; i < 256; i++) { + uint64_t *t; + + t = crc64_table + i; + *t = i; + for (int j = 8; j > 0; j--) + *t = (*t >> 1) ^ (-(*t & 1) & crc64_poly); + } + } + assert(crc64_table[128] == crc64_poly); + + for (cp = (const uint8_t *)name, crc = salt; (c = *cp) != '\0'; cp++) + crc = (crc >> 8) ^ crc64_table[(crc ^ c) & 0xFF]; + + /* + * Only use 28 bits, since we need 4 bits in the cookie for the + * collision differentiator. We MUST use the high bits, since + * those are the ones that we first pay attention to when + * choosing the bucket. 
+ */ + crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); + + return (crc); +} + +zfs_zap_t * +zap_alloc(zfs_objset_t *os, dnode_phys_t *dnode) +{ + zfs_zap_t *zap; + + zap = ecalloc(1, sizeof(*zap)); + STAILQ_INIT(&zap->kvps); + zap->hashsalt = ((uint64_t)random() << 32) | random(); + zap->micro = true; + zap->kvpcnt = 0; + zap->chunks = 0; + zap->dnode = dnode; + zap->os = os; + return (zap); +} + +void +zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt, + const uint8_t *val) +{ + zfs_zap_entry_t *ent; + + assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8); + assert(strlen(name) + 1 <= ZAP_MAXNAMELEN); + assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN); + + ent = ecalloc(1, sizeof(*ent)); + ent->name = estrdup(name); + ent->hash = zap_hash(zap->hashsalt, ent->name); + ent->intsz = intsz; + ent->intcnt = intcnt; + if (intsz == sizeof(uint64_t) && intcnt == 1) { + /* + * Micro-optimization to elide a memory allocation in that most + * common case where this is a directory entry. 
+ */ + ent->val64p = &ent->val64; + } else { + ent->valp = ecalloc(intcnt, intsz); + } + memcpy(ent->valp, val, intcnt * intsz); + zap->kvpcnt++; + zap->chunks += zap_entry_chunks(ent); + STAILQ_INSERT_TAIL(&zap->kvps, ent, next); + + if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) || + strlen(name) + 1 > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX)) + zap->micro = false; +} + +void +zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val) +{ + zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val); +} + +void +zap_add_string(zfs_zap_t *zap, const char *name, const char *val) +{ + zap_add(zap, name, 1, strlen(val) + 1, val); +} + +bool +zap_entry_exists(zfs_zap_t *zap, const char *name) +{ + zfs_zap_entry_t *ent; + + STAILQ_FOREACH(ent, &zap->kvps, next) { + if (strcmp(ent->name, name) == 0) + return (true); + } + return (false); +} + +static void +zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap) +{ + dnode_phys_t *dnode; + zfs_zap_entry_t *ent; + mzap_phys_t *mzap; + mzap_ent_phys_t *ment; + off_t bytes, loc; + + memset(zfs->filebuf, 0, sizeof(zfs->filebuf)); + mzap = (mzap_phys_t *)&zfs->filebuf[0]; + mzap->mz_block_type = ZBT_MICRO; + mzap->mz_salt = zap->hashsalt; + mzap->mz_normflags = 0; + + bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*ment); + assert(bytes <= (off_t)MZAP_MAX_BLKSZ); + + ment = &mzap->mz_chunk[0]; + STAILQ_FOREACH(ent, &zap->kvps, next) { + memcpy(&ment->mze_value, ent->valp, ent->intsz * ent->intcnt); + ment->mze_cd = 0; /* XXX-MJ */ + strlcpy(ment->mze_name, ent->name, sizeof(ment->mze_name)); + ment++; + } + + loc = objset_space_alloc(zfs, zap->os, &bytes); + + dnode = zap->dnode; + dnode->dn_maxblkid = 0; + dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT; + dnode->dn_flags = DNODE_FLAG_USED_BYTES; + + vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc); +} + +/* + * Write some data to the fat ZAP leaf chunk starting at index "li". 
+ * + * Note that individual integers in the value may be split among consecutive + * leaves. + */ +static void +zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz, + const uint8_t *val) +{ + struct zap_leaf_array *la; + + assert(sz <= ZAP_MAXVALUELEN); + + for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) { + n = MIN(resid, ZAP_LEAF_ARRAY_BYTES); + + la = &ZAP_LEAF_CHUNK(l, li).l_array; + assert(la->la_type == ZAP_CHUNK_FREE); + la->la_type = ZAP_CHUNK_ARRAY; + memcpy(la->la_array, val, n); + la->la_next = li + 1; + } + la->la_next = 0xffff; +} + +/* + * Find the shortest hash prefix length which lets us distribute keys without + * overflowing a leaf block. This is not (space) optimal, but is simple, and + * directories large enough to overflow a single 128KB leaf block are uncommon. + */ +static unsigned int +zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l) +{ + zfs_zap_entry_t *ent; + unsigned int prefixlen; + + if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) { + /* + * All chunks will fit in a single leaf block. + */ + return (0); + } + + for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) { + uint32_t *leafchunks; + + leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks)); + STAILQ_FOREACH(ent, &zap->kvps, next) { + uint64_t li; + uint16_t chunks; + + li = ZAP_HASH_IDX(ent->hash, prefixlen); + + chunks = zap_entry_chunks(ent); + if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) { + /* + * Not enough space, grow the prefix and retry. + */ + break; + } + leafchunks[li] += chunks; + } + free(leafchunks); + + if (ent == NULL) { + /* + * Everything fits, we're done. + */ + break; + } + } + + /* + * If this fails, then we need to expand the pointer table. For now + * this situation is unhandled since it is hard to trigger. + */ + assert(prefixlen < (unsigned int)l->l_bs); + + return (prefixlen); +} + +/* + * Initialize a fat ZAP leaf block. 
+ */ +static void +zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen) +{ + zap_leaf_phys_t *leaf; + + leaf = l->l_phys; + + leaf->l_hdr.lh_block_type = ZBT_LEAF; + leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC; + leaf->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); + leaf->l_hdr.lh_prefix = prefix; + leaf->l_hdr.lh_prefix_len = prefixlen; + + /* Initialize the leaf hash table. */ + assert(leaf->l_hdr.lh_nfree < 0xffff); + memset(leaf->l_hash, 0xff, + ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash)); + + /* Initialize the leaf chunks. */ + for (uint16_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + struct zap_leaf_free *lf; + + lf = &ZAP_LEAF_CHUNK(l, i).l_free; + lf->lf_type = ZAP_CHUNK_FREE; + if (i + 1 == ZAP_LEAF_NUMCHUNKS(l)) + lf->lf_next = 0xffff; + else + lf->lf_next = i + 1; + } +} + +static void +zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap) +{ + struct dnode_cursor *c; + zap_leaf_t l; + zap_phys_t *zaphdr; + struct zap_table_phys *zt; + zfs_zap_entry_t *ent; + dnode_phys_t *dnode; + uint8_t *leafblks; + uint64_t lblkcnt, *ptrhasht; + off_t loc, blksz; + size_t blkshift; + unsigned int prefixlen; + int ptrcnt; + + /* + * For simplicity, always use the largest block size. This should be ok + * since most directories will be micro ZAPs, but it's space inefficient + * for small ZAPs and might need to be revisited. + */ + blkshift = MAXBLOCKSHIFT; + blksz = (off_t)1 << blkshift; + + /* + * Embedded pointer tables give up to 8192 entries. This ought to be + * enough for anything except massive directories. 
+ */ + ptrcnt = (blksz / 2) / sizeof(uint64_t); + + memset(zfs->filebuf, 0, sizeof(zfs->filebuf)); + zaphdr = (zap_phys_t *)&zfs->filebuf[0]; + zaphdr->zap_block_type = ZBT_HEADER; + zaphdr->zap_magic = ZAP_MAGIC; + zaphdr->zap_num_entries = zap->kvpcnt; + zaphdr->zap_salt = zap->hashsalt; + + l.l_bs = blkshift; + l.l_phys = NULL; + + zt = &zaphdr->zap_ptrtbl; + zt->zt_blk = 0; + zt->zt_numblks = 0; + zt->zt_shift = flsll(ptrcnt) - 1; + zt->zt_nextblk = 0; + zt->zt_blks_copied = 0; + + /* + * How many leaf blocks do we need? Initialize them and update the + * header. + */ + prefixlen = zap_fat_write_prefixlen(zap, &l); + lblkcnt = 1 << prefixlen; + leafblks = ecalloc(lblkcnt, blksz); + for (unsigned int li = 0; li < lblkcnt; li++) { + l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz); + zap_fat_write_leaf_init(&l, li, prefixlen); + } + zaphdr->zap_num_leafs = lblkcnt; + zaphdr->zap_freeblk = lblkcnt + 1; + + /* + * For each entry, figure out which leaf block it belongs to based on + * the upper bits of its hash, allocate chunks from that leaf, and fill + * them out. + */ + ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2); + STAILQ_FOREACH(ent, &zap->kvps, next) { + struct zap_leaf_entry *le; + uint16_t *lptr; + uint64_t hi, li; + uint16_t namelen, nchunks, nnamechunks, nvalchunks; + + hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift); + li = ZAP_HASH_IDX(ent->hash, prefixlen); + assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1); + ptrhasht[hi] = li + 1; + l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz); + + namelen = strlen(ent->name) + 1; + + /* + * How many leaf chunks do we need for this entry? + */ + nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES); + nvalchunks = howmany(ent->intcnt, + ZAP_LEAF_ARRAY_BYTES / ent->intsz); + nchunks = 1 + nnamechunks + nvalchunks; + + /* + * Allocate a run of free leaf chunks for this entry, + * potentially extending a hash chain. 
+ */ + assert(l.l_phys->l_hdr.lh_nfree >= nchunks); + l.l_phys->l_hdr.lh_nfree -= nchunks; + l.l_phys->l_hdr.lh_nentries++; + lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash); + while (*lptr != 0xffff) { + assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l)); + le = ZAP_LEAF_ENTRY(&l, *lptr); + assert(le->le_type == ZAP_CHUNK_ENTRY); + le->le_cd++; + lptr = &le->le_next; + } + *lptr = l.l_phys->l_hdr.lh_freelist; + l.l_phys->l_hdr.lh_freelist += nchunks; + assert(l.l_phys->l_hdr.lh_freelist <= + ZAP_LEAF_NUMCHUNKS(&l)); + if (l.l_phys->l_hdr.lh_freelist == + ZAP_LEAF_NUMCHUNKS(&l)) + l.l_phys->l_hdr.lh_freelist = 0xffff; + + /* + * Integer values must be stored in big-endian format. + */ + switch (ent->intsz) { + case 1: + break; + case 2: + for (uint16_t *v = ent->val16p; + v - ent->val16p < (ptrdiff_t)ent->intcnt; + v++) + *v = htobe16(*v); + break; + case 4: + for (uint32_t *v = ent->val32p; + v - ent->val32p < (ptrdiff_t)ent->intcnt; + v++) + *v = htobe32(*v); + break; + case 8: + for (uint64_t *v = ent->val64p; + v - ent->val64p < (ptrdiff_t)ent->intcnt; + v++) + *v = htobe64(*v); + break; + default: + assert(0); + } + + /* + * Finally, write out the leaf chunks for this entry. + */ + le = ZAP_LEAF_ENTRY(&l, *lptr); + assert(le->le_type == ZAP_CHUNK_FREE); + le->le_type = ZAP_CHUNK_ENTRY; + le->le_next = 0xffff; + le->le_name_chunk = *lptr + 1; + le->le_name_numints = namelen; + le->le_value_chunk = *lptr + 1 + nnamechunks; + le->le_value_intlen = ent->intsz; + le->le_value_numints = ent->intcnt; + le->le_hash = ent->hash; + zap_fat_write_array_chunk(&l, *lptr + 1, namelen, ent->name); + zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks, + ent->intcnt * ent->intsz, ent->valp); + } + + /* + * Initialize unused slots of the pointer table. + */ + for (int i = 0; i < ptrcnt; i++) + if (ptrhasht[i] == 0) + ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1; + + /* + * Write the whole thing to disk. 
+ */ + dnode = zap->dnode; + dnode->dn_nblkptr = 1; + dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT; + dnode->dn_maxblkid = lblkcnt + 1; + dnode->dn_flags = DNODE_FLAG_USED_BYTES; + + c = dnode_cursor_init(zfs, zap->os, zap->dnode, + (lblkcnt + 1) * blksz, blksz); + + loc = objset_space_alloc(zfs, zap->os, &blksz); + vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc, + dnode_cursor_next(zfs, c, 0)); + + for (uint64_t i = 0; i < lblkcnt; i++) { + loc = objset_space_alloc(zfs, zap->os, &blksz); + vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz, + blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz)); + } + + dnode_cursor_finish(zfs, c); + + free(leafblks); +} + +void +zap_write(zfs_opt_t *zfs, zfs_zap_t *zap) +{ + zfs_zap_entry_t *ent; + + if (zap->micro) { + zap_micro_write(zfs, zap); + } else { + assert(!STAILQ_EMPTY(&zap->kvps)); + assert(zap->kvpcnt > 0); + zap_fat_write(zfs, zap); + } + + while ((ent = STAILQ_FIRST(&zap->kvps)) != NULL) { + STAILQ_REMOVE_HEAD(&zap->kvps, next); + if (ent->val64p != &ent->val64) + free(ent->valp); + free(ent->name); + free(ent); + } + free(zap); +} diff --git a/usr.sbin/makefs/zfs/zfs.h b/usr.sbin/makefs/zfs/zfs.h new file mode 100644 index 000000000000..b92e2c035669 --- /dev/null +++ b/usr.sbin/makefs/zfs/zfs.h @@ -0,0 +1,167 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef _MAKEFS_ZFS_H_
#define _MAKEFS_ZFS_H_

#include <sys/types.h>
#include <sys/queue.h>

#include <bitstring.h>
#include <stdbool.h>

#include "makefs.h"

#include "zfs/nvlist.h"
/* Map ASSERT onto assert() before pulling in zfsimpl.h. */
#define ASSERT assert
#include "zfs/zfsimpl.h"

/* Largest and smallest block sizes used when building the pool. */
#define MAXBLOCKSHIFT 17 /* 128KB */
#define MAXBLOCKSIZE ((off_t)(1 << MAXBLOCKSHIFT))
_Static_assert(MAXBLOCKSIZE == SPA_OLDMAXBLOCKSIZE, "");
#define MINBLOCKSHIFT 9 /* 512B */
#define MINBLOCKSIZE ((off_t)(1 << MINBLOCKSHIFT))
_Static_assert(MINBLOCKSIZE == SPA_MINBLOCKSIZE, "");
#define MINDEVSIZE ((off_t)SPA_MINDEVSIZE)

/* All data was written in this transaction group. */
#define TXG 4

/* Opaque handles; the structures are defined in their respective .c files. */
typedef struct zfs_dsl_dataset zfs_dsl_dataset_t;
typedef struct zfs_dsl_dir zfs_dsl_dir_t;
typedef struct zfs_objset zfs_objset_t;
typedef struct zfs_zap zfs_zap_t;

/* One entry in the list of non-root dataset descriptions. */
struct dataset_desc {
	char *params;
	STAILQ_ENTRY(dataset_desc) next;
};

/*
 * Global state for a makefs -t zfs run: options, pool-wide state, and state
 * for the single backing vdev.
 */
typedef struct {
	bool nowarn;

	/* I/O buffer, just for convenience. */
	char filebuf[MAXBLOCKSIZE];

	/* Pool parameters. */
	const char *poolname;
	char *rootpath; /* implicit mount point prefix */
	char *bootfs; /* bootable dataset, pool property */
	int ashift; /* vdev block size */
	uint64_t mssize; /* metaslab size */
	STAILQ_HEAD(, dataset_desc) datasetdescs; /* non-root dataset descrs */

	/* Pool state. */
	uint64_t poolguid; /* pool and root vdev GUID */
	zfs_zap_t *poolprops;

	/* MOS state. */
	zfs_objset_t *mos; /* meta object set */
	uint64_t objarrid; /* space map object array */

	/* DSL state. */
	zfs_dsl_dir_t *rootdsldir; /* root DSL directory */
	zfs_dsl_dataset_t *rootds;
	zfs_dsl_dir_t *origindsldir; /* $ORIGIN */
	zfs_dsl_dataset_t *originds;
	zfs_dsl_dataset_t *snapds;
	zfs_zap_t *cloneszap;
	zfs_dsl_dir_t *freedsldir; /* $FREE */
	zfs_dsl_dir_t *mosdsldir; /* $MOS */

	/* vdev state. */
	int fd; /* vdev disk fd */
	uint64_t vdevguid; /* disk vdev GUID */
	off_t vdevsize; /* vdev size, including labels */
	off_t asize; /* vdev size, excluding labels */
	bitstr_t *spacemap; /* space allocation tracking */
	int spacemapbits; /* one bit per ashift-sized block */
	uint64_t msshift; /* log2(metaslab size) */
	uint64_t mscount; /* number of metaslabs for this vdev */
} zfs_opt_t;

/* dsl.c */
void dsl_init(zfs_opt_t *);
const char *dsl_dir_fullname(const zfs_dsl_dir_t *);
uint64_t dsl_dir_id(zfs_dsl_dir_t *);
uint64_t dsl_dir_dataset_id(zfs_dsl_dir_t *);
void dsl_dir_foreach(zfs_opt_t *, zfs_dsl_dir_t *,
    void (*)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *);
int dsl_dir_get_canmount(zfs_dsl_dir_t *, uint64_t *);
char *dsl_dir_get_mountpoint(zfs_opt_t *, zfs_dsl_dir_t *);
bool dsl_dir_has_dataset(zfs_dsl_dir_t *);
bool dsl_dir_dataset_has_objset(zfs_dsl_dir_t *);
void dsl_dir_dataset_write(zfs_opt_t *, zfs_objset_t *, zfs_dsl_dir_t *);
void dsl_dir_size_set(zfs_dsl_dir_t *, uint64_t);
void dsl_write(zfs_opt_t *);

/* fs.c */
void fs_build(zfs_opt_t *, int, fsnode *);

/* objset.c */
zfs_objset_t *objset_alloc(zfs_opt_t *zfs, uint64_t type);
off_t objset_space_alloc(zfs_opt_t *, zfs_objset_t *, off_t *);
dnode_phys_t *objset_dnode_alloc(zfs_objset_t *, uint8_t, uint64_t *);
dnode_phys_t *objset_dnode_bonus_alloc(zfs_objset_t *, uint8_t, uint8_t,
    uint16_t, uint64_t *);
dnode_phys_t *objset_dnode_lookup(zfs_objset_t *, uint64_t);
void objset_root_blkptr_copy(const zfs_objset_t *, blkptr_t *);
uint64_t objset_space(const zfs_objset_t *);
void objset_write(zfs_opt_t *zfs, zfs_objset_t *os);

/* vdev.c */
void vdev_init(zfs_opt_t *, const char *);
off_t vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp);
void vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
    uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
    blkptr_t *bp);
void vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
    uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp);
void vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
    off_t sz, off_t loc);
void vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp);
void vdev_spacemap_write(zfs_opt_t *);
void vdev_fini(zfs_opt_t *zfs);

/* zap.c */
zfs_zap_t *zap_alloc(zfs_objset_t *, dnode_phys_t *);
void zap_add(zfs_zap_t *, const char *, size_t, size_t, const uint8_t *);
void zap_add_uint64(zfs_zap_t *, const char *, uint64_t);
void zap_add_string(zfs_zap_t *, const char *, const char *);
bool zap_entry_exists(zfs_zap_t *, const char *);
void zap_write(zfs_opt_t *, zfs_zap_t *);

/* zfs.c */
struct dnode_cursor *dnode_cursor_init(zfs_opt_t *, zfs_objset_t *,
    dnode_phys_t *, off_t, off_t);
blkptr_t *dnode_cursor_next(zfs_opt_t *, struct dnode_cursor *, off_t);
void dnode_cursor_finish(zfs_opt_t *, struct dnode_cursor *);

#endif /* !_MAKEFS_ZFS_H_ */ |