diff options
author | Dag-Erling Smørgrav <des@FreeBSD.org> | 2023-02-02 17:18:41 +0000 |
---|---|---|
committer | Dag-Erling Smørgrav <des@FreeBSD.org> | 2023-02-02 17:19:29 +0000 |
commit | 69d94f4c7608e41505996559367450706e91fbb8 (patch) | |
tree | 36c88cb17cfb46c370839c6068ef3c424c463df0 | |
parent | f29942229d24ebb8b98f8c5d02f3c8632648007e (diff) | |
download | src-69d94f4c7608.tar.gz src-69d94f4c7608.zip |
Add tarfs, a filesystem backed by tarballs.
Sponsored by: Juniper Networks, Inc.
Sponsored by: Klara, Inc.
Reviewed by: pauamma, imp
Differential Revision: https://reviews.freebsd.org/D37753
-rw-r--r-- | etc/mtree/BSD.tests.dist | 2 | ||||
-rw-r--r-- | share/man/man5/Makefile | 1 | ||||
-rw-r--r-- | share/man/man5/tarfs.5 | 103 | ||||
-rw-r--r-- | sys/conf/files | 4 | ||||
-rw-r--r-- | sys/conf/options | 4 | ||||
-rw-r--r-- | sys/fs/tarfs/tarfs.h | 254 | ||||
-rw-r--r-- | sys/fs/tarfs/tarfs_dbg.h | 65 | ||||
-rw-r--r-- | sys/fs/tarfs/tarfs_io.c | 727 | ||||
-rw-r--r-- | sys/fs/tarfs/tarfs_subr.c | 603 | ||||
-rw-r--r-- | sys/fs/tarfs/tarfs_vfsops.c | 1173 | ||||
-rw-r--r-- | sys/fs/tarfs/tarfs_vnops.c | 642 | ||||
-rw-r--r-- | sys/kern/subr_witness.c | 6 | ||||
-rw-r--r-- | sys/modules/Makefile | 1 | ||||
-rw-r--r-- | sys/modules/tarfs/Makefile | 23 | ||||
-rw-r--r-- | tests/sys/fs/Makefile | 1 | ||||
-rw-r--r-- | tests/sys/fs/tarfs/Makefile | 10 | ||||
-rw-r--r-- | tests/sys/fs/tarfs/mktar.c | 238 | ||||
-rw-r--r-- | tests/sys/fs/tarfs/tarfs_test.sh | 54 |
18 files changed, 3911 insertions, 0 deletions
diff --git a/etc/mtree/BSD.tests.dist b/etc/mtree/BSD.tests.dist index 0d05ecaf06fc..b4b18997b7f9 100644 --- a/etc/mtree/BSD.tests.dist +++ b/etc/mtree/BSD.tests.dist @@ -757,6 +757,8 @@ fs fusefs .. + tarfs + .. tmpfs .. .. diff --git a/share/man/man5/Makefile b/share/man/man5/Makefile index 2d49d981c2f9..f6e91e4ed00b 100644 --- a/share/man/man5/Makefile +++ b/share/man/man5/Makefile @@ -70,6 +70,7 @@ MAN= acct.5 \ style.Makefile.5 \ style.mdoc.5 \ sysctl.conf.5 \ + tarfs.5 \ tmpfs.5 \ unionfs.5 diff --git a/share/man/man5/tarfs.5 b/share/man/man5/tarfs.5 new file mode 100644 index 000000000000..b25131c323c1 --- /dev/null +++ b/share/man/man5/tarfs.5 @@ -0,0 +1,103 @@ +.\"- +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2022 Klara, Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.Dd February 2, 2023 +.Dt TARFS 5 +.Os +.Sh NAME +.Nm tarfs +.Nd tarball filesystem +.Sh SYNOPSIS +To compile this driver into the kernel, place the following line in +your kernel configuration file: +.Bd -ragged -offset indent +.Cd "options TARFS" +.Ed +.Pp +Alternatively, to load the driver as a module at boot time, place the +following line in +.Xr loader.conf 5 : +.Bd -literal -offset indent +tarfs_load="YES" +.Ed +.Sh DESCRIPTION +The +.Nm +driver implements a read-only filesystem backed by a +.Xr tar 5 +file. +Currently, only POSIX archives, optionally compressed with +.Xr zstd 1 , +are supported. +.Pp +The preferred I/O size for +.Nm +filesystems can be adjusted using the +.Va vfs.tarfs.ioshift +sysctl setting and tunable. +Setting it to 0 will reset it to its default value. +Note that changes to this setting only apply to filesystems mounted +after the change. +.Sh DIAGNOSTICS +If enabled by the +.Dv TARFS_DEBUG +kernel option, the +.Va vfs.tarfs.debug +sysctl setting can be used to control debugging output from the +.Nm +driver. +Debugging output for individual sections of the driver can be enabled +by adding together the relevant values from the table below. 
+.Bl -column Value Description +.It 0x01 Ta Memory allocations +.It 0x02 Ta Checksum calculations +.It 0x04 Ta Filesystem operations (vfsops) +.It 0x08 Ta Path lookups +.It 0x10 Ta File operations (vnops) +.It 0x20 Ta General I/O +.It 0x40 Ta Decompression +.It 0x80 Ta Decompression index +.It 0x100 Ta Sparse file mapping +.El +.Sh SEE ALSO +.Xr tar 1 , +.Xr zstd 1 , +.Xr fstab 5 , +.Xr tar 5 , +.Xr mount 8 , +.Xr sysctl 8 +.Sh HISTORY +.An -nosplit +The +.Nm +driver was developed by +.An Stephen J. Kiernan Aq Mt stevek@FreeBSD.org +and +.An Dag-Erling Smørgrav Aq Mt des@FreeBSD.org +for Juniper Networks and Klara Systems. +This manual page was written by +.An Dag-Erling Smørgrav Aq Mt des@FreeBSD.org +for Juniper Networks and Klara Systems. diff --git a/sys/conf/files b/sys/conf/files index 6cb4abcd9223..08966a9b46e4 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3615,6 +3615,10 @@ fs/smbfs/smbfs_smb.c optional smbfs fs/smbfs/smbfs_subr.c optional smbfs fs/smbfs/smbfs_vfsops.c optional smbfs fs/smbfs/smbfs_vnops.c optional smbfs +fs/tarfs/tarfs_io.c optional tarfs compile-with "${NORMAL_C} -I$S/contrib/zstd/lib/freebsd" +fs/tarfs/tarfs_subr.c optional tarfs +fs/tarfs/tarfs_vfsops.c optional tarfs +fs/tarfs/tarfs_vnops.c optional tarfs fs/udf/osta.c optional udf fs/udf/udf_iconv.c optional udf_iconv fs/udf/udf_vfsops.c optional udf diff --git a/sys/conf/options b/sys/conf/options index 1f5003507539..3b2be66ba602 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -265,6 +265,7 @@ NULLFS opt_dontuse.h PROCFS opt_dontuse.h PSEUDOFS opt_dontuse.h SMBFS opt_dontuse.h +TARFS opt_dontuse.h TMPFS opt_dontuse.h UDF opt_dontuse.h UNIONFS opt_dontuse.h @@ -273,6 +274,9 @@ ZFS opt_dontuse.h # Pseudofs debugging PSEUDOFS_TRACE opt_pseudofs.h +# Tarfs debugging +TARFS_DEBUG opt_tarfs.h + # In-kernel GSS-API KGSSAPI opt_kgssapi.h KGSSAPI_DEBUG opt_kgssapi.h diff --git a/sys/fs/tarfs/tarfs.h b/sys/fs/tarfs/tarfs.h new file mode 100644 index 000000000000..dffd60ee6d8a 
--- /dev/null +++ b/sys/fs/tarfs/tarfs.h @@ -0,0 +1,254 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Juniper Networks, Inc. + * Copyright (c) 2022-2023 Klara, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _FS_TARFS_TARFS_H_ +#define _FS_TARFS_TARFS_H_ + +#ifndef _KERNEL +#error Should only be included by kernel +#endif + +MALLOC_DECLARE(M_TARFSMNT); +MALLOC_DECLARE(M_TARFSNODE); +MALLOC_DECLARE(M_TARFSNAME); + +#ifdef SYSCTL_DECL +SYSCTL_DECL(_vfs_tarfs); +#endif + +struct componentname; +struct mount; +struct vnode; + +/* + * Internal representation of a tarfs file system node. 
+ */ +struct tarfs_node { + TAILQ_ENTRY(tarfs_node) entries; + TAILQ_ENTRY(tarfs_node) dirents; + + struct mtx lock; + + struct vnode *vnode; + struct tarfs_mount *tmp; + enum vtype type; + ino_t ino; + off_t offset; + size_t size; + size_t physize; + char *name; + size_t namelen; + + /* Node attributes */ + uid_t uid; + gid_t gid; + mode_t mode; + unsigned int flags; + nlink_t nlink; + struct timespec atime; + struct timespec mtime; + struct timespec ctime; + struct timespec birthtime; + unsigned long gen; + + /* Block map */ + size_t nblk; + struct tarfs_blk *blk; + + struct tarfs_node *parent; + union { + /* VDIR */ + struct { + TAILQ_HEAD(, tarfs_node) dirhead; + off_t lastcookie; + struct tarfs_node *lastnode; + } dir; + + /* VLNK */ + struct { + char *name; + size_t namelen; + } link; + + /* VBLK or VCHR */ + dev_t rdev; + + /* VREG */ + struct tarfs_node *other; + }; +}; + +/* + * Entry in sparse file block map. + */ +struct tarfs_blk { + off_t i; /* input (physical) offset */ + off_t o; /* output (logical) offset */ + size_t l; /* length */ +}; + +/* + * Decompression buffer. + */ +#define TARFS_ZBUF_SIZE 1048576 +struct tarfs_zbuf { + u_char buf[TARFS_ZBUF_SIZE]; + size_t off; /* offset of contents */ + size_t len; /* length of contents */ +}; + +/* + * Internal representation of a tarfs mount point. 
+ */ +struct tarfs_mount { + TAILQ_HEAD(, tarfs_node) allnodes; + struct mtx allnode_lock; + + struct tarfs_node *root; + struct vnode *vp; + struct mount *vfs; + ino_t ino; + struct unrhdr *ino_unr; + size_t iosize; + size_t nblocks; + size_t nfiles; + time_t mtime; /* default mtime for directories */ + + struct tarfs_zio *zio; + struct vnode *znode; +}; + +struct tarfs_zio { + struct tarfs_mount *tmp; + + /* decompression state */ +#ifdef ZSTDIO + struct tarfs_zstd *zstd; /* decompression state (zstd) */ +#endif + off_t ipos; /* current input position */ + off_t opos; /* current output position */ + + /* index of compression frames */ + unsigned int curidx; /* current index position*/ + unsigned int nidx; /* number of index entries */ + unsigned int szidx; /* index capacity */ + struct tarfs_idx { off_t i, o; } *idx; +}; + +struct tarfs_fid { + u_short len; /* length of data in bytes */ + u_short data0; /* force alignment */ + ino_t ino; + unsigned long gen; +}; + +#define TARFS_NODE_LOCK(tnp) \ + mtx_lock(&(tnp)->lock) +#define TARFS_NODE_UNLOCK(tnp) \ + mtx_unlock(&(tnp)->lock) +#define TARFS_ALLNODES_LOCK(tnp) \ + mtx_lock(&(tmp)->allnode_lock) +#define TARFS_ALLNODES_UNLOCK(tnp) \ + mtx_unlock(&(tmp)->allnode_lock) + +/* + * Data and metadata within tar files are aligned on 512-byte boundaries, + * to match the block size of the magnetic tapes they were originally + * intended for. + */ +#define TARFS_BSHIFT 9 +#define TARFS_BLOCKSIZE (size_t)(1U << TARFS_BSHIFT) +#define TARFS_BLKOFF(l) ((l) % TARFS_BLOCKSIZE) +#define TARFS_BLKNUM(l) ((l) >> TARFS_BSHIFT) +#define TARFS_SZ2BLKS(sz) (((sz) + TARFS_BLOCKSIZE - 1) / TARFS_BLOCKSIZE) + +/* + * Our preferred I/O size. 
+ */ +extern unsigned int tarfs_ioshift; +#define TARFS_IOSHIFT_MIN TARFS_BSHIFT +#define TARFS_IOSHIFT_DEFAULT PAGE_SHIFT +#define TARFS_IOSHIFT_MAX PAGE_SHIFT + +#define TARFS_ROOTINO ((ino_t)3) +#define TARFS_ZIOINO ((ino_t)4) +#define TARFS_MININO ((ino_t)65535) + +#define TARFS_COOKIE_DOT 0 +#define TARFS_COOKIE_DOTDOT 1 +#define TARFS_COOKIE_EOF OFF_MAX + +#define TARFS_ZIO_NAME ".tar" +#define TARFS_ZIO_NAMELEN (sizeof(TARFS_ZIO_NAME) - 1) + +extern struct vop_vector tarfs_vnodeops; + +static inline +struct tarfs_mount * +MP_TO_TARFS_MOUNT(struct mount *mp) +{ + + MPASS(mp != NULL && mp->mnt_data != NULL); + return (mp->mnt_data); +} + +static inline +struct tarfs_node * +VP_TO_TARFS_NODE(struct vnode *vp) +{ + + MPASS(vp != NULL && vp->v_data != NULL); + return (vp->v_data); +} + +int tarfs_alloc_node(struct tarfs_mount *tmp, const char *name, + size_t namelen, enum vtype type, off_t off, size_t sz, + time_t mtime, uid_t uid, gid_t gid, mode_t mode, + unsigned int flags, const char *linkname, dev_t rdev, + struct tarfs_node *parent, struct tarfs_node **node); +int tarfs_load_blockmap(struct tarfs_node *tnp, size_t realsize); +void tarfs_dump_tree(struct tarfs_node *tnp); +void tarfs_free_node(struct tarfs_node *tnp); +struct tarfs_node * + tarfs_lookup_dir(struct tarfs_node *tnp, off_t cookie); +struct tarfs_node * + tarfs_lookup_node(struct tarfs_node *tnp, struct tarfs_node *f, + struct componentname *cnp); +void tarfs_print_node(struct tarfs_node *tnp); +int tarfs_read_file(struct tarfs_node *tnp, size_t len, struct uio *uiop); + +int tarfs_io_init(struct tarfs_mount *tmp); +int tarfs_io_fini(struct tarfs_mount *tmp); +int tarfs_io_read(struct tarfs_mount *tmp, bool raw, + struct uio *uiop); +ssize_t tarfs_io_read_buf(struct tarfs_mount *tmp, bool raw, + void *buf, off_t off, size_t len); +unsigned int + tarfs_strtofflags(const char *str, char **end); + +#endif /* _FS_TARFS_TARFS_H_ */ diff --git a/sys/fs/tarfs/tarfs_dbg.h b/sys/fs/tarfs/tarfs_dbg.h new 
file mode 100644 index 000000000000..45d11d679719 --- /dev/null +++ b/sys/fs/tarfs/tarfs_dbg.h @@ -0,0 +1,65 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Juniper Networks, Inc. + * Copyright (c) 2022 Klara, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _FS_TARFS_TARFS_DBG_H_ +#define _FS_TARFS_TARFS_DBG_H_ + +#ifndef _KERNEL +#error Should only be included by kernel +#endif + +#ifdef TARFS_DEBUG +extern int tarfs_debug; + +#define TARFS_DEBUG_ALLOC 0x01 +#define TARFS_DEBUG_CHECKSUM 0x02 +#define TARFS_DEBUG_FS 0x04 +#define TARFS_DEBUG_LOOKUP 0x08 +#define TARFS_DEBUG_VNODE 0x10 +#define TARFS_DEBUG_IO 0x20 +#define TARFS_DEBUG_ZIO 0x40 +#define TARFS_DEBUG_ZIDX 0x80 +#define TARFS_DEBUG_MAP 0x100 + +#define TARFS_DPF(category, fmt, ...) \ + do { \ + if ((tarfs_debug & TARFS_DEBUG_##category) != 0) \ + printf(fmt, ## __VA_ARGS__); \ + } while (0) +#define TARFS_DPF_IFF(category, cond, fmt, ...) \ + do { \ + if ((cond) \ + && (tarfs_debug & TARFS_DEBUG_##category) != 0) \ + printf(fmt, ## __VA_ARGS__); \ + } while (0) +#else +#define TARFS_DPF(category, fmt, ...) +#define TARFS_DPF_IFF(category, cond, fmt, ...) +#endif + +#endif /* _FS_TARFS_TARFS_DBG_H_ */ diff --git a/sys/fs/tarfs/tarfs_io.c b/sys/fs/tarfs/tarfs_io.c new file mode 100644 index 000000000000..b957ac11ff51 --- /dev/null +++ b/sys/fs/tarfs/tarfs_io.c @@ -0,0 +1,727 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Juniper Networks, Inc. + * Copyright (c) 2022-2023 Klara, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "opt_tarfs.h" +#include "opt_zstdio.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/counter.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/sysctl.h> +#include <sys/uio.h> +#include <sys/vnode.h> + +#ifdef ZSTDIO +#define ZSTD_STATIC_LINKING_ONLY +#include <contrib/zstd/lib/zstd.h> +#endif + +#include <fs/tarfs/tarfs.h> +#include <fs/tarfs/tarfs_dbg.h> + +#ifdef TARFS_DEBUG +SYSCTL_NODE(_vfs_tarfs, OID_AUTO, zio, CTLFLAG_RD, 0, + "Tar filesystem decompression layer"); +COUNTER_U64_DEFINE_EARLY(tarfs_zio_inflated); +SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, inflated, CTLFLAG_RD, + &tarfs_zio_inflated, "Amount of compressed data inflated."); +COUNTER_U64_DEFINE_EARLY(tarfs_zio_consumed); +SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, consumed, CTLFLAG_RD, + &tarfs_zio_consumed, "Amount of compressed data consumed."); +COUNTER_U64_DEFINE_EARLY(tarfs_zio_bounced); +SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, bounced, CTLFLAG_RD, + &tarfs_zio_bounced, "Amount of decompressed data bounced."); + +static int +tarfs_sysctl_handle_zio_reset(SYSCTL_HANDLER_ARGS) +{ + unsigned int tmp; + int error; + + tmp 
= 0; + if ((error = SYSCTL_OUT(req, &tmp, sizeof(tmp))) != 0) + return (error); + if (req->newptr != NULL) { + if ((error = SYSCTL_IN(req, &tmp, sizeof(tmp))) != 0) + return (error); + counter_u64_zero(tarfs_zio_inflated); + counter_u64_zero(tarfs_zio_consumed); + counter_u64_zero(tarfs_zio_bounced); + } + return (0); +} + +SYSCTL_PROC(_vfs_tarfs_zio, OID_AUTO, reset, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, + NULL, 0, tarfs_sysctl_handle_zio_reset, "IU", + "Reset compression counters."); +#endif + +MALLOC_DEFINE(M_TARFSZSTATE, "tarfs zstate", "tarfs decompression state"); +MALLOC_DEFINE(M_TARFSZBUF, "tarfs zbuf", "tarfs decompression buffers"); + +#define XZ_MAGIC (uint8_t[]){ 0xfd, 0x37, 0x7a, 0x58, 0x5a } +#define ZLIB_MAGIC (uint8_t[]){ 0x1f, 0x8b, 0x08 } +#define ZSTD_MAGIC (uint8_t[]){ 0x28, 0xb5, 0x2f, 0xfd } + +#ifdef ZSTDIO +struct tarfs_zstd { + ZSTD_DStream *zds; +}; +#endif + +/* XXX review use of curthread / uio_td / td_cred */ + +/* + * Reads from the tar file according to the provided uio. If the archive + * is compressed and raw is false, reads the decompressed stream; + * otherwise, reads directly from the original file. Returns 0 on success + * and a positive errno value on failure. 
+ */ +int +tarfs_io_read(struct tarfs_mount *tmp, bool raw, struct uio *uiop) +{ + void *rl = NULL; + off_t off = uiop->uio_offset; + size_t len = uiop->uio_resid; + int error; + + if (raw || tmp->znode == NULL) { + rl = vn_rangelock_rlock(tmp->vp, off, off + len); + error = vn_lock(tmp->vp, LK_SHARED); + if (error == 0) { + error = VOP_READ(tmp->vp, uiop, + IO_DIRECT|IO_NODELOCKED, + uiop->uio_td->td_ucred); + VOP_UNLOCK(tmp->vp); + } + vn_rangelock_unlock(tmp->vp, rl); + } else { + error = vn_lock(tmp->znode, LK_EXCLUSIVE); + if (error == 0) { + error = VOP_READ(tmp->znode, uiop, + IO_DIRECT | IO_NODELOCKED, + uiop->uio_td->td_ucred); + VOP_UNLOCK(tmp->znode); + } + } + TARFS_DPF(IO, "%s(%zu, %zu) = %d (resid %zd)\n", __func__, + (size_t)off, len, error, uiop->uio_resid); + return (error); +} + +/* + * Reads from the tar file into the provided buffer. If the archive is + * compressed and raw is false, reads the decompressed stream; otherwise, + * reads directly from the original file. Returns the number of bytes + * read on success, 0 on EOF, and a negative errno value on failure. 
+ */ +ssize_t +tarfs_io_read_buf(struct tarfs_mount *tmp, bool raw, + void *buf, off_t off, size_t len) +{ + struct uio auio; + struct iovec aiov; + ssize_t res; + int error; + + if (len == 0) { + TARFS_DPF(IO, "%s(%zu, %zu) null\n", __func__, + (size_t)off, len); + return (0); + } + aiov.iov_base = buf; + aiov.iov_len = len; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = off; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_resid = len; + auio.uio_td = curthread; + error = tarfs_io_read(tmp, raw, &auio); + if (error != 0) { + TARFS_DPF(IO, "%s(%zu, %zu) error %d\n", __func__, + (size_t)off, len, error); + return (-error); + } + res = len - auio.uio_resid; + if (res == 0 && len != 0) { + TARFS_DPF(IO, "%s(%zu, %zu) eof\n", __func__, + (size_t)off, len); + } else { + TARFS_DPF(IO, "%s(%zu, %zu) read %zd | %*D\n", __func__, + (size_t)off, len, res, + (int)(res > 8 ? 8 : res), (uint8_t *)buf, " "); + } + return (res); +} + +#ifdef ZSTDIO +static void * +tarfs_zstate_alloc(void *opaque, size_t size) +{ + + (void)opaque; + return (malloc(size, M_TARFSZSTATE, M_WAITOK)); +} +#endif + +#ifdef ZSTDIO +static void +tarfs_zstate_free(void *opaque, void *address) +{ + + (void)opaque; + free(address, M_TARFSZSTATE); +} +#endif + +#ifdef ZSTDIO +static ZSTD_customMem tarfs_zstd_mem = { + tarfs_zstate_alloc, + tarfs_zstate_free, + NULL, +}; +#endif + +/* + * Updates the decompression frame index, recording the current input and + * output offsets in a new index entry, and growing the index if + * necessary. 
+ */ +static void +tarfs_zio_update_index(struct tarfs_zio *zio, off_t i, off_t o) +{ + + if (++zio->curidx >= zio->nidx) { + if (++zio->nidx > zio->szidx) { + zio->szidx *= 2; + zio->idx = realloc(zio->idx, + zio->szidx * sizeof(*zio->idx), + M_TARFSZSTATE, M_ZERO | M_WAITOK); + TARFS_DPF(ALLOC, "%s: resized zio index\n", __func__); + } + zio->idx[zio->curidx].i = i; + zio->idx[zio->curidx].o = o; + TARFS_DPF(ZIDX, "%s: index %u = i %zu o %zu\n", __func__, + zio->curidx, (size_t)zio->idx[zio->curidx].i, + (size_t)zio->idx[zio->curidx].o); + } + MPASS(zio->idx[zio->curidx].i == i); + MPASS(zio->idx[zio->curidx].o == o); +} + +/* + * VOP_ACCESS for zio node. + */ +static int +tarfs_zaccess(struct vop_access_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct tarfs_zio *zio = vp->v_data; + struct tarfs_mount *tmp = zio->tmp; + accmode_t accmode = ap->a_accmode; + int error = EPERM; + + if (accmode == VREAD) { + error = vn_lock(tmp->vp, LK_SHARED); + if (error == 0) { + error = VOP_ACCESS(tmp->vp, accmode, ap->a_cred, ap->a_td); + VOP_UNLOCK(tmp->vp); + } + } + TARFS_DPF(ZIO, "%s(%d) = %d\n", __func__, accmode, error); + return (error); +} + +/* + * VOP_GETATTR for zio node. 
+ */ +static int +tarfs_zgetattr(struct vop_getattr_args *ap) +{ + struct vattr va; + struct vnode *vp = ap->a_vp; + struct tarfs_zio *zio = vp->v_data; + struct tarfs_mount *tmp = zio->tmp; + struct vattr *vap = ap->a_vap; + int error = 0; + + VATTR_NULL(vap); + error = vn_lock(tmp->vp, LK_SHARED); + if (error == 0) { + error = VOP_GETATTR(tmp->vp, &va, ap->a_cred); + VOP_UNLOCK(tmp->vp); + if (error == 0) { + vap->va_type = VREG; + vap->va_mode = va.va_mode; + vap->va_nlink = 1; + vap->va_gid = va.va_gid; + vap->va_uid = va.va_uid; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + vap->va_fileid = TARFS_ZIOINO; + vap->va_size = zio->idx[zio->nidx - 1].o; + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + vap->va_atime = va.va_atime; + vap->va_ctime = va.va_ctime; + vap->va_mtime = va.va_mtime; + vap->va_birthtime = tmp->root->birthtime; + vap->va_bytes = va.va_bytes; + } + } + TARFS_DPF(ZIO, "%s() = %d\n", __func__, error); + return (error); +} + +#ifdef ZSTDIO +/* + * VOP_READ for zio node, zstd edition. + */ +static int +tarfs_zread_zstd(struct tarfs_zio *zio, struct uio *uiop) +{ + void *ibuf = NULL, *obuf = NULL, *rl = NULL; + struct uio auio; + struct iovec aiov; + struct tarfs_mount *tmp = zio->tmp; + struct tarfs_zstd *zstd = zio->zstd; + struct thread *td = curthread; + ZSTD_inBuffer zib; + ZSTD_outBuffer zob; + off_t zsize; + off_t ipos, opos; + size_t ilen, olen; + size_t zerror; + off_t off = uiop->uio_offset; + size_t len = uiop->uio_resid; + size_t resid = uiop->uio_resid; + size_t bsize; + int error; + bool reset = false; + + /* do we have to rewind? 
*/ + if (off < zio->opos) { + while (zio->curidx > 0 && off < zio->idx[zio->curidx].o) + zio->curidx--; + reset = true; + } + /* advance to the nearest index entry */ + if (off > zio->opos) { + // XXX maybe do a binary search instead + while (zio->curidx < zio->nidx - 1 && + off >= zio->idx[zio->curidx + 1].o) { + zio->curidx++; + reset = true; + } + } + /* reset the decompression stream if needed */ + if (reset) { + zio->ipos = zio->idx[zio->curidx].i; + zio->opos = zio->idx[zio->curidx].o; + ZSTD_resetDStream(zstd->zds); + TARFS_DPF(ZIDX, "%s: skipping to index %u = i %zu o %zu\n", __func__, + zio->curidx, (size_t)zio->ipos, (size_t)zio->opos); + } else { + TARFS_DPF(ZIDX, "%s: continuing at i %zu o %zu\n", __func__, + (size_t)zio->ipos, (size_t)zio->opos); + } + + /* + * Set up a temporary buffer for compressed data. Use the size + * recommended by the zstd library; this is usually 128 kB, but + * just in case, make sure it's a multiple of the page size and no + * larger than MAXBSIZE. + */ + bsize = roundup(ZSTD_CStreamOutSize(), PAGE_SIZE); + if (bsize > MAXBSIZE) + bsize = MAXBSIZE; + ibuf = malloc(bsize, M_TEMP, M_WAITOK); + zib.src = NULL; + zib.size = 0; + zib.pos = 0; + + /* + * Set up the decompression buffer. If the target is not in + * kernel space, we will have to set up a bounce buffer. + * + * TODO: to avoid using a bounce buffer, map destination pages + * using vm_fault_quick_hold_pages(). 
+ */ + MPASS(zio->opos <= off); + MPASS(uiop->uio_iovcnt == 1); + MPASS(uiop->uio_iov->iov_len >= len); + if (uiop->uio_segflg == UIO_SYSSPACE) { + zob.dst = uiop->uio_iov->iov_base; + } else { + TARFS_DPF(ALLOC, "%s: allocating %zu-byte bounce buffer\n", + __func__, len); + zob.dst = obuf = malloc(len, M_TEMP, M_WAITOK); + } + zob.size = len; + zob.pos = 0; + + /* lock tarball */ + rl = vn_rangelock_rlock(tmp->vp, zio->ipos, OFF_MAX); + error = vn_lock(tmp->vp, LK_SHARED); + if (error != 0) { + goto fail_unlocked; + } + /* check size */ + error = vn_getsize_locked(tmp->vp, &zsize, td->td_ucred); + if (error != 0) { + goto fail; + } + if (zio->ipos >= zsize) { + /* beyond EOF */ + goto fail; + } + + while (resid > 0) { + if (zib.pos == zib.size) { + /* request data from the underlying file */ + aiov.iov_base = ibuf; + aiov.iov_len = bsize; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = zio->ipos; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_resid = aiov.iov_len; + auio.uio_td = td; + error = VOP_READ(tmp->vp, &auio, + IO_DIRECT | IO_NODELOCKED, + td->td_ucred); + if (error != 0) + goto fail; + TARFS_DPF(ZIO, "%s: req %zu+%zu got %zu+%zu\n", __func__, + (size_t)zio->ipos, bsize, + (size_t)zio->ipos, bsize - auio.uio_resid); + zib.src = ibuf; + zib.size = bsize - auio.uio_resid; + zib.pos = 0; + } + MPASS(zib.pos <= zib.size); + if (zib.pos == zib.size) { + TARFS_DPF(ZIO, "%s: end of file after i %zu o %zu\n", __func__, + (size_t)zio->ipos, (size_t)zio->opos); + goto fail; + } + if (zio->opos < off) { + /* to be discarded */ + zob.size = min(off - zio->opos, len); + zob.pos = 0; + } else { + zob.size = len; + zob.pos = zio->opos - off; + } + ipos = zib.pos; + opos = zob.pos; + /* decompress as much as possible */ + zerror = ZSTD_decompressStream(zstd->zds, &zob, &zib); + zio->ipos += ilen = zib.pos - ipos; + zio->opos += olen = zob.pos - opos; + if (zio->opos > off) + resid -= olen; + if (ZSTD_isError(zerror)) { + 
TARFS_DPF(ZIO, "%s: inflate failed after i %zu o %zu: %s\n", __func__, + (size_t)zio->ipos, (size_t)zio->opos, ZSTD_getErrorName(zerror)); + error = EIO; + goto fail; + } + if (zerror == 0 && olen == 0) { + TARFS_DPF(ZIO, "%s: end of stream after i %zu o %zu\n", __func__, + (size_t)zio->ipos, (size_t)zio->opos); + break; + } + if (zerror == 0) { + TARFS_DPF(ZIO, "%s: end of frame after i %zu o %zu\n", __func__, + (size_t)zio->ipos, (size_t)zio->opos); + tarfs_zio_update_index(zio, zio->ipos, zio->opos); + } + TARFS_DPF(ZIO, "%s: inflated %zu\n", __func__, olen); +#ifdef TARFS_DEBUG + counter_u64_add(tarfs_zio_inflated, olen); +#endif + } +fail: + VOP_UNLOCK(tmp->vp); +fail_unlocked: + if (error == 0) { + if (uiop->uio_segflg == UIO_SYSSPACE) { + uiop->uio_resid = resid; + } else if (len > resid) { + TARFS_DPF(ALLOC, "%s: bounced %zu bytes\n", __func__, + len - resid); + error = uiomove(obuf, len - resid, uiop); +#ifdef TARFS_DEBUG + counter_u64_add(tarfs_zio_bounced, len - resid); +#endif + } + } + if (obuf != NULL) { + TARFS_DPF(ALLOC, "%s: freeing bounce buffer\n", __func__); + free(obuf, M_TEMP); + } + if (rl != NULL) + vn_rangelock_unlock(tmp->vp, rl); + if (ibuf != NULL) + free(ibuf, M_TEMP); + TARFS_DPF(ZIO, "%s(%zu, %zu) = %d (resid %zd)\n", __func__, + (size_t)off, len, error, uiop->uio_resid); +#ifdef TARFS_DEBUG + counter_u64_add(tarfs_zio_consumed, len - uiop->uio_resid); +#endif + if (error != 0) { + zio->curidx = 0; + zio->ipos = zio->idx[0].i; + zio->opos = zio->idx[0].o; + ZSTD_resetDStream(zstd->zds); + } + return (error); +} +#endif + +/* + * VOP_READ for zio node. 
+ */ +static int +tarfs_zread(struct vop_read_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct tarfs_zio *zio = vp->v_data; + struct uio *uiop = ap->a_uio; +#ifdef TARFS_DEBUG + off_t off = uiop->uio_offset; + size_t len = uiop->uio_resid; +#endif + int error; + + TARFS_DPF(ZIO, "%s(%zu, %zu)\n", __func__, + (size_t)off, len); +#ifdef ZSTDIO + if (zio->zstd != NULL) { + error = tarfs_zread_zstd(zio, uiop); + } else +#endif + error = EFTYPE; + TARFS_DPF(ZIO, "%s(%zu, %zu) = %d (resid %zd)\n", __func__, + (size_t)off, len, error, uiop->uio_resid); + return (error); +} + +/* + * VOP_RECLAIM for zio node. + */ +static int +tarfs_zreclaim(struct vop_reclaim_args *ap) +{ + struct vnode *vp = ap->a_vp; + + TARFS_DPF(ZIO, "%s(%p)\n", __func__, vp); + vp->v_data = NULL; + vnode_destroy_vobject(vp); + cache_purge(vp); + return (0); +} + +/* + * VOP_STRATEGY for zio node. + */ +static int +tarfs_zstrategy(struct vop_strategy_args *ap) +{ + struct uio auio; + struct iovec iov; + struct vnode *vp = ap->a_vp; + struct buf *bp = ap->a_bp; + off_t off; + size_t len; + int error; + + iov.iov_base = bp->b_data; + iov.iov_len = bp->b_bcount; + off = bp->b_iooffset; + len = bp->b_bcount; + bp->b_resid = len; + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_offset = off; + auio.uio_resid = len; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_td = curthread; + error = VOP_READ(vp, &auio, IO_DIRECT | IO_NODELOCKED, bp->b_rcred); + bp->b_flags |= B_DONE; + if (error != 0) { + bp->b_ioflags |= BIO_ERROR; + bp->b_error = error; + } + return (0); +} + +static struct vop_vector tarfs_znodeops = { + .vop_default = &default_vnodeops, + + .vop_access = tarfs_zaccess, + .vop_getattr = tarfs_zgetattr, + .vop_read = tarfs_zread, + .vop_reclaim = tarfs_zreclaim, + .vop_strategy = tarfs_zstrategy, +}; +VFS_VOP_VECTOR_REGISTER(tarfs_znodeops); + +/* + * Initializes the decompression layer. 
 *
 * Allocates the zio state for the mount, seeds its index with a single
 * entry mapping input offset i to output offset o, and creates the
 * backing "tarfsz" vnode.  Stores the results in tmp->zio and tmp->znode
 * and returns the new zio.
 */
static struct tarfs_zio *
tarfs_zio_init(struct tarfs_mount *tmp, off_t i, off_t o)
{
	struct tarfs_zio *zio;
	struct vnode *zvp;

	zio = malloc(sizeof(*zio), M_TARFSZSTATE, M_ZERO | M_WAITOK);
	TARFS_DPF(ALLOC, "%s: allocated zio\n", __func__);
	zio->tmp = tmp;
	/* initial index capacity */
	zio->szidx = 128;
	zio->idx = malloc(zio->szidx * sizeof(*zio->idx), M_TARFSZSTATE,
	    M_ZERO | M_WAITOK);
	/* entry 0 records the starting input/output positions */
	zio->curidx = 0;
	zio->nidx = 1;
	zio->idx[zio->curidx].i = zio->ipos = i;
	zio->idx[zio->curidx].o = zio->opos = o;
	tmp->zio = zio;
	TARFS_DPF(ALLOC, "%s: allocated zio index\n", __func__);
	/*
	 * NOTE(review): the return value of getnewvnode() is ignored, so
	 * zvp is used unchecked below -- confirm it cannot fail here.
	 */
	getnewvnode("tarfsz", tmp->vfs, &tarfs_znodeops, &zvp);
	zvp->v_data = zio;
	zvp->v_type = VREG;
	zvp->v_mount = tmp->vfs;
	vn_set_state(zvp, VSTATE_CONSTRUCTED);
	tmp->znode = zvp;
	TARFS_DPF(ZIO, "%s: created zio node\n", __func__);
	return (zio);
}

/*
 * Initializes the I/O layer, including decompression if the signature of
 * a supported compression format is detected.  Returns 0 on success and a
 * positive errno value on failure.
+ */ +int +tarfs_io_init(struct tarfs_mount *tmp) +{ + uint8_t *block; + struct tarfs_zio *zio = NULL; + ssize_t res; + int error = 0; + + block = malloc(tmp->iosize, M_TEMP, M_ZERO | M_WAITOK); + res = tarfs_io_read_buf(tmp, true, block, 0, tmp->iosize); + if (res < 0) { + return (-res); + } + if (memcmp(block, XZ_MAGIC, sizeof(XZ_MAGIC)) == 0) { + printf("xz compression not supported\n"); + error = EOPNOTSUPP; + goto bad; + } else if (memcmp(block, ZLIB_MAGIC, sizeof(ZLIB_MAGIC)) == 0) { + printf("zlib compression not supported\n"); + error = EOPNOTSUPP; + goto bad; + } else if (memcmp(block, ZSTD_MAGIC, sizeof(ZSTD_MAGIC)) == 0) { +#ifdef ZSTDIO + zio = tarfs_zio_init(tmp, 0, 0); + zio->zstd = malloc(sizeof(*zio->zstd), M_TARFSZSTATE, M_WAITOK); + zio->zstd->zds = ZSTD_createDStream_advanced(tarfs_zstd_mem); + (void)ZSTD_initDStream(zio->zstd->zds); +#else + printf("zstd compression not supported\n"); + error = EOPNOTSUPP; + goto bad; +#endif + } +bad: + free(block, M_TEMP); + return (error); +} + +/* + * Tears down the decompression layer. + */ +static int +tarfs_zio_fini(struct tarfs_mount *tmp) +{ + struct tarfs_zio *zio = tmp->zio; + int error = 0; + + if (tmp->znode != NULL) { + error = vn_lock(tmp->znode, LK_EXCLUSIVE); + if (error != 0) { + TARFS_DPF(ALLOC, "%s: failed to lock znode", __func__); + return (error); + } + tmp->znode->v_mount = NULL; + vgone(tmp->znode); + vput(tmp->znode); + tmp->znode = NULL; + } +#ifdef ZSTDIO + if (zio->zstd != NULL) { + TARFS_DPF(ALLOC, "%s: freeing zstd state\n", __func__); + ZSTD_freeDStream(zio->zstd->zds); + free(zio->zstd, M_TARFSZSTATE); + } +#endif + if (zio->idx != NULL) { + TARFS_DPF(ALLOC, "%s: freeing index\n", __func__); + free(zio->idx, M_TARFSZSTATE); + } + TARFS_DPF(ALLOC, "%s: freeing zio\n", __func__); + free(zio, M_TARFSZSTATE); + tmp->zio = NULL; + return (error); +} + +/* + * Tears down the I/O layer, including the decompression layer if + * applicable. 
+ */ +int +tarfs_io_fini(struct tarfs_mount *tmp) +{ + int error = 0; + + if (tmp->zio != NULL) { + error = tarfs_zio_fini(tmp); + } + return (error); +} diff --git a/sys/fs/tarfs/tarfs_subr.c b/sys/fs/tarfs/tarfs_subr.c new file mode 100644 index 000000000000..d4bd4e702e08 --- /dev/null +++ b/sys/fs/tarfs/tarfs_subr.c @@ -0,0 +1,603 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Juniper Networks, Inc. + * Copyright (c) 2022-2023 Klara, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "opt_tarfs.h" + +#include <sys/param.h> +#include <sys/stat.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/fcntl.h> +#include <sys/libkern.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> + +#include <vm/vm_param.h> + +#include <fs/tarfs/tarfs.h> +#include <fs/tarfs/tarfs_dbg.h> + +MALLOC_DEFINE(M_TARFSNAME, "tarfs name", "tarfs file names"); +MALLOC_DEFINE(M_TARFSBLK, "tarfs blk", "tarfs block maps"); + +SYSCTL_NODE(_vfs, OID_AUTO, tarfs, CTLFLAG_RW, 0, "Tar filesystem"); + +unsigned int tarfs_ioshift = TARFS_IOSHIFT_DEFAULT; + +static int +tarfs_sysctl_handle_ioshift(SYSCTL_HANDLER_ARGS) +{ + unsigned int tmp; + int error; + + tmp = *(unsigned int *)arg1; + if ((error = SYSCTL_OUT(req, &tmp, sizeof(tmp))) != 0) + return (error); + if (req->newptr != NULL) { + if ((error = SYSCTL_IN(req, &tmp, sizeof(tmp))) != 0) + return (error); + if (tmp == 0) + tmp = TARFS_IOSHIFT_DEFAULT; + if (tmp < TARFS_IOSHIFT_MIN) + tmp = TARFS_IOSHIFT_MIN; + if (tmp > TARFS_IOSHIFT_MAX) + tmp = TARFS_IOSHIFT_MAX; + *(unsigned int *)arg1 = tmp; + } + return (0); +} + +SYSCTL_PROC(_vfs_tarfs, OID_AUTO, ioshift, + CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW | CTLFLAG_TUN, + &tarfs_ioshift, 0, tarfs_sysctl_handle_ioshift, "IU", + "Tar filesystem preferred I/O size (log 2)"); + +#ifdef TARFS_DEBUG +int tarfs_debug; +SYSCTL_INT(_vfs_tarfs, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_TUN, + &tarfs_debug, 0, "Tar filesystem debug mask"); +#endif /* TARFS_DEBUG */ + +static void +tarfs_dump_tree_internal(struct tarfs_node *tnp, int indent) +{ + struct tarfs_node *current; + const char *name; + + if (tnp->type != VDIR) + return; + + TAILQ_FOREACH(current, &tnp->dir.dirhead, dirents) { + if (current->name == NULL) + name = "<<root>>"; + else + name = current->name; + printf("%*s%s\n", indent * 4, "", name); + if (current->type 
== VDIR) + tarfs_dump_tree_internal(current, indent + 1); + } +} + +void +tarfs_dump_tree(struct tarfs_node *tnp) +{ + const char *name; + + if (tnp == NULL) + return; + + if (tnp->name == NULL) + name = "<<root>>"; + else + name = tnp->name; + printf("%s\n", name); + + tarfs_dump_tree_internal(tnp, 1); +} + +void +tarfs_print_node(struct tarfs_node *tnp) +{ + + if (tnp == NULL) + return; + + printf("%s: node %p\n", __func__, tnp); + printf("\tvnode %p\n", tnp->vnode); + printf("\ttmp %p\n", tnp->tmp); + printf("\ttype %d\n", tnp->type); + printf("\tino %lu\n", tnp->ino); + printf("\tsize %zu\n", tnp->size); + printf("\tname %s\n", + (tnp->name == NULL) ? "<<root>>" : tnp->name); + printf("\tnamelen %zu\n", tnp->namelen); + printf("\tuid %d\n", tnp->uid); + printf("\tgid %d\n", tnp->gid); + printf("\tmode o%o\n", tnp->mode); + printf("\tflags %u\n", tnp->flags); + printf("\tnlink %lu\n", tnp->nlink); + printf("\tatime %d\n", (int)tnp->atime.tv_sec); + printf("\tmtime %d\n", (int)tnp->mtime.tv_sec); + printf("\tctime %d\n", (int)tnp->ctime.tv_sec); + printf("\tbirthtime %d\n", (int)tnp->birthtime.tv_sec); + printf("\tgen %lu\n", tnp->gen); + printf("\tparent %p\n", tnp->parent); + + switch (tnp->type) { + case VDIR: + printf("\tdir.lastcookie %jd\n", + tnp->dir.lastcookie); + printf("\tdir.lastnode %p\n", tnp->dir.lastnode); + break; + case VBLK: + case VCHR: + printf("\trdev %lu\n", tnp->rdev); + break; + default: + break; + } +} + +struct tarfs_node * +tarfs_lookup_node(struct tarfs_node *tnp, struct tarfs_node *f, + struct componentname *cnp) +{ + boolean_t found; + struct tarfs_node *entry; + + TARFS_DPF(LOOKUP, "%s: name: %.*s\n", __func__, (int)cnp->cn_namelen, + cnp->cn_nameptr); + + found = false; + TAILQ_FOREACH(entry, &tnp->dir.dirhead, dirents) { + if (f != NULL && entry != f) + continue; + + if (entry->namelen == cnp->cn_namelen && + bcmp(entry->name, cnp->cn_nameptr, + entry->namelen) == 0) { + found = 1; + break; + } + } + + if (found) { + if 
(entry->type == VREG && entry->other != NULL) { + TARFS_DPF_IFF(LOOKUP, "%s: following hard link %p\n", + __func__, entry); + entry = entry->other; + } + TARFS_DPF(LOOKUP, "%s: found tarfs_node %p\n", __func__, + entry); + return (entry); + } + + TARFS_DPF(LOOKUP, "%s: no match found\n", __func__); + return (NULL); +} + +struct tarfs_node * +tarfs_lookup_dir(struct tarfs_node *tnp, off_t cookie) +{ + struct tarfs_node *current; + + TARFS_DPF(LOOKUP, "%s: tarfs_node %p, cookie %jd\n", __func__, tnp, + cookie); + TARFS_DPF(LOOKUP, "%s: name: %s\n", __func__, + (tnp->name == NULL) ? "<<root>>" : tnp->name); + + if (cookie == tnp->dir.lastcookie && + tnp->dir.lastnode != NULL) { + TARFS_DPF(LOOKUP, "%s: Using cached entry: tarfs_node %p, " + "cookie %jd\n", __func__, tnp->dir.lastnode, + tnp->dir.lastcookie); + return (tnp->dir.lastnode); + } + + TAILQ_FOREACH(current, &tnp->dir.dirhead, dirents) { + TARFS_DPF(LOOKUP, "%s: tarfs_node %p, current %p, ino %lu\n", + __func__, tnp, current, current->ino); + TARFS_DPF_IFF(LOOKUP, current->name != NULL, + "%s: name: %s\n", __func__, current->name); + if (current->ino == cookie) { + TARFS_DPF(LOOKUP, "%s: Found entry: tarfs_node %p, " + "cookie %lu\n", __func__, current, + current->ino); + break; + } + } + + return (current); +} + +int +tarfs_alloc_node(struct tarfs_mount *tmp, const char *name, size_t namelen, + enum vtype type, off_t off, size_t sz, time_t mtime, uid_t uid, gid_t gid, + mode_t mode, unsigned int flags, const char *linkname, dev_t rdev, + struct tarfs_node *parent, struct tarfs_node **retnode) +{ + struct tarfs_node *tnp; + + TARFS_DPF(ALLOC, "%s(%.*s)\n", __func__, (int)namelen, name); + + tnp = malloc(sizeof(struct tarfs_node), M_TARFSNODE, M_WAITOK | M_ZERO); + mtx_init(&tnp->lock, "tarfs node lock", NULL, MTX_DEF); + tnp->gen = arc4random(); + tnp->tmp = tmp; + if (namelen > 0) { + tnp->name = malloc(namelen + 1, M_TARFSNAME, M_WAITOK); + tnp->namelen = namelen; + memcpy(tnp->name, name, namelen); + 
tnp->name[namelen] = '\0'; + } + tnp->type = type; + tnp->uid = uid; + tnp->gid = gid; + tnp->mode = mode; + tnp->nlink = 1; + vfs_timestamp(&tnp->atime); + tnp->mtime.tv_sec = mtime; + tnp->birthtime = tnp->atime; + tnp->ctime = tnp->mtime; + if (parent != NULL) { + tnp->ino = alloc_unr(tmp->ino_unr); + } + tnp->offset = off; + tnp->size = tnp->physize = sz; + switch (type) { + case VDIR: + MPASS(parent != tnp); + MPASS(parent != NULL || tmp->root == NULL); + TAILQ_INIT(&tnp->dir.dirhead); + tnp->nlink++; + if (parent == NULL) { + tnp->ino = TARFS_ROOTINO; + } + tnp->physize = 0; + break; + case VLNK: + tnp->link.name = malloc(sz + 1, M_TARFSNAME, + M_WAITOK); + tnp->link.namelen = sz; + memcpy(tnp->link.name, linkname, sz); + tnp->link.name[sz] = '\0'; + break; + case VREG: + /* create dummy block map */ + tnp->nblk = 1; + tnp->blk = malloc(sizeof(*tnp->blk), M_TARFSBLK, M_WAITOK); + tnp->blk[0].i = 0; + tnp->blk[0].o = 0; + tnp->blk[0].l = tnp->physize; + break; + case VFIFO: + /* Nothing extra to do */ + break; + case VBLK: + case VCHR: + tnp->rdev = rdev; + tnp->physize = 0; + break; + default: + panic("%s: type %d not allowed", __func__, type); + } + if (parent != NULL) { + MPASS(parent->type == VDIR); + TARFS_NODE_LOCK(parent); + TAILQ_INSERT_TAIL(&parent->dir.dirhead, tnp, dirents); + parent->size += sizeof(struct tarfs_node); + tnp->parent = parent; + if (type == VDIR) { + parent->nlink++; + } + TARFS_NODE_UNLOCK(parent); + } else { + tnp->parent = tnp; + } + MPASS(tnp->ino != 0); + + TARFS_ALLNODES_LOCK(tmp); + TAILQ_INSERT_TAIL(&tmp->allnodes, tnp, entries); + TARFS_ALLNODES_UNLOCK(tmp); + + *retnode = tnp; + tmp->nfiles++; + return (0); +} + +#define is09(ch) ((ch) >= '0' && (ch) <= '9') + +int +tarfs_load_blockmap(struct tarfs_node *tnp, size_t realsize) +{ + struct tarfs_blk *blk = NULL; + char *map = NULL; + size_t nmap = 0, nblk = 0; + char *p, *q; + ssize_t res; + unsigned int i; + long n; + + /* + * Load the entire map into memory. 
We don't know how big it is, + * but as soon as we start reading it we will know how many + * entries it contains, and then we can count newlines. + */ + do { + nmap++; + if (tnp->size < nmap * TARFS_BLOCKSIZE) { + TARFS_DPF(MAP, "%s: map too large\n", __func__); + goto bad; + } + /* grow the map */ + map = realloc(map, nmap * TARFS_BLOCKSIZE + 1, M_TARFSBLK, + M_ZERO | M_WAITOK); + /* read an additional block */ + res = tarfs_io_read_buf(tnp->tmp, false, + map + (nmap - 1) * TARFS_BLOCKSIZE, + tnp->offset + (nmap - 1) * TARFS_BLOCKSIZE, + TARFS_BLOCKSIZE); + if (res < 0) + return (-res); + else if (res < TARFS_BLOCKSIZE) + return (EIO); + map[nmap * TARFS_BLOCKSIZE] = '\0'; /* sentinel */ + if (nblk == 0) { + n = strtol(p = map, &q, 10); + if (q == p || *q != '\n' || n < 1) + goto syntax; + nblk = n; + } + for (n = 0, p = map; *p != '\0'; ++p) { + if (*p == '\n') { + ++n; + } + } + TARFS_DPF(MAP, "%s: %ld newlines in map\n", __func__, n); + } while (n < nblk * 2 + 1); + TARFS_DPF(MAP, "%s: block map length %zu\n", __func__, nblk); + blk = malloc(sizeof(*blk) * nblk, M_TARFSBLK, M_WAITOK | M_ZERO); + p = strchr(map, '\n') + 1; + for (i = 0; i < nblk; i++) { + if (i == 0) + blk[i].i = nmap * TARFS_BLOCKSIZE; + else + blk[i].i = blk[i - 1].i + blk[i - 1].l; + n = strtol(p, &q, 10); + if (q == p || *q != '\n' || n < 0) + goto syntax; + p = q + 1; + blk[i].o = n; + n = strtol(p, &q, 10); + if (q == p || *q != '\n' || n < 0) + goto syntax; + p = q + 1; + blk[i].l = n; + TARFS_DPF(MAP, "%s: %3d %12zu %12zu %12zu\n", __func__, + i, blk[i].i, blk[i].o, blk[i].l); + /* + * Check block alignment if the block is of non-zero + * length (a zero-length block indicates the end of a + * trailing hole). Checking i indirectly checks the + * previous block's l. It's ok for the final block to + * have an uneven length. 
+ */ + if (blk[i].l == 0) { + TARFS_DPF(MAP, "%s: zero-length block\n", __func__); + } else if (blk[i].i % TARFS_BLOCKSIZE != 0 || + blk[i].o % TARFS_BLOCKSIZE != 0) { + TARFS_DPF(MAP, "%s: misaligned map entry\n", __func__); + goto bad; + } + /* + * Check that this block starts after the end of the + * previous one. + */ + if (i > 0 && blk[i].o < blk[i - 1].o + blk[i - 1].l) { + TARFS_DPF(MAP, "%s: overlapping map entries\n", __func__); + goto bad; + } + /* + * Check that the block is within the file, both + * physically and logically. + */ + if (blk[i].i + blk[i].l > tnp->physize || + blk[i].o + blk[i].l > realsize) { + TARFS_DPF(MAP, "%s: map overflow\n", __func__); + goto bad; + } + } + free(map, M_TARFSBLK); + + /* store in node */ + free(tnp->blk, M_TARFSBLK); + tnp->nblk = nblk; + tnp->blk = blk; + tnp->size = realsize; + return (0); +syntax: + TARFS_DPF(MAP, "%s: syntax error in block map\n", __func__); +bad: + free(map, M_TARFSBLK); + free(blk, M_TARFSBLK); + return (EINVAL); +} + +void +tarfs_free_node(struct tarfs_node *tnp) +{ + struct tarfs_mount *tmp; + + MPASS(tnp != NULL); + tmp = tnp->tmp; + + switch (tnp->type) { + case VLNK: + if (tnp->link.name) + free(tnp->link.name, M_TARFSNAME); + break; + default: + break; + } + if (tnp->name != NULL) + free(tnp->name, M_TARFSNAME); + if (tnp->blk != NULL) + free(tnp->blk, M_TARFSBLK); + if (tnp->ino >= TARFS_MININO) + free_unr(tmp->ino_unr, tnp->ino); + free(tnp, M_TARFSNODE); + tmp->nfiles--; +} + +int +tarfs_read_file(struct tarfs_node *tnp, size_t len, struct uio *uiop) +{ + struct uio auio; + size_t resid = len; + size_t copylen; + unsigned int i; + int error; + + TARFS_DPF(VNODE, "%s(%s, %zu, %zu)\n", __func__, + tnp->name, uiop->uio_offset, resid); + for (i = 0; i < tnp->nblk && resid > 0; ++i) { + if (uiop->uio_offset > tnp->blk[i].o + tnp->blk[i].l) { + /* skip this block */ + continue; + } + while (resid > 0 && + uiop->uio_offset < tnp->blk[i].o) { + /* move out some zeroes... 
*/ + copylen = tnp->blk[i].o - uiop->uio_offset; + if (copylen > resid) + copylen = resid; + if (copylen > ZERO_REGION_SIZE) + copylen = ZERO_REGION_SIZE; + auio = *uiop; + auio.uio_offset = 0; + auio.uio_resid = copylen; + error = uiomove(__DECONST(void *, zero_region), + copylen, &auio); + if (error != 0) + return (error); + TARFS_DPF(MAP, "%s(%s) = zero %zu\n", __func__, + tnp->name, copylen - auio.uio_resid); + uiop->uio_offset += copylen - auio.uio_resid; + uiop->uio_resid -= copylen - auio.uio_resid; + resid -= copylen - auio.uio_resid; + } + while (resid > 0 && + uiop->uio_offset < tnp->blk[i].o + tnp->blk[i].l) { + /* now actual data */ + copylen = tnp->blk[i].l; + if (copylen > resid) + copylen = resid; + auio = *uiop; + auio.uio_offset = tnp->offset + tnp->blk[i].i + + uiop->uio_offset - tnp->blk[i].o; + auio.uio_resid = copylen; + error = tarfs_io_read(tnp->tmp, false, &auio); + if (error != 0) + return (error); + TARFS_DPF(MAP, "%s(%s) = data %zu\n", __func__, + tnp->name, copylen - auio.uio_resid); + uiop->uio_offset += copylen - auio.uio_resid; + uiop->uio_resid -= copylen - auio.uio_resid; + resid -= copylen - auio.uio_resid; + } + } + TARFS_DPF(VNODE, "%s(%s) = %zu\n", __func__, + tnp->name, len - resid); + return (0); +} + +/* + * XXX ugly file flag parser which could easily be a finite state machine + * driven by a small precomputed table. + * + * Note that unlike strtofflags(3), we make no attempt to handle negated + * flags, since they shouldn't appear in tar files. 
+ */ +static const struct tarfs_flag { + const char *name; + unsigned int flag; +} tarfs_flags[] = { + { "nodump", UF_NODUMP }, + { "uchg", UF_IMMUTABLE }, + { "uappnd", UF_APPEND }, + { "opaque", UF_OPAQUE }, + { "uunlnk", UF_NOUNLINK }, + { "arch", SF_ARCHIVED }, + { "schg", SF_IMMUTABLE }, + { "sappnd", SF_APPEND }, + { "sunlnk", SF_NOUNLINK }, + { NULL, 0 }, +}; + +unsigned int +tarfs_strtofflags(const char *str, char **end) +{ + const struct tarfs_flag *tf; + const char *p, *q; + unsigned int ret; + + ret = 0; + for (p = q = str; *q != '\0'; p = q + 1) { + for (q = p; *q != '\0' && *q != ','; ++q) { + if (*q < 'a' || *q > 'z') { + goto end; + } + /* nothing */ + } + for (tf = tarfs_flags; tf->name != NULL; tf++) { + if (strncmp(tf->name, p, q - p) == 0 && + tf->name[q - p] == '\0') { + TARFS_DPF(ALLOC, "%s: %.*s = 0x%06x\n", __func__, + (int)(q - p), p, tf->flag); + ret |= tf->flag; + break; + } + } + if (tf->name == NULL) { + TARFS_DPF(ALLOC, "%s: %.*s = 0x??????\n", + __func__, (int)(q - p), p); + goto end; + } + } +end: + if (*end != NULL) { + *end = __DECONST(char *, q); + } + return (ret); +} diff --git a/sys/fs/tarfs/tarfs_vfsops.c b/sys/fs/tarfs/tarfs_vfsops.c new file mode 100644 index 000000000000..fe135116c985 --- /dev/null +++ b/sys/fs/tarfs/tarfs_vfsops.c @@ -0,0 +1,1173 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Juniper Networks, Inc. + * Copyright (c) 2022-2023 Klara, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "opt_tarfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/libkern.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/sbuf.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/vnode.h> + +#include <vm/vm_param.h> + +#include <geom/geom.h> +#include <geom/geom_vfs.h> + +#include <fs/tarfs/tarfs.h> +#include <fs/tarfs/tarfs_dbg.h> + +CTASSERT(ZERO_REGION_SIZE > TARFS_BLOCKSIZE); + +struct ustar_header { + char name[100]; /* File name */ + char mode[8]; /* Mode flags */ + char uid[8]; /* User id */ + char gid[8]; /* Group id */ + char size[12]; /* Size */ + char mtime[12]; /* Modified time */ + char checksum[8]; /* Checksum */ + char typeflag[1]; /* Type */ + char linkname[100]; /* "old format" stops here */ + char magic[6]; /* POSIX UStar "ustar\0" indicator */ + char version[2]; /* POSIX UStar version "00" */ + char uname[32]; /* User name */ + char gname[32]; /* Group name */ + char major[8]; /* Device 
major number */ + char minor[8]; /* Device minor number */ + char prefix[155]; /* Path prefix */ +}; + +#define TAR_EOF ((off_t)-1) + +#define TAR_TYPE_FILE '0' +#define TAR_TYPE_HARDLINK '1' +#define TAR_TYPE_SYMLINK '2' +#define TAR_TYPE_CHAR '3' +#define TAR_TYPE_BLOCK '4' +#define TAR_TYPE_DIRECTORY '5' +#define TAR_TYPE_FIFO '6' +#define TAR_TYPE_CONTIG '7' +#define TAR_TYPE_GLOBAL_EXTHDR 'g' +#define TAR_TYPE_EXTHDR 'x' +#define TAR_TYPE_GNU_SPARSE 'S' + +#define USTAR_MAGIC (uint8_t []){ 'u', 's', 't', 'a', 'r', 0 } +#define USTAR_VERSION (uint8_t []){ '0', '0' } +#define GNUTAR_MAGIC (uint8_t []){ 'u', 's', 't', 'a', 'r', ' ' } +#define GNUTAR_VERSION (uint8_t []){ ' ', '\x0' } + +#define DEFDIRMODE (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH) + +MALLOC_DEFINE(M_TARFSMNT, "tarfs mount", "tarfs mount structures"); +MALLOC_DEFINE(M_TARFSNODE, "tarfs node", "tarfs node structures"); + +static vfs_mount_t tarfs_mount; +static vfs_unmount_t tarfs_unmount; +static vfs_root_t tarfs_root; +static vfs_statfs_t tarfs_statfs; +static vfs_fhtovp_t tarfs_fhtovp; + +static const char *tarfs_opts[] = { + "from", "gid", "mode", "uid", "verify", + NULL +}; + +/* + * Reads a len-width signed octal number from strp. Returns the value. + * XXX Does not report errors. + */ +static int64_t +tarfs_str2octal(const char *strp, size_t len) +{ + int64_t val; + size_t idx; + int sign; + + /* + * Skip leading spaces or tabs. + * XXX why? POSIX requires numeric fields to be 0-padded. + */ + for (idx = 0; idx < len; idx++) + if (strp[idx] != ' ' && strp[idx] != '\t') + break; + + if (idx == len) + return (0); + + if (strp[idx] == '-') { + sign = -1; + idx++; + } else + sign = 1; + + val = 0; + for (; idx < len; idx++) { + if (strp[idx] < '0' || strp[idx] > '7') + break; + val <<= 3; + val += (strp[idx] - '0'); + + /* Truncate on overflow */ + if (val > INT64_MAX / 8) { + val = INT64_MAX; + break; + } + } + + return (sign > 0) ? 
val : -val; +} + +/* + * Reads a len-byte extended numeric value from strp. The first byte has + * bit 7 set to indicate the format; the remaining 7 bits + the (len - 1) + * bytes that follow form a big-endian signed two's complement binary + * number. Returns the value. XXX Does not report errors. + */ +static int64_t +tarfs_str2base256(const char *strp, size_t len) +{ + int64_t val; + size_t idx; + + KASSERT(strp[0] & 0x80, ("not an extended numeric value")); + + /* Sign-extend the first byte */ + if ((strp[0] & 0x40) != 0) + val = (int64_t)-1; + else + val = 0; + val <<= 6; + val |= (strp[0] & 0x3f); + + /* Read subsequent bytes */ + for (idx = 1; idx < len; idx++) { + val <<= 8; + val |= (0xff & (int64_t)strp[idx]); + + /* Truncate on overflow and underflow */ + if (val > INT64_MAX / 256) { + val = INT64_MAX; + break; + } else if (val < INT64_MAX / 256) { + val = INT64_MIN; + break; + } + } + + return (val); +} + +/* + * Read a len-byte numeric field from strp. If bit 7 of the first byte it + * set, assume an extended numeric value (signed two's complement); + * otherwise, assume a signed octal value. + * + * XXX practically no error checking or handling + */ +static int64_t +tarfs_str2int64(const char *strp, size_t len) +{ + + if (len < 1) + return (0); + + if ((strp[0] & 0x80) != 0) + return (tarfs_str2base256(strp, len)); + return (tarfs_str2octal(strp, len)); +} + +/* + * Verifies the checksum of a header. Returns true if the checksum is + * valid, false otherwise. 
+ */ +static boolean_t +tarfs_checksum(struct ustar_header *hdrp) +{ + const unsigned char *ptr; + int64_t checksum, hdrsum; + size_t idx; + + hdrsum = tarfs_str2int64(hdrp->checksum, sizeof(hdrp->checksum)); + TARFS_DPF(CHECKSUM, "%s: header checksum %lx\n", __func__, hdrsum); + + checksum = 0; + for (ptr = (const unsigned char *)hdrp; + ptr < (const unsigned char *)hdrp->checksum; ptr++) + checksum += *ptr; + for (idx = 0; idx < sizeof(hdrp->checksum); idx++) + checksum += 0x20; + for (ptr = (const unsigned char *)hdrp->typeflag; + ptr < (const unsigned char *)(hdrp + 1); ptr++) + checksum += *ptr; + TARFS_DPF(CHECKSUM, "%s: calc unsigned checksum %lx\n", __func__, + checksum); + if (hdrsum == checksum) + return (true); + + /* + * Repeat test with signed bytes, some older formats use a broken + * form of the calculation + */ + checksum = 0; + for (ptr = (const unsigned char *)hdrp; + ptr < (const unsigned char *)&hdrp->checksum; ptr++) + checksum += *((const signed char *)ptr); + for (idx = 0; idx < sizeof(hdrp->checksum); idx++) + checksum += 0x20; + for (ptr = (const unsigned char *)&hdrp->typeflag; + ptr < (const unsigned char *)(hdrp + 1); ptr++) + checksum += *((const signed char *)ptr); + TARFS_DPF(CHECKSUM, "%s: calc signed checksum %lx\n", __func__, + checksum); + if (hdrsum == checksum) + return (true); + + return (false); +} + + +/* + * Looks up a path in the tarfs node tree. + * + * - If the path exists, stores a pointer to the corresponding tarfs_node + * in retnode and a pointer to its parent in retparent. + * + * - If the path does not exist, but create_dirs is true, creates ancestor + * directories and returns NULL in retnode and the parent in retparent. + * + * - If the path does not exist and create_dirs is false, stops at the + * first missing path name component. + * + * - In all cases, on return, endp and sepp point to the beginning and + * end, respectively, of the last-processed path name component. 
+ * + * - Returns 0 if the node was found, ENOENT if it was not, and some other + * positive errno value on failure. + */ +static int +tarfs_lookup_path(struct tarfs_mount *tmp, char *name, size_t namelen, + char **endp, char **sepp, struct tarfs_node **retparent, + struct tarfs_node **retnode, boolean_t create_dirs) +{ + struct componentname cn; + struct tarfs_node *parent, *tnp; + char *sep; + size_t len; + int error; + boolean_t do_lookup; + + MPASS(name != NULL && namelen != 0); + + do_lookup = true; + error = 0; + parent = tnp = tmp->root; + if (tnp == NULL) + panic("%s: root node not yet created", __func__); + + bzero(&cn, sizeof(cn)); + + TARFS_DPF(LOOKUP, "%s: Full path: %.*s\n", __func__, (int)namelen, + name); + + sep = NULL; + for (;;) { + /* skip leading slash(es) */ + while (name[0] == '/' && namelen > 0) + name++, namelen--; + + /* did we reach the end? */ + if (namelen == 0 || name[0] == '\0') { + name = do_lookup ? NULL : cn.cn_nameptr; + namelen = do_lookup ? 0 : cn.cn_namelen; + break; + } + + /* locate the next separator */ + for (sep = name, len = 0; + *sep != '\0' && *sep != '/' && len < namelen; + sep++, len++) + /* nothing */ ; + + /* check for . and .. */ + if (name[0] == '.' && len <= 2) { + if (len == 1) { + /* . */ + name += len; + namelen -= len; + continue; + } else if (name[1] == '.') { + /* .. 
*/ + if (tnp == tmp->root) { + error = EINVAL; + break; + } + tnp = tnp->parent; + parent = tnp->parent; + name += len; + namelen -= len; + continue; + } + } + + /* create parent if necessary */ + if (!do_lookup) { + TARFS_DPF(ALLOC, "%s: creating %.*s\n", __func__, + (int)cn.cn_namelen, cn.cn_nameptr); + error = tarfs_alloc_node(tmp, cn.cn_nameptr, + cn.cn_namelen, VDIR, -1, 0, tmp->mtime, 0, 0, + DEFDIRMODE, 0, NULL, NODEV, parent, &tnp); + if (error != 0) + break; + } + + parent = tnp; + tnp = NULL; + cn.cn_nameptr = name; + cn.cn_namelen = len; + TARFS_DPF(LOOKUP, "%s: Search: %.*s\n", __func__, + (int)cn.cn_namelen, cn.cn_nameptr); + if (do_lookup) { + tnp = tarfs_lookup_node(parent, NULL, &cn); + if (tnp == NULL) { + do_lookup = false; + if (!create_dirs) + break; + } + } + name += cn.cn_namelen; + namelen -= cn.cn_namelen; + } + + TARFS_DPF(LOOKUP, "%s: Parent %p, node %p\n", __func__, parent, tnp); + + if (retparent) + *retparent = parent; + if (retnode) + *retnode = tnp; + if (endp) { + if (namelen > 0) + *endp = name; + else + *endp = NULL; + } + if (sepp) + *sepp = sep; + return (error); +} + +/* + * Frees a tarfs_mount structure and everything it references. 
+ */ +static void +tarfs_free_mount(struct tarfs_mount *tmp) +{ + struct mount *mp; + struct tarfs_node *tnp; + + MPASS(tmp != NULL); + + TARFS_DPF(ALLOC, "%s: Freeing mount structure %p\n", __func__, tmp); + + TARFS_DPF(ALLOC, "%s: freeing tarfs_node structures\n", __func__); + while (!TAILQ_EMPTY(&tmp->allnodes)) { + tnp = TAILQ_FIRST(&tmp->allnodes); + TAILQ_REMOVE(&tmp->allnodes, tnp, entries); + tarfs_free_node(tnp); + } + + (void)tarfs_io_fini(tmp); + + TARFS_DPF(ALLOC, "%s: deleting unr header\n", __func__); + delete_unrhdr(tmp->ino_unr); + mp = tmp->vfs; + mp->mnt_data = NULL; + + TARFS_DPF(ALLOC, "%s: freeing structure\n", __func__); + free(tmp, M_TARFSMNT); +} + +/* + * Processes the tar file header at block offset blknump and allocates and + * populates a tarfs_node structure for the file it describes. Updated + * blknump to point to the next unread tar file block, or TAR_EOF if EOF + * is reached. Returns 0 on success or EOF and a positive errno value on + * failure. + */ +static int +tarfs_alloc_one(struct tarfs_mount *tmp, off_t *blknump) +{ + char block[TARFS_BLOCKSIZE]; + struct ustar_header *hdrp = (struct ustar_header *)block; + struct sbuf *namebuf = NULL; + char *exthdr = NULL, *name = NULL, *link = NULL; + off_t blknum = *blknump; + int endmarker = 0; + char *namep, *sep; + struct tarfs_node *parent, *tnp; + size_t namelen = 0, linklen = 0, realsize = 0, sz; + ssize_t res; + dev_t rdev; + gid_t gid; + mode_t mode; + time_t mtime; + uid_t uid; + long major = -1, minor = -1; + unsigned int flags = 0; + int error; + boolean_t sparse = false; + +again: + /* read next header */ + res = tarfs_io_read_buf(tmp, false, block, + TARFS_BLOCKSIZE * blknum, TARFS_BLOCKSIZE); + if (res < 0) { + error = -res; + goto bad; + } else if (res < TARFS_BLOCKSIZE) { + goto eof; + } + blknum++; + + /* check for end marker */ + if (memcmp(block, zero_region, TARFS_BLOCKSIZE) == 0) { + if (endmarker++) { + if (exthdr != NULL) { + TARFS_DPF(IO, "%s: orphaned extended 
header at %zu\n", + __func__, TARFS_BLOCKSIZE * (blknum - 1)); + free(exthdr, M_TEMP); + } + TARFS_DPF(IO, "%s: end of archive at %zu\n", __func__, + TARFS_BLOCKSIZE * blknum); + tmp->nblocks = blknum; + *blknump = TAR_EOF; + return (0); + } + goto again; + } + + /* verify magic */ + if (memcmp(hdrp->magic, USTAR_MAGIC, sizeof(USTAR_MAGIC)) == 0 && + memcmp(hdrp->version, USTAR_VERSION, sizeof(USTAR_VERSION)) == 0) { + /* POSIX */ + } else if (memcmp(hdrp->magic, GNUTAR_MAGIC, sizeof(GNUTAR_MAGIC)) == 0 && + memcmp(hdrp->magic, GNUTAR_MAGIC, sizeof(GNUTAR_MAGIC)) == 0) { + TARFS_DPF(ALLOC, "%s: GNU tar format at %zu\n", __func__, + TARFS_BLOCKSIZE * (blknum - 1)); + error = EFTYPE; + goto bad; + } else { + TARFS_DPF(ALLOC, "%s: unsupported TAR format at %zu\n", + __func__, TARFS_BLOCKSIZE * (blknum - 1)); + error = EINVAL; + goto bad; + } + + /* verify checksum */ + if (!tarfs_checksum(hdrp)) { + TARFS_DPF(ALLOC, "%s: header checksum failed at %zu\n", + __func__, TARFS_BLOCKSIZE * (blknum - 1)); + error = EINVAL; + goto bad; + } + + /* get standard attributes */ + mode = tarfs_str2int64(hdrp->mode, sizeof(hdrp->mode)); + uid = tarfs_str2int64(hdrp->uid, sizeof(hdrp->uid)); + gid = tarfs_str2int64(hdrp->gid, sizeof(hdrp->gid)); + sz = tarfs_str2int64(hdrp->size, sizeof(hdrp->size)); + mtime = tarfs_str2int64(hdrp->mtime, sizeof(hdrp->mtime)); + rdev = NODEV; + TARFS_DPF(ALLOC, "%s: [%c] %zu @%jd %o %d:%d\n", __func__, + hdrp->typeflag[0], sz, (intmax_t)mtime, mode, uid, gid); + + /* extended header? 
*/ + if (hdrp->typeflag[0] == TAR_TYPE_GLOBAL_EXTHDR) { + printf("%s: unsupported global extended header at %zd\n", + __func__, TARFS_BLOCKSIZE * (blknum - 1)); + error = EFTYPE; + goto bad; + } + if (hdrp->typeflag[0] == TAR_TYPE_EXTHDR) { + if (exthdr != NULL) { + TARFS_DPF(IO, "%s: multiple extended headers at %zu\n", + __func__, TARFS_BLOCKSIZE * (blknum - 1)); + error = EFTYPE; + goto bad; + } + /* read the contents of the exthdr */ + TARFS_DPF(ALLOC, "%s: %zu-byte extended header at %zd\n", + __func__, sz, TARFS_BLOCKSIZE * (blknum - 1)); + exthdr = malloc(sz, M_TEMP, M_WAITOK); + res = tarfs_io_read_buf(tmp, false, exthdr, + TARFS_BLOCKSIZE * blknum, sz); + if (res < 0) { + error = -res; + goto bad; + } + if (res < sz) { + goto eof; + } + blknum += TARFS_SZ2BLKS(res); + /* XXX TODO: refactor this parser */ + char *line = exthdr; + while (line < exthdr + sz) { + char *eol, *key, *value, *sep; + size_t len = strtoul(line, &sep, 10); + if (len == 0 || sep == line || *sep != ' ') { + TARFS_DPF(ALLOC, "%s: exthdr syntax error\n", + __func__); + error = EINVAL; + goto bad; + } + if (line + len > exthdr + sz) { + TARFS_DPF(ALLOC, "%s: exthdr overflow\n", + __func__); + error = EINVAL; + goto bad; + } + eol = line + len - 1; + *eol = '\0'; + line += len; + key = sep + 1; + sep = strchr(key, '='); + if (sep == NULL) { + TARFS_DPF(ALLOC, "%s: exthdr syntax error\n", + __func__); + error = EINVAL; + goto bad; + } + *sep = '\0'; + value = sep + 1; + TARFS_DPF(ALLOC, "%s: exthdr %s=%s\n", __func__, + key, value); + if (strcmp(key, "linkpath") == 0) { + link = value; + linklen = eol - value; + } else if (strcmp(key, "GNU.sparse.major") == 0) { + sparse = true; + major = strtol(value, &sep, 10); + if (sep != eol) { + printf("exthdr syntax error\n"); + error = EINVAL; + goto bad; + } + } else if (strcmp(key, "GNU.sparse.minor") == 0) { + sparse = true; + minor = strtol(value, &sep, 10); + if (sep != eol) { + printf("exthdr syntax error\n"); + error = EINVAL; + goto bad; + } 
+ } else if (strcmp(key, "GNU.sparse.name") == 0) { + sparse = true; + name = value; + namelen = eol - value; + if (namelen == 0) { + printf("exthdr syntax error\n"); + error = EINVAL; + goto bad; + } + } else if (strcmp(key, "GNU.sparse.realsize") == 0) { + sparse = true; + realsize = strtoul(value, &sep, 10); + if (sep != eol) { + printf("exthdr syntax error\n"); + error = EINVAL; + goto bad; + } + } else if (strcmp(key, "SCHILY.fflags") == 0) { + flags |= tarfs_strtofflags(value, &sep); + if (sep != eol) { + printf("exthdr syntax error\n"); + error = EINVAL; + goto bad; + } + } + } + goto again; + } + + /* sparse file consistency checks */ + if (sparse) { + TARFS_DPF(ALLOC, "%s: %s: sparse %ld.%ld (%zu bytes)\n", __func__, + name, major, minor, realsize); + if (major != 1 || minor != 0 || name == NULL || realsize == 0 || + hdrp->typeflag[0] != TAR_TYPE_FILE) { + TARFS_DPF(ALLOC, "%s: invalid sparse format\n", __func__); + error = EINVAL; + goto bad; + } + } + + /* file name */ + if (name == NULL) { + if (hdrp->prefix[0] != '\0') { + namebuf = sbuf_new_auto(); + sbuf_printf(namebuf, "%.*s/%.*s", + (int)sizeof(hdrp->prefix), hdrp->prefix, + (int)sizeof(hdrp->name), hdrp->name); + sbuf_finish(namebuf); + name = sbuf_data(namebuf); + namelen = sbuf_len(namebuf); + } else { + name = hdrp->name; + namelen = strnlen(hdrp->name, sizeof(hdrp->name)); + } + } + + error = tarfs_lookup_path(tmp, name, namelen, &namep, + &sep, &parent, &tnp, true); + if (error != 0) + goto bad; + if (tnp != NULL) { + if (hdrp->typeflag[0] == TAR_TYPE_DIRECTORY) { + /* XXX set attributes? 
*/ + goto skip; + } + TARFS_DPF(ALLOC, "%s: duplicate file %.*s\n", __func__, + (int)namelen, name); + error = EINVAL; + goto bad; + } + switch (hdrp->typeflag[0]) { + case TAR_TYPE_DIRECTORY: + error = tarfs_alloc_node(tmp, namep, sep - namep, VDIR, + 0, 0, mtime, uid, gid, mode, flags, NULL, 0, + parent, &tnp); + break; + case TAR_TYPE_FILE: + error = tarfs_alloc_node(tmp, namep, sep - namep, VREG, + blknum * TARFS_BLOCKSIZE, sz, mtime, uid, gid, mode, + flags, NULL, 0, parent, &tnp); + if (error == 0 && sparse) { + error = tarfs_load_blockmap(tnp, realsize); + } + break; + case TAR_TYPE_HARDLINK: + if (link == NULL) { + link = hdrp->linkname; + linklen = strnlen(link, sizeof(hdrp->linkname)); + } + error = tarfs_alloc_node(tmp, namep, sep - namep, VREG, + 0, 0, 0, 0, 0, 0, 0, NULL, 0, parent, &tnp); + if (error != 0) { + goto bad; + } + error = tarfs_lookup_path(tmp, link, linklen, NULL, + NULL, NULL, &tnp->other, false); + if (tnp->other == NULL || + tnp->other->type != VREG || + tnp->other->other != NULL) { + TARFS_DPF(ALLOC, "%s: %.*s: dead hard link to %.*s\n", + __func__, (int)namelen, name, (int)linklen, link); + error = EINVAL; + goto bad; + } + break; + case TAR_TYPE_SYMLINK: + if (link == NULL) { + link = hdrp->linkname; + linklen = strnlen(link, sizeof(hdrp->linkname)); + } + error = tarfs_alloc_node(tmp, namep, sep - namep, VLNK, + 0, linklen, mtime, uid, gid, mode, flags, link, 0, + parent, &tnp); + break; + case TAR_TYPE_BLOCK: + major = tarfs_str2int64(hdrp->major, sizeof(hdrp->major)); + minor = tarfs_str2int64(hdrp->minor, sizeof(hdrp->minor)); + rdev = makedev(major, minor); + error = tarfs_alloc_node(tmp, namep, sep - namep, VBLK, + 0, 0, mtime, uid, gid, mode, flags, NULL, rdev, + parent, &tnp); + break; + case TAR_TYPE_CHAR: + major = tarfs_str2int64(hdrp->major, sizeof(hdrp->major)); + minor = tarfs_str2int64(hdrp->minor, sizeof(hdrp->minor)); + rdev = makedev(major, minor); + error = tarfs_alloc_node(tmp, namep, sep - namep, VCHR, + 0, 0, 
mtime, uid, gid, mode, flags, NULL, rdev, + parent, &tnp); + break; + default: + TARFS_DPF(ALLOC, "%s: unsupported type %c for %.*s\n", + __func__, hdrp->typeflag[0], (int)namelen, name); + error = EINVAL; + break; + } + if (error != 0) + goto bad; + +skip: + blknum += TARFS_SZ2BLKS(sz); + tmp->nblocks = blknum; + *blknump = blknum; + if (exthdr != NULL) { + free(exthdr, M_TEMP); + } + if (namebuf != NULL) { + sbuf_delete(namebuf); + } + return (0); +eof: + TARFS_DPF(IO, "%s: premature end of file\n", __func__); + error = EIO; + goto bad; +bad: + if (exthdr != NULL) { + free(exthdr, M_TEMP); + } + if (namebuf != NULL) { + sbuf_delete(namebuf); + } + return (error); +} + +/* + * Allocates and populates the metadata structures for the tar file + * referenced by vp. On success, a pointer to the tarfs_mount structure + * is stored in tmpp. Returns 0 on success or a positive errno value on + * failure. + */ +static int +tarfs_alloc_mount(struct mount *mp, struct vnode *vp, + uid_t root_uid, gid_t root_gid, mode_t root_mode, + struct tarfs_mount **tmpp) +{ + struct vattr va; + struct thread *td = curthread; + char *fullpath; + struct tarfs_mount *tmp; + struct tarfs_node *root; + off_t blknum; + time_t mtime; + int error; + + KASSERT(tmpp != NULL, ("tarfs mount return is NULL")); + ASSERT_VOP_LOCKED(vp, __func__); + + tmp = NULL; + fullpath = NULL; + + TARFS_DPF(ALLOC, "%s: Allocating tarfs mount structure for vp %p\n", + __func__, vp); + + /* Get source metadata */ + error = VOP_GETATTR(vp, &va, td->td_ucred); + if (error != 0) { + return (error); + } + VOP_UNLOCK(vp); + mtime = va.va_mtime.tv_sec; + + /* Allocate and initialize tarfs mount structure */ + tmp = (struct tarfs_mount *)malloc(sizeof(struct tarfs_mount), + M_TARFSMNT, M_WAITOK | M_ZERO); + TARFS_DPF(ALLOC, "%s: Allocated mount structure\n", __func__); + mp->mnt_data = tmp; + + mtx_init(&tmp->allnode_lock, "tarfs allnode lock", NULL, + MTX_DEF); + TAILQ_INIT(&tmp->allnodes); + tmp->ino_unr = 
new_unrhdr(TARFS_MININO, INT_MAX, &tmp->allnode_lock); + tmp->vp = vp; + tmp->vfs = mp; + tmp->mtime = mtime; + + /* + * XXX The decompression layer passes everything through the + * buffer cache, and the buffer cache wants to know our blocksize, + * but mnt_stat normally isn't populated until after we return, so + * we have to cheat a bit. + */ + tmp->iosize = 1U << tarfs_ioshift; + mp->mnt_stat.f_iosize = tmp->iosize; + + /* Initialize decompression layer */ + error = tarfs_io_init(tmp); + if (error != 0) + goto bad; + + error = tarfs_alloc_node(tmp, NULL, 0, VDIR, 0, 0, mtime, root_uid, + root_gid, root_mode & ALLPERMS, 0, NULL, NODEV, NULL, &root); + if (error != 0 || root == NULL) + goto bad; + tmp->root = root; + + blknum = 0; + do { + if ((error = tarfs_alloc_one(tmp, &blknum)) != 0) { + goto bad; + } + } while (blknum != TAR_EOF); + + *tmpp = tmp; + + TARFS_DPF(ALLOC, "%s: pfsmnt_root %p\n", __func__, tmp->root); + return (0); + +bad: + if (tmp != NULL) + tarfs_free_mount(tmp); + free(fullpath, M_TEMP); + return (error); +} + +/* + * VFS Operations. 
+ */ + +static int +tarfs_mount(struct mount *mp) +{ + struct nameidata nd; + struct vattr va; + struct tarfs_mount *tmp = NULL; + struct thread *td = curthread; + struct vnode *vp; + char *from; + uid_t root_uid; + gid_t root_gid; + mode_t root_mode; + int error, flags, len; + + if (mp->mnt_flag & MNT_UPDATE) + return (EOPNOTSUPP); + + if (vfs_filteropt(mp->mnt_optnew, tarfs_opts)) + return (EINVAL); + + vn_lock(mp->mnt_vnodecovered, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(mp->mnt_vnodecovered, &va, mp->mnt_cred); + VOP_UNLOCK(mp->mnt_vnodecovered); + if (error) + return (error); + + if (mp->mnt_cred->cr_ruid != 0 || + vfs_scanopt(mp->mnt_optnew, "gid", "%d", &root_gid) != 1) + root_gid = va.va_gid; + if (mp->mnt_cred->cr_ruid != 0 || + vfs_scanopt(mp->mnt_optnew, "uid", "%d", &root_uid) != 1) + root_uid = va.va_uid; + if (mp->mnt_cred->cr_ruid != 0 || + vfs_scanopt(mp->mnt_optnew, "mode", "%ho", &root_mode) != 1) + root_mode = va.va_mode; + + error = vfs_getopt(mp->mnt_optnew, "from", (void **)&from, &len); + if (error != 0 || from[len - 1] != '\0') + return (EINVAL); + + /* Find the source tarball */ + TARFS_DPF(FS, "%s(%s, uid=%u, gid=%u, mode=%o)\n", __func__, + from, root_uid, root_gid, root_mode); + flags = FREAD; + if (vfs_flagopt(mp->mnt_optnew, "verify", NULL, 0)) { + flags |= O_VERIFY; + } + NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF, UIO_SYSSPACE, from); + error = namei(&nd); + if (error != 0) + return (error); + NDFREE_PNBUF(&nd); + vp = nd.ni_vp; + TARFS_DPF(FS, "%s: N: hold %u use %u lock 0x%x\n", __func__, + vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp)); + /* vp is now held and locked */ + + /* Open the source tarball */ + error = vn_open_vnode(vp, flags, td->td_ucred, td, NULL); + if (error != 0) { + TARFS_DPF(FS, "%s: failed to open %s: %d\n", __func__, + from, error); + vput(vp); + goto bad; + } + TARFS_DPF(FS, "%s: O: hold %u use %u lock 0x%x\n", __func__, + vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp)); + if (vp->v_type != VREG) 
{ + TARFS_DPF(FS, "%s: not a regular file\n", __func__); + error = EOPNOTSUPP; + goto bad_open_locked; + } + error = priv_check(td, PRIV_VFS_MOUNT_PERM); + if (error != 0) { + TARFS_DPF(FS, "%s: not permitted to mount\n", __func__); + goto bad_open_locked; + } + if (flags & O_VERIFY) { + mp->mnt_flag |= MNT_VERIFIED; + } + + /* Allocate the tarfs mount */ + error = tarfs_alloc_mount(mp, vp, root_uid, root_gid, root_mode, &tmp); + /* vp is now held but unlocked */ + if (error != 0) { + TARFS_DPF(FS, "%s: failed to mount %s: %d\n", __func__, + from, error); + goto bad_open_unlocked; + } + TARFS_DPF(FS, "%s: M: hold %u use %u lock 0x%x\n", __func__, + vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp)); + + /* Unconditionally mount as read-only */ + MNT_ILOCK(mp); + mp->mnt_flag |= (MNT_LOCAL | MNT_RDONLY); + MNT_IUNLOCK(mp); + + vfs_getnewfsid(mp); + vfs_mountedfrom(mp, "tarfs"); + TARFS_DPF(FS, "%s: success\n", __func__); + + return (0); + +bad_open_locked: + /* vp must be held and locked */ + TARFS_DPF(FS, "%s: L: hold %u use %u lock 0x%x\n", __func__, + vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp)); + VOP_UNLOCK(vp); +bad_open_unlocked: + /* vp must be held and unlocked */ + TARFS_DPF(FS, "%s: E: hold %u use %u lock 0x%x\n", __func__, + vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp)); + (void)vn_close(vp, flags, td->td_ucred, td); +bad: + /* vp must be released and unlocked */ + TARFS_DPF(FS, "%s: X: hold %u use %u lock 0x%x\n", __func__, + vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp)); + return (error); +} + +/* + * Unmounts a tarfs filesystem. 
+ */ +static int +tarfs_unmount(struct mount *mp, int mntflags) +{ + struct thread *td = curthread; + struct tarfs_mount *tmp; + struct vnode *vp; + int error; + int flags = 0; + + TARFS_DPF(FS, "%s: Unmounting %p\n", __func__, mp); + + /* Handle forced unmounts */ + if (mntflags & MNT_FORCE) + flags |= FORCECLOSE; + + /* Finalize all pending I/O */ + error = vflush(mp, 0, flags, curthread); + if (error != 0) + return (error); + tmp = MP_TO_TARFS_MOUNT(mp); + vp = tmp->vp; + + MPASS(vp != NULL); + TARFS_DPF(FS, "%s: U: hold %u use %u lock 0x%x\n", __func__, + vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp)); + vn_close(vp, FREAD, td->td_ucred, td); + TARFS_DPF(FS, "%s: C: hold %u use %u lock 0x%x\n", __func__, + vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp)); + tarfs_free_mount(tmp); + + return (0); +} + +/* + * Gets the root of a tarfs filesystem. Returns 0 on success or a + * positive errno value on failure. + */ +static int +tarfs_root(struct mount *mp, int flags, struct vnode **vpp) +{ + struct vnode *nvp; + int error; + + TARFS_DPF(FS, "%s: Getting root vnode\n", __func__); + + error = VFS_VGET(mp, TARFS_ROOTINO, LK_EXCLUSIVE, &nvp); + if (error != 0) + return (error); + + nvp->v_vflag |= VV_ROOT; + *vpp = nvp; + return (0); +} + +/* + * Gets statistics for a tarfs filesystem. Returns 0. + */ +static int +tarfs_statfs(struct mount *mp, struct statfs *sbp) +{ + struct tarfs_mount *tmp; + + tmp = MP_TO_TARFS_MOUNT(mp); + + sbp->f_bsize = TARFS_BLOCKSIZE; + sbp->f_iosize = tmp->iosize; + sbp->f_blocks = tmp->nblocks; + sbp->f_bfree = 0; + sbp->f_bavail = 0; + sbp->f_files = tmp->nfiles; + sbp->f_ffree = 0; + + return (0); +} + +/* + * Gets a vnode for the given inode. On success, a pointer to the vnode + * is stored in vpp. Returns 0 on success or a positive errno value on + * failure. 
+ */ +static int +tarfs_vget(struct mount *mp, ino_t ino, int lkflags, struct vnode **vpp) +{ + struct tarfs_mount *tmp; + struct tarfs_node *tnp; + struct thread *td; + struct vnode *vp; + int error; + + TARFS_DPF(FS, "%s: mp %p, ino %lu, lkflags %d\n", __func__, mp, ino, + lkflags); + + td = curthread; + error = vfs_hash_get(mp, ino, lkflags, td, vpp, NULL, NULL); + if (error != 0) + return (error); + + if (*vpp != NULL) { + TARFS_DPF(FS, "%s: found hashed vnode %p\n", __func__, *vpp); + return (error); + } + + TARFS_DPF(FS, "%s: no hashed vnode for inode %lu\n", __func__, ino); + + tmp = MP_TO_TARFS_MOUNT(mp); + + if (ino == TARFS_ZIOINO) { + error = vget(tmp->znode, lkflags); + if (error != 0) + return (error); + *vpp = tmp->znode; + return (0); + } + + /* XXX Should use hash instead? */ + TAILQ_FOREACH(tnp, &tmp->allnodes, entries) { + if (tnp->ino == ino) + break; + } + TARFS_DPF(FS, "%s: search of all nodes found %p\n", __func__, tnp); + if (tnp == NULL) + return (ENOENT); + + error = getnewvnode("tarfs", mp, &tarfs_vnodeops, &vp); + if (error != 0) + goto bad; + TARFS_DPF(FS, "%s: allocated vnode\n", __func__); + vp->v_data = tnp; + vp->v_type = tnp->type; + tnp->vnode = vp; + + lockmgr(vp->v_vnlock, lkflags, NULL); + error = insmntque(vp, mp); + if (error != 0) + goto bad; + TARFS_DPF(FS, "%s: inserting entry into VFS hash\n", __func__); + error = vfs_hash_insert(vp, ino, lkflags, td, vpp, NULL, NULL); + if (error != 0 || *vpp != NULL) + return (error); + + vn_set_state(vp, VSTATE_CONSTRUCTED); + *vpp = vp; + return (0); + +bad: + *vpp = NULLVP; + return (error); +} + +static int +tarfs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) +{ + struct tarfs_node *tnp; + struct tarfs_fid *tfp; + struct vnode *nvp; + int error; + + tfp = (struct tarfs_fid *)fhp; + MP_TO_TARFS_MOUNT(mp); + if (tfp->ino < TARFS_ROOTINO || tfp->ino > INT_MAX) + return (ESTALE); + + error = VFS_VGET(mp, tfp->ino, LK_EXCLUSIVE, &nvp); + if (error != 0) { + *vpp 
= NULLVP; + return (error); + } + tnp = VP_TO_TARFS_NODE(nvp); + if (tnp->mode == 0 || + tnp->gen != tfp->gen || + tnp->nlink <= 0) { + vput(nvp); + *vpp = NULLVP; + return (ESTALE); + } + *vpp = nvp; + return (0); +} + +static struct vfsops tarfs_vfsops = { + .vfs_fhtovp = tarfs_fhtovp, + .vfs_mount = tarfs_mount, + .vfs_root = tarfs_root, + .vfs_statfs = tarfs_statfs, + .vfs_unmount = tarfs_unmount, + .vfs_vget = tarfs_vget, +}; +VFS_SET(tarfs_vfsops, tarfs, VFCF_READONLY); +MODULE_VERSION(tarfs, 1); +MODULE_DEPEND(tarfs, xz, 1, 1, 1); diff --git a/sys/fs/tarfs/tarfs_vnops.c b/sys/fs/tarfs/tarfs_vnops.c new file mode 100644 index 000000000000..a40499982229 --- /dev/null +++ b/sys/fs/tarfs/tarfs_vnops.c @@ -0,0 +1,642 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Juniper Networks, Inc. + * Copyright (c) 2022-2023 Klara, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "opt_tarfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/dirent.h> +#include <sys/fcntl.h> +#include <sys/limits.h> +#include <sys/mount.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/vnode.h> + +#include <fs/tarfs/tarfs.h> +#include <fs/tarfs/tarfs_dbg.h> + +static int +tarfs_open(struct vop_open_args *ap) +{ + struct tarfs_node *tnp; + struct vnode *vp; + + vp = ap->a_vp; + MPASS(VOP_ISLOCKED(vp)); + tnp = VP_TO_TARFS_NODE(vp); + + TARFS_DPF(VNODE, "%s(%p=%s, %o)\n", __func__, + tnp, tnp->name, ap->a_mode); + + if (vp->v_type != VREG && vp->v_type != VDIR) + return (EOPNOTSUPP); + + vnode_create_vobject(vp, tnp->size, ap->a_td); + return (0); +} + +static int +tarfs_close(struct vop_close_args *ap) +{ +#ifdef TARFS_DEBUG + struct tarfs_node *tnp; + struct vnode *vp; + + vp = ap->a_vp; + + MPASS(VOP_ISLOCKED(vp)); + tnp = VP_TO_TARFS_NODE(vp); + + TARFS_DPF(VNODE, "%s(%p=%s)\n", __func__, + tnp, tnp->name); +#else + (void)ap; +#endif + return (0); +} + +static int +tarfs_access(struct vop_access_args *ap) +{ + struct tarfs_node *tnp; + struct vnode *vp; + accmode_t accmode; + struct ucred *cred; + int error; + + vp = ap->a_vp; + accmode = ap->a_accmode; + cred = ap->a_cred; + + MPASS(VOP_ISLOCKED(vp)); + tnp = VP_TO_TARFS_NODE(vp); + + TARFS_DPF(VNODE, "%s(%p=%s, %o)\n", __func__, + tnp, tnp->name, accmode); + + switch (vp->v_type) { + case 
VDIR: + case VLNK: + case VREG: + if ((accmode & VWRITE) != 0) + return (EROFS); + break; + case VBLK: + case VCHR: + case VFIFO: + break; + default: + return (EINVAL); + } + + if ((accmode & VWRITE) != 0) + return (EPERM); + + error = vaccess(vp->v_type, tnp->mode, tnp->uid, + tnp->gid, accmode, cred); + return (error); +} + +static int +tarfs_getattr(struct vop_getattr_args *ap) +{ + struct tarfs_node *tnp; + struct vnode *vp; + struct vattr *vap; + + vp = ap->a_vp; + vap = ap->a_vap; + tnp = VP_TO_TARFS_NODE(vp); + + TARFS_DPF(VNODE, "%s(%p=%s)\n", __func__, + tnp, tnp->name); + + vap->va_type = vp->v_type; + vap->va_mode = tnp->mode; + vap->va_nlink = tnp->nlink; + vap->va_gid = tnp->gid; + vap->va_uid = tnp->uid; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + vap->va_fileid = tnp->ino; + vap->va_size = tnp->size; + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + vap->va_atime = tnp->atime; + vap->va_ctime = tnp->ctime; + vap->va_mtime = tnp->mtime; + vap->va_birthtime = tnp->birthtime; + vap->va_gen = tnp->gen; + vap->va_flags = tnp->flags; + vap->va_rdev = (vp->v_type == VBLK || vp->v_type == VCHR) ? + tnp->rdev : NODEV; + vap->va_bytes = round_page(tnp->physize); + vap->va_filerev = 0; + + return (0); +} + +static int +tarfs_lookup(struct vop_cachedlookup_args *ap) +{ + struct tarfs_node *dirnode, *parent, *tnp; + struct componentname *cnp; + struct vnode *dvp, **vpp; +#ifdef TARFS_DEBUG + struct vnode *vp; +#endif + int error; + + dvp = ap->a_dvp; + vpp = ap->a_vpp; + cnp = ap->a_cnp; + + *vpp = NULLVP; + dirnode = VP_TO_TARFS_NODE(dvp); + parent = dirnode->parent; + tnp = NULL; + + TARFS_DPF(LOOKUP, "%s(%p=%s, %.*s)\n", __func__, + dirnode, dirnode->name, + (int)cnp->cn_namelen, cnp->cn_nameptr); + + error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, curthread); + if (error != 0) + return (error); + + if (cnp->cn_flags & ISDOTDOT) { + /* Do not allow .. 
on the root node */ + if (parent == NULL || parent == dirnode) + return (ENOENT); + + /* Allocate a new vnode on the matching entry */ + error = vn_vget_ino(dvp, parent->ino, cnp->cn_lkflags, + vpp); + if (error != 0) + return (error); + } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { + VREF(dvp); + *vpp = dvp; +#ifdef TARFS_DEBUG + } else if (dirnode == dirnode->tmp->root && + (vp = dirnode->tmp->znode) != NULL && + cnp->cn_namelen == TARFS_ZIO_NAMELEN && + memcmp(cnp->cn_nameptr, TARFS_ZIO_NAME, TARFS_ZIO_NAMELEN) == 0) { + error = vn_lock(vp, cnp->cn_lkflags); + if (error != 0) + return (error); + vref(vp); + *vpp = vp; + return (0); +#endif + } else { + tnp = tarfs_lookup_node(dirnode, NULL, cnp); + if (tnp == NULL) { + TARFS_DPF(LOOKUP, "%s(%p=%s, %.*s): file not found\n", __func__, + dirnode, dirnode->name, + (int)cnp->cn_namelen, cnp->cn_nameptr); + return (ENOENT); + } + + if ((cnp->cn_flags & ISLASTCN) == 0 && + (tnp->type != VDIR && tnp->type != VLNK)) + return (ENOTDIR); + + error = vn_vget_ino(dvp, tnp->ino, cnp->cn_lkflags, vpp); + if (error != 0) + return (error); + } + +#ifdef TARFS_DEBUG + if (tnp == NULL) + tnp = VP_TO_TARFS_NODE(*vpp); + TARFS_DPF(LOOKUP, "%s: found vnode %p, tarfs_node %p\n", __func__, + *vpp, tnp); +#endif /* TARFS_DEBUG */ + + /* Store the result the the cache if MAKEENTRY is specified in flags */ + if ((cnp->cn_flags & MAKEENTRY) != 0 && cnp->cn_nameiop != CREATE) + cache_enter(dvp, *vpp, cnp); + + return (error); +} + +static int +tarfs_readdir(struct vop_readdir_args *ap) +{ + struct dirent cde; + struct tarfs_node *current, *tnp; + struct vnode *vp; + struct uio *uio; + int *eofflag; + u_long **cookies; + int *ncookies; + off_t off; + u_int idx, ndirents; + int error; + + vp = ap->a_vp; + uio = ap->a_uio; + eofflag = ap->a_eofflag; + cookies = ap->a_cookies; + ncookies = ap->a_ncookies; + + if (vp->v_type != VDIR) + return (ENOTDIR); + + tnp = VP_TO_TARFS_NODE(vp); + off = uio->uio_offset; + current = NULL; 
+ ndirents = 0; + + TARFS_DPF(VNODE, "%s(%p=%s, %zu, %zd)\n", __func__, + tnp, tnp->name, uio->uio_offset, uio->uio_resid); + + if (uio->uio_offset == TARFS_COOKIE_EOF) { + TARFS_DPF(VNODE, "%s: EOF\n", __func__); + return (0); + } + + if (uio->uio_offset == TARFS_COOKIE_DOT) { + TARFS_DPF(VNODE, "%s: Generating . entry\n", __func__); + /* fake . entry */ + cde.d_fileno = tnp->ino; + cde.d_type = DT_DIR; + cde.d_namlen = 1; + cde.d_name[0] = '.'; + cde.d_name[1] = '\0'; + cde.d_reclen = GENERIC_DIRSIZ(&cde); + if (cde.d_reclen > uio->uio_resid) + goto full; + error = uiomove(&cde, cde.d_reclen, uio); + if (error) + return (error); + /* next is .. */ + uio->uio_offset = TARFS_COOKIE_DOTDOT; + ndirents++; + } + + if (uio->uio_offset == TARFS_COOKIE_DOTDOT) { + TARFS_DPF(VNODE, "%s: Generating .. entry\n", __func__); + /* fake .. entry */ + MPASS(tnp->parent != NULL); + TARFS_NODE_LOCK(tnp->parent); + cde.d_fileno = tnp->parent->ino; + TARFS_NODE_UNLOCK(tnp->parent); + cde.d_type = DT_DIR; + cde.d_namlen = 2; + cde.d_name[0] = '.'; + cde.d_name[1] = '.'; + cde.d_name[2] = '\0'; + cde.d_reclen = GENERIC_DIRSIZ(&cde); + if (cde.d_reclen > uio->uio_resid) + goto full; + error = uiomove(&cde, cde.d_reclen, uio); + if (error) + return (error); + /* next is first child */ + current = TAILQ_FIRST(&tnp->dir.dirhead); + if (current == NULL) + goto done; + uio->uio_offset = current->ino; + TARFS_DPF(VNODE, "%s: [%u] setting current node to %p=%s\n", + __func__, ndirents, current, current->name); + ndirents++; + } + + /* resuming previous call */ + if (current == NULL) { + current = tarfs_lookup_dir(tnp, uio->uio_offset); + if (current == NULL) { + error = EINVAL; + goto done; + } + uio->uio_offset = current->ino; + TARFS_DPF(VNODE, "%s: [%u] setting current node to %p=%s\n", + __func__, ndirents, current, current->name); + } + + for (;;) { + cde.d_fileno = current->ino; + switch (current->type) { + case VBLK: + cde.d_type = DT_BLK; + break; + case VCHR: + cde.d_type = DT_CHR; + 
break; + case VDIR: + cde.d_type = DT_DIR; + break; + case VFIFO: + cde.d_type = DT_FIFO; + break; + case VLNK: + cde.d_type = DT_LNK; + break; + case VREG: + cde.d_type = DT_REG; + break; + default: + panic("%s: tarfs_node %p, type %d\n", __func__, + current, current->type); + } + cde.d_namlen = current->namelen; + MPASS(tnp->namelen < sizeof(cde.d_name)); + (void)memcpy(cde.d_name, current->name, current->namelen); + cde.d_name[current->namelen] = '\0'; + cde.d_reclen = GENERIC_DIRSIZ(&cde); + if (cde.d_reclen > uio->uio_resid) + goto full; + error = uiomove(&cde, cde.d_reclen, uio); + if (error != 0) + goto done; + ndirents++; + /* next sibling */ + current = TAILQ_NEXT(current, dirents); + if (current == NULL) + goto done; + uio->uio_offset = current->ino; + TARFS_DPF(VNODE, "%s: [%u] setting current node to %p=%s\n", + __func__, ndirents, current, current->name); + } +full: + if (cde.d_reclen > uio->uio_resid) { + TARFS_DPF(VNODE, "%s: out of space, returning\n", + __func__); + error = (ndirents == 0) ? 
EINVAL : 0; + } +done: + TARFS_DPF(VNODE, "%s: %u entries written\n", __func__, ndirents); + TARFS_DPF(VNODE, "%s: saving cache information\n", __func__); + if (current == NULL) { + uio->uio_offset = TARFS_COOKIE_EOF; + tnp->dir.lastcookie = 0; + tnp->dir.lastnode = NULL; + } else { + tnp->dir.lastcookie = current->ino; + tnp->dir.lastnode = current; + } + + if (eofflag != NULL) { + TARFS_DPF(VNODE, "%s: Setting EOF flag\n", __func__); + *eofflag = (error == 0 && current == NULL); + } + + /* Update for NFS */ + if (error == 0 && cookies != NULL && ncookies != NULL) { + TARFS_DPF(VNODE, "%s: Updating NFS cookies\n", __func__); + current = NULL; + *cookies = malloc(ndirents * sizeof(off_t), M_TEMP, M_WAITOK); + *ncookies = ndirents; + for (idx = 0; idx < ndirents; idx++) { + if (off == TARFS_COOKIE_DOT) + off = TARFS_COOKIE_DOTDOT; + else { + if (off == TARFS_COOKIE_DOTDOT) { + current = TAILQ_FIRST(&tnp->dir.dirhead); + } else if (current != NULL) { + current = TAILQ_NEXT(current, dirents); + } else { + current = tarfs_lookup_dir(tnp, off); + current = TAILQ_NEXT(current, dirents); + } + if (current == NULL) + off = TARFS_COOKIE_EOF; + else + off = current->ino; + } + + TARFS_DPF(VNODE, "%s: [%u] offset %zu\n", __func__, + idx, off); + (*cookies)[idx] = off; + } + MPASS(uio->uio_offset == off); + } + + return (error); +} + +static int +tarfs_read(struct vop_read_args *ap) +{ + struct tarfs_node *tnp; + struct uio *uiop; + struct vnode *vp; + size_t len; + off_t resid; + int error; + + uiop = ap->a_uio; + vp = ap->a_vp; + + if (vp->v_type == VCHR || vp->v_type == VBLK) + return (EOPNOTSUPP); + + if (vp->v_type != VREG) + return (EISDIR); + + if (uiop->uio_offset < 0) + return (EINVAL); + + tnp = VP_TO_TARFS_NODE(vp); + error = 0; + + TARFS_DPF(VNODE, "%s(%p=%s, %zu, %zd)\n", __func__, + tnp, tnp->name, uiop->uio_offset, uiop->uio_resid); + + while ((resid = uiop->uio_resid) > 0) { + if (tnp->size <= uiop->uio_offset) + break; + len = MIN(tnp->size - uiop->uio_offset, 
resid); + if (len == 0) + break; + + error = tarfs_read_file(tnp, len, uiop); + if (error != 0 || resid == uiop->uio_resid) + break; + } + + return (error); +} + +static int +tarfs_readlink(struct vop_readlink_args *ap) +{ + struct tarfs_node *tnp; + struct uio *uiop; + struct vnode *vp; + int error; + + uiop = ap->a_uio; + vp = ap->a_vp; + + MPASS(uiop->uio_offset == 0); + MPASS(vp->v_type == VLNK); + + tnp = VP_TO_TARFS_NODE(vp); + + TARFS_DPF(VNODE, "%s(%p=%s)\n", __func__, + tnp, tnp->name); + + error = uiomove(tnp->link.name, + MIN(tnp->size, uiop->uio_resid), uiop); + + return (error); +} + +static int +tarfs_reclaim(struct vop_reclaim_args *ap) +{ + struct tarfs_node *tnp; + struct vnode *vp; + + vp = ap->a_vp; + tnp = VP_TO_TARFS_NODE(vp); + + vfs_hash_remove(vp); + vnode_destroy_vobject(vp); + cache_purge(vp); + + TARFS_NODE_LOCK(tnp); + tnp->vnode = NULLVP; + vp->v_data = NULL; + TARFS_NODE_UNLOCK(tnp); + + return (0); +} + +static int +tarfs_print(struct vop_print_args *ap) +{ + struct tarfs_node *tnp; + struct vnode *vp; + + vp = ap->a_vp; + tnp = VP_TO_TARFS_NODE(vp); + + printf("tag tarfs, tarfs_node %p, links %lu\n", + tnp, tnp->nlink); + printf("\tmode 0%o, owner %d, group %d, size %zd\n", + tnp->mode, tnp->uid, tnp->gid, + tnp->size); + + if (vp->v_type == VFIFO) + fifo_printinfo(vp); + + printf("\n"); + + return (0); +} + +static int +tarfs_strategy(struct vop_strategy_args *ap) +{ + struct uio auio; + struct iovec iov; + struct tarfs_node *tnp; + struct buf *bp; + off_t off; + size_t len; + int error; + + tnp = VP_TO_TARFS_NODE(ap->a_vp); + bp = ap->a_bp; + MPASS(bp->b_iocmd == BIO_READ); + MPASS(bp->b_iooffset >= 0); + MPASS(bp->b_bcount > 0); + MPASS(bp->b_bufsize >= bp->b_bcount); + TARFS_DPF(VNODE, "%s(%p=%s, %zu, %ld/%ld)\n", __func__, tnp, + tnp->name, (size_t)bp->b_iooffset, bp->b_bcount, bp->b_bufsize); + iov.iov_base = bp->b_data; + iov.iov_len = bp->b_bcount; + off = bp->b_iooffset; + len = bp->b_bcount; + bp->b_resid = len; + if (off > 
tnp->size) { + /* XXX read beyond EOF - figure out correct handling */ + error = EIO; + goto out; + } + if (off + len > tnp->size) { + /* clip to file length */ + len = tnp->size - off; + } + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_offset = off; + auio.uio_resid = len; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_td = curthread; + error = tarfs_read_file(tnp, len, &auio); + bp->b_resid -= len - auio.uio_resid; +out: + if (error != 0) { + bp->b_ioflags |= BIO_ERROR; + bp->b_error = error; + } + bp->b_flags |= B_DONE; + return (0); +} + +static int +tarfs_vptofh(struct vop_vptofh_args *ap) +{ + struct tarfs_fid *tfp; + struct tarfs_node *tnp; + + tfp = (struct tarfs_fid *)ap->a_fhp; + tnp = VP_TO_TARFS_NODE(ap->a_vp); + + tfp->len = sizeof(struct tarfs_fid); + tfp->ino = tnp->ino; + tfp->gen = tnp->gen; + + return (0); +} + +struct vop_vector tarfs_vnodeops = { + .vop_default = &default_vnodeops, + + .vop_access = tarfs_access, + .vop_cachedlookup = tarfs_lookup, + .vop_close = tarfs_close, + .vop_getattr = tarfs_getattr, + .vop_lookup = vfs_cache_lookup, + .vop_open = tarfs_open, + .vop_print = tarfs_print, + .vop_read = tarfs_read, + .vop_readdir = tarfs_readdir, + .vop_readlink = tarfs_readlink, + .vop_reclaim = tarfs_reclaim, + .vop_strategy = tarfs_strategy, + .vop_vptofh = tarfs_vptofh, +}; +VFS_VOP_VECTOR_REGISTER(tarfs_vnodeops); diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index 33b1b506f85f..97f68c812a76 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -740,6 +740,12 @@ static struct witness_blessed blessed_list[] = { * parent directory vnode is locked. */ { "ufs", "bufwait" }, + + /* + * The tarfs decompression stream vnode may be locked while a + * buffer belonging to a tarfs data vnode is locked. 
+ */ + { "tarfs", "bufwait" }, }; /* diff --git a/sys/modules/Makefile b/sys/modules/Makefile index 8c39c357ec5a..61bbdb2341a1 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -369,6 +369,7 @@ SUBDIR= \ sym \ ${_syscons} \ sysvipc \ + tarfs \ tcp \ ${_ti} \ tmpfs \ diff --git a/sys/modules/tarfs/Makefile b/sys/modules/tarfs/Makefile new file mode 100644 index 000000000000..369f17b3f643 --- /dev/null +++ b/sys/modules/tarfs/Makefile @@ -0,0 +1,23 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR:H:H}/fs/tarfs + +KMOD= tarfs +SRCS= opt_tarfs.h \ + vnode_if.h \ + tarfs_io.c \ + tarfs_subr.c \ + tarfs_vnops.c \ + tarfs_vfsops.c + +.if !defined(KERNBUILDDIR) +CFLAGS+= -DZSTDIO +.ifdef TARFS_DEBUG +CFLAGS+= -DTARFS_DEBUG +.endif +.endif + +SRCS+= opt_zstdio.h +CFLAGS+= -I${SRCTOP}/sys/contrib/zstd/lib/freebsd + +.include <bsd.kmod.mk> diff --git a/tests/sys/fs/Makefile b/tests/sys/fs/Makefile index 6769f2182e79..88822c640d8a 100644 --- a/tests/sys/fs/Makefile +++ b/tests/sys/fs/Makefile @@ -14,6 +14,7 @@ TESTSRC= ${SRCTOP}/contrib/netbsd-tests/fs .if ${COMPILER_FEATURES:Mc++14} && ${MK_GOOGLETEST} != "no" TESTS_SUBDIRS+= fusefs .endif +TESTS_SUBDIRS+= tarfs TESTS_SUBDIRS+= tmpfs ${PACKAGE}FILES+= h_funcs.subr diff --git a/tests/sys/fs/tarfs/Makefile b/tests/sys/fs/tarfs/Makefile new file mode 100644 index 000000000000..b16c6544d33f --- /dev/null +++ b/tests/sys/fs/tarfs/Makefile @@ -0,0 +1,10 @@ +PACKAGE= tests + +TESTSDIR= ${TESTSBASE}/sys/fs/tarfs +BINDIR= ${TESTSDIR} + +PROGS+= mktar + +ATF_TESTS_SH+= tarfs_test + +.include <bsd.test.mk> diff --git a/tests/sys/fs/tarfs/mktar.c b/tests/sys/fs/tarfs/mktar.c new file mode 100644 index 000000000000..e1b1183af114 --- /dev/null +++ b/tests/sys/fs/tarfs/mktar.c @@ -0,0 +1,238 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Klara, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/stat.h> +#include <sys/wait.h> + +#include <err.h> +#include <fcntl.h> +#include <paths.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#define PROGNAME "mktar" + +#define SUBDIRNAME "directory" +#define SPARSEFILENAME "sparse_file" +#define HARDLINKNAME "hard_link" +#define SHORTLINKNAME "short_link" +#define LONGLINKNAME "long_link" + +static bool opt_v; + +static void verbose(const char *fmt, ...) 
+{ + va_list ap; + + if (!opt_v) + return; + fprintf(stderr, "%s: ", PROGNAME); + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + fprintf(stderr, "\n"); +} + +static void +mksparsefile(const char *filename, mode_t mode) +{ + char buf[511]; + ssize_t res; + int fd; + + if ((fd = open(filename, O_RDWR|O_CREAT|O_TRUNC, mode)) < 0) + err(1, "%s", filename); + for (unsigned int i = 33; i <= 126; i++) { + memset(buf, i, sizeof(buf)); + if (lseek(fd, 1048576LU * (i - 32), SEEK_SET) < 0) + err(1, "%s", filename); + res = write(fd, buf, sizeof(buf)); + if (res < 0) + err(1, "%s", filename); + if (res != sizeof(buf)) + errx(1, "%s: short write", filename); + } + close(fd); +} + +static char * +mklonglinktarget(const char *dirname, const char *filename) +{ + char *piece, *target; + + if (asprintf(&piece, "%1$s/../%1$s/../%1$s/../%1$s/../", dirname) < 0) + err(1, "asprintf()"); + if (asprintf(&target, "%1$s%1$s%1$s%1$s%1$s%1$s%1$s%1$s%2$s", piece, filename) < 0) + err(1, "asprintf()"); + free(piece); + return target; +} + +static void +mktar(void) +{ + char *linktarget; + + /* create a subdirectory */ + verbose("mkdir %s", SUBDIRNAME); + if (mkdir(SUBDIRNAME, 0755) != 0) + err(1, "%s", SUBDIRNAME); + + /* create a sparse file */ + verbose("creating %s", SPARSEFILENAME); + mksparsefile(SPARSEFILENAME, 0644); + chflags(SPARSEFILENAME, UF_NODUMP); + + /* create a hard link */ + verbose("link %s %s", SPARSEFILENAME, HARDLINKNAME); + if (link(SPARSEFILENAME, HARDLINKNAME) != 0) + err(1, "%s", HARDLINKNAME); + + /* create a symbolic link with a short target */ + verbose("symlink %s %s", SPARSEFILENAME, SHORTLINKNAME); + if (symlink(SPARSEFILENAME, SHORTLINKNAME) != 0) + err(1, "%s", SHORTLINKNAME); + + /* create a symbolic link with a long target */ + linktarget = mklonglinktarget(SUBDIRNAME, SPARSEFILENAME); + verbose("symlink %s %s", linktarget, LONGLINKNAME); + if (symlink(linktarget, LONGLINKNAME) != 0) + err(1, "%s", LONGLINKNAME); + free(linktarget); +} + 
+static void +usage(void) +{ + + fprintf(stderr, "usage: %s [-v] tarfile\n", PROGNAME); + exit(EXIT_FAILURE); +} + +int +main(int argc, char *argv[]) +{ + const char *tarfilename; + char *dirname; + int opt, wstatus; + pid_t pid; + + while ((opt = getopt(argc, argv, "v")) != -1) + switch (opt) { + case 'v': + opt_v = true; + break; + default: + usage(); + } + + argc -= optind; + argv += optind; + + if (argc != 1) + usage(); + tarfilename = *argv; + + if (asprintf(&dirname, "%s%s.XXXXXXXX", _PATH_TMP, PROGNAME) < 0) + err(1, "asprintf()"); + if (mkdtemp(dirname) == NULL) + err(1, "%s", dirname); + verbose("mkdir %s", dirname); + + /* fork a child to create the files */ + if ((pid = fork()) < 0) + err(1, "fork()"); + if (pid == 0) { + verbose("cd %s", dirname); + if (chdir(dirname) != 0) + err(1, "%s", dirname); + verbose("umask 022"); + umask(022); + mktar(); + verbose("cd -"); + exit(0); + } + if (waitpid(pid, &wstatus, 0) < 0) + err(1, "waitpid()"); + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) + errx(1, "child failed"); + + /* fork a child to create the tarball */ + if ((pid = fork()) < 0) + err(1, "fork()"); + if (pid == 0) { + verbose("creating tarball"); + execlp("tar", "tar", + "-c", + "-f", tarfilename, + "-C", dirname, + "--zstd", +#if 0 + "--options", "zstd:frame-per-file", +#endif + ".", + NULL); + err(1, "execlp()"); + } + if (waitpid(pid, &wstatus, 0) < 0) + err(1, "waitpid()"); + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) + errx(1, "child failed"); + + /* fork a child to delete everything */ + if ((pid = fork()) < 0) + err(1, "fork()"); + if (pid == 0) { + verbose("cd %s", dirname); + if (chdir(dirname) != 0) + err(1, "%s", dirname); + verbose("rm %s", LONGLINKNAME); + (void)unlink(LONGLINKNAME); + verbose("rm %s", SHORTLINKNAME); + (void)unlink(SHORTLINKNAME); + verbose("rm %s", HARDLINKNAME); + (void)unlink(HARDLINKNAME); + verbose("rm %s", SPARSEFILENAME); + (void)unlink(SPARSEFILENAME); + verbose("rm %s", SUBDIRNAME); + 
(void)rmdir(SUBDIRNAME); + verbose("cd -"); + exit(0); + } + if (waitpid(pid, &wstatus, 0) < 0) + err(1, "waitpid()"); + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) + errx(1, "child failed"); + verbose("rmdir %s", dirname); + (void)rmdir(dirname); + + exit(0); +} diff --git a/tests/sys/fs/tarfs/tarfs_test.sh b/tests/sys/fs/tarfs/tarfs_test.sh new file mode 100644 index 000000000000..d812ced80bbb --- /dev/null +++ b/tests/sys/fs/tarfs/tarfs_test.sh @@ -0,0 +1,54 @@ +#!/bin/sh +#- +# SPDX-License-Identifier: BSD-2-Clause +# +# Copyright (c) 2023 Klara, Inc. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+#
+
+mktar="$(dirname $(realpath "$0"))"/mktar	# helper that builds the test tarball
+mnt="$(realpath ${TMPDIR:-/tmp})/mnt"	# no $$: body and cleanup are separate processes
+
+# expected SHA256 checksum of file contained in test tarball
+sum=4da2143234486307bb44eaa610375301781a577d1172f362b88bb4b1643dee62
+
+atf_test_case tarfs_test cleanup	# "cleanup" registers tarfs_test_cleanup
+tarfs_test_head() {
+	atf_set "require.user" "root"
+}
+tarfs_test_body() {
+	mkdir "${mnt}"
+	"${mktar}" tarfs_test.tar.zst
+	atf_check mount -rt tarfs tarfs_test.tar.zst "${mnt}"
+	atf_check_equal "$(stat -f%d,%i "${mnt}"/sparse_file)" "$(stat -f%d,%i "${mnt}"/hard_link)"
+	atf_check_equal "$(stat -f%d,%i "${mnt}"/sparse_file)" "$(stat -L -f%d,%i "${mnt}"/short_link)"
+	atf_check_equal "$(stat -f%d,%i "${mnt}"/sparse_file)" "$(stat -L -f%d,%i "${mnt}"/long_link)"
+	atf_check_equal "$(sha256 -q "${mnt}"/sparse_file)" ${sum}
+}
+tarfs_test_cleanup() {
+	umount "${mnt}" 2>/dev/null || true	# nothing is mounted if the body failed early
+}
+
+atf_init_test_cases() {
+	atf_add_test_case tarfs_test
+} |