aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDag-Erling Smørgrav <des@FreeBSD.org>2023-02-02 17:18:41 +0000
committerDag-Erling Smørgrav <des@FreeBSD.org>2023-02-02 17:19:29 +0000
commit69d94f4c7608e41505996559367450706e91fbb8 (patch)
tree36c88cb17cfb46c370839c6068ef3c424c463df0
parentf29942229d24ebb8b98f8c5d02f3c8632648007e (diff)
downloadsrc-69d94f4c7608.tar.gz
src-69d94f4c7608.zip
Add tarfs, a filesystem backed by tarballs.
Sponsored by: Juniper Networks, Inc. Sponsored by: Klara, Inc. Reviewed by: pauamma, imp Differential Revision: https://reviews.freebsd.org/D37753
-rw-r--r--etc/mtree/BSD.tests.dist2
-rw-r--r--share/man/man5/Makefile1
-rw-r--r--share/man/man5/tarfs.5103
-rw-r--r--sys/conf/files4
-rw-r--r--sys/conf/options4
-rw-r--r--sys/fs/tarfs/tarfs.h254
-rw-r--r--sys/fs/tarfs/tarfs_dbg.h65
-rw-r--r--sys/fs/tarfs/tarfs_io.c727
-rw-r--r--sys/fs/tarfs/tarfs_subr.c603
-rw-r--r--sys/fs/tarfs/tarfs_vfsops.c1173
-rw-r--r--sys/fs/tarfs/tarfs_vnops.c642
-rw-r--r--sys/kern/subr_witness.c6
-rw-r--r--sys/modules/Makefile1
-rw-r--r--sys/modules/tarfs/Makefile23
-rw-r--r--tests/sys/fs/Makefile1
-rw-r--r--tests/sys/fs/tarfs/Makefile10
-rw-r--r--tests/sys/fs/tarfs/mktar.c238
-rw-r--r--tests/sys/fs/tarfs/tarfs_test.sh54
18 files changed, 3911 insertions, 0 deletions
diff --git a/etc/mtree/BSD.tests.dist b/etc/mtree/BSD.tests.dist
index 0d05ecaf06fc..b4b18997b7f9 100644
--- a/etc/mtree/BSD.tests.dist
+++ b/etc/mtree/BSD.tests.dist
@@ -757,6 +757,8 @@
fs
fusefs
..
+ tarfs
+ ..
tmpfs
..
..
diff --git a/share/man/man5/Makefile b/share/man/man5/Makefile
index 2d49d981c2f9..f6e91e4ed00b 100644
--- a/share/man/man5/Makefile
+++ b/share/man/man5/Makefile
@@ -70,6 +70,7 @@ MAN= acct.5 \
style.Makefile.5 \
style.mdoc.5 \
sysctl.conf.5 \
+ tarfs.5 \
tmpfs.5 \
unionfs.5
diff --git a/share/man/man5/tarfs.5 b/share/man/man5/tarfs.5
new file mode 100644
index 000000000000..b25131c323c1
--- /dev/null
+++ b/share/man/man5/tarfs.5
@@ -0,0 +1,103 @@
+.\"-
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2022 Klara, Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd February 2, 2023
+.Dt TARFS 5
+.Os
+.Sh NAME
+.Nm tarfs
+.Nd tarball filesystem
+.Sh SYNOPSIS
+To compile this driver into the kernel, place the following line in
+your kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "options TARFS"
+.Ed
+.Pp
+Alternatively, to load the driver as a module at boot time, place the
+following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+tarfs_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver implements a read-only filesystem backed by a
+.Xr tar 5
+file.
+Currently, only POSIX archives, optionally compressed with
+.Xr zstd 1 ,
+are supported.
+.Pp
+The preferred I/O size for
+.Nm
+filesystems can be adjusted using the
+.Va vfs.tarfs.ioshift
+sysctl setting and tunable.
+Setting it to 0 will reset it to its default value.
+Note that changes to this setting only apply to filesystems mounted
+after the change.
+.Sh DIAGNOSTICS
+If enabled by the
+.Dv TARFS_DEBUG
+kernel option, the
+.Va vfs.tarfs.debug
+sysctl setting can be used to control debugging output from the
+.Nm
+driver.
+Debugging output for individual sections of the driver can be enabled
+by adding together the relevant values from the table below.
+.Bl -column Value Description
+.It 0x01 Ta Memory allocations
+.It 0x02 Ta Checksum calculations
+.It 0x04 Ta Filesystem operations (vfsops)
+.It 0x08 Ta Path lookups
+.It 0x10 Ta File operations (vnops)
+.It 0x20 Ta General I/O
+.It 0x40 Ta Decompression
+.It 0x80 Ta Decompression index
+.It 0x100 Ta Sparse file mapping
+.El
+.Sh SEE ALSO
+.Xr tar 1 ,
+.Xr zstd 1 ,
+.Xr fstab 5 ,
+.Xr tar 5 ,
+.Xr mount 8 ,
+.Xr sysctl 8
+.Sh HISTORY
+.An -nosplit
+The
+.Nm
+driver was developed by
+.An Stephen J. Kiernan Aq Mt stevek@FreeBSD.org
+and
+.An Dag-Erling Smørgrav Aq Mt des@FreeBSD.org
+for Juniper Networks and Klara Systems.
+This manual page was written by
+.An Dag-Erling Smørgrav Aq Mt des@FreeBSD.org
+for Juniper Networks and Klara Systems.
diff --git a/sys/conf/files b/sys/conf/files
index 6cb4abcd9223..08966a9b46e4 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3615,6 +3615,10 @@ fs/smbfs/smbfs_smb.c optional smbfs
fs/smbfs/smbfs_subr.c optional smbfs
fs/smbfs/smbfs_vfsops.c optional smbfs
fs/smbfs/smbfs_vnops.c optional smbfs
+fs/tarfs/tarfs_io.c optional tarfs compile-with "${NORMAL_C} -I$S/contrib/zstd/lib/freebsd"
+fs/tarfs/tarfs_subr.c optional tarfs
+fs/tarfs/tarfs_vfsops.c optional tarfs
+fs/tarfs/tarfs_vnops.c optional tarfs
fs/udf/osta.c optional udf
fs/udf/udf_iconv.c optional udf_iconv
fs/udf/udf_vfsops.c optional udf
diff --git a/sys/conf/options b/sys/conf/options
index 1f5003507539..3b2be66ba602 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -265,6 +265,7 @@ NULLFS opt_dontuse.h
PROCFS opt_dontuse.h
PSEUDOFS opt_dontuse.h
SMBFS opt_dontuse.h
+TARFS opt_dontuse.h
TMPFS opt_dontuse.h
UDF opt_dontuse.h
UNIONFS opt_dontuse.h
@@ -273,6 +274,9 @@ ZFS opt_dontuse.h
# Pseudofs debugging
PSEUDOFS_TRACE opt_pseudofs.h
+# Tarfs debugging
+TARFS_DEBUG opt_tarfs.h
+
# In-kernel GSS-API
KGSSAPI opt_kgssapi.h
KGSSAPI_DEBUG opt_kgssapi.h
diff --git a/sys/fs/tarfs/tarfs.h b/sys/fs/tarfs/tarfs.h
new file mode 100644
index 000000000000..dffd60ee6d8a
--- /dev/null
+++ b/sys/fs/tarfs/tarfs.h
@@ -0,0 +1,254 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022-2023 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _FS_TARFS_TARFS_H_
+#define _FS_TARFS_TARFS_H_
+
+#ifndef _KERNEL
+#error Should only be included by kernel
+#endif
+
+MALLOC_DECLARE(M_TARFSMNT);
+MALLOC_DECLARE(M_TARFSNODE);
+MALLOC_DECLARE(M_TARFSNAME);
+
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_vfs_tarfs);
+#endif
+
+struct componentname;
+struct mount;
+struct vnode;
+
+/*
+ * Internal representation of a tarfs file system node.
+ *
+ * The anonymous union at the end carries the type-specific part of the
+ * node; each arm is tagged below with the vnode type(s) it belongs to.
+ */
+struct tarfs_node {
+ TAILQ_ENTRY(tarfs_node) entries; /* presumably linkage on tarfs_mount.allnodes -- confirm */
+ TAILQ_ENTRY(tarfs_node) dirents; /* presumably linkage on the parent's dir.dirhead -- confirm */
+
+ struct mtx lock; /* taken via TARFS_NODE_LOCK() / TARFS_NODE_UNLOCK() */
+
+ struct vnode *vnode;
+ struct tarfs_mount *tmp;
+ enum vtype type;
+ ino_t ino;
+ off_t offset;
+ size_t size;
+ size_t physize;
+ char *name;
+ size_t namelen;
+
+ /* Node attributes */
+ uid_t uid;
+ gid_t gid;
+ mode_t mode;
+ unsigned int flags;
+ nlink_t nlink;
+ struct timespec atime;
+ struct timespec mtime;
+ struct timespec ctime;
+ struct timespec birthtime;
+ unsigned long gen;
+
+ /* Block map: blk points at struct tarfs_blk (sparse file map) entries */
+ size_t nblk; /* presumably the number of entries in blk -- confirm */
+ struct tarfs_blk *blk;
+
+ struct tarfs_node *parent;
+ union {
+ /* VDIR */
+ struct {
+ TAILQ_HEAD(, tarfs_node) dirhead;
+ off_t lastcookie;
+ struct tarfs_node *lastnode;
+ } dir;
+
+ /* VLNK */
+ struct {
+ char *name;
+ size_t namelen;
+ } link;
+
+ /* VBLK or VCHR */
+ dev_t rdev;
+
+ /* VREG */
+ struct tarfs_node *other;
+ };
+};
+
+/*
+ * Entry in sparse file block map.
+ *
+ * Each entry ties a run of l bytes at input offset i to output offset o.
+ * NOTE(review): what the offsets are relative to (archive vs. file data)
+ * is not visible in this header -- confirm against tarfs_load_blockmap().
+ */
+struct tarfs_blk {
+ off_t i; /* input (physical) offset */
+ off_t o; /* output (logical) offset */
+ size_t l; /* length */
+};
+
+/*
+ * Decompression buffer.
+ *
+ * A fixed-size byte array (TARFS_ZBUF_SIZE bytes, i.e. 1 MiB) together
+ * with the offset and length of the data it currently holds.
+ */
+#define TARFS_ZBUF_SIZE 1048576
+struct tarfs_zbuf {
+ u_char buf[TARFS_ZBUF_SIZE];
+ size_t off; /* offset of contents */
+ size_t len; /* length of contents */
+};
+
+/*
+ * Internal representation of a tarfs mount point.
+ *
+ * The last two members belong to the decompression layer: they are set
+ * by tarfs_zio_init() and cleared again by tarfs_zio_fini().
+ */
+struct tarfs_mount {
+ TAILQ_HEAD(, tarfs_node) allnodes;
+ struct mtx allnode_lock; /* taken via TARFS_ALLNODES_LOCK() / _UNLOCK() */
+
+ struct tarfs_node *root;
+ struct vnode *vp; /* vnode the I/O layer reads the tarball from */
+ struct mount *vfs; /* mount handed to getnewvnode() for the zio vnode */
+ ino_t ino;
+ struct unrhdr *ino_unr;
+ size_t iosize; /* also the size of the probe read in tarfs_io_init() */
+ size_t nblocks;
+ size_t nfiles;
+ time_t mtime; /* default mtime for directories */
+
+ struct tarfs_zio *zio; /* decompression state; NULL when there is none */
+ struct vnode *znode; /* decompressed-stream vnode; NULL means read vp directly */
+};
+
+struct tarfs_zio { /* state of the decompression layer, one per compressed mount */
+ struct tarfs_mount *tmp; /* back pointer to the owning mount */
+
+ /* decompression state */
+#ifdef ZSTDIO
+ struct tarfs_zstd *zstd; /* decompression state (zstd) */
+#endif
+ off_t ipos; /* current input position */
+ off_t opos; /* current output position */
+
+ /*
+ * index of compression frames: pairs of input / output offsets at
+ * which decompression can be restarted.  Entry 0 is seeded by
+ * tarfs_zio_init(); tarfs_zio_update_index() appends further
+ * entries and doubles the capacity when it runs out.
+ */
+ unsigned int curidx; /* current index position */
+ unsigned int nidx; /* number of index entries */
+ unsigned int szidx; /* index capacity */
+ struct tarfs_idx { off_t i, o; } *idx; /* i = input offset, o = output offset */
+};
+
+struct tarfs_fid { /* NOTE(review): presumably overlays struct fid for VFS file handles -- confirm */
+ u_short len; /* length of data in bytes */
+ u_short data0; /* force alignment */
+ ino_t ino; /* presumably the node's inode number -- confirm */
+ unsigned long gen; /* presumably matches tarfs_node.gen -- confirm */
+};
+
+/*
+ * Locking helpers.  TARFS_NODE_*() take a struct tarfs_node pointer and
+ * operate on its per-node mutex; TARFS_ALLNODES_*() take a struct
+ * tarfs_mount pointer and operate on the mutex guarding its node list.
+ * Every macro expands its own argument, so any pointer expression may be
+ * passed regardless of what the caller's local variables are called.
+ */
+#define TARFS_NODE_LOCK(tnp) \
+ mtx_lock(&(tnp)->lock)
+#define TARFS_NODE_UNLOCK(tnp) \
+ mtx_unlock(&(tnp)->lock)
+#define TARFS_ALLNODES_LOCK(tmp) \
+ mtx_lock(&(tmp)->allnode_lock)
+#define TARFS_ALLNODES_UNLOCK(tmp) \
+ mtx_unlock(&(tmp)->allnode_lock)
+
+/*
+ * Data and metadata within tar files are aligned on 512-byte boundaries,
+ * to match the block size of the magnetic tapes they were originally
+ * intended for.
+ */
+#define TARFS_BSHIFT 9
+#define TARFS_BLOCKSIZE (size_t)(1U << TARFS_BSHIFT)
+#define TARFS_BLKOFF(l) ((l) % TARFS_BLOCKSIZE)
+#define TARFS_BLKNUM(l) ((l) >> TARFS_BSHIFT)
+#define TARFS_SZ2BLKS(sz) (((sz) + TARFS_BLOCKSIZE - 1) / TARFS_BLOCKSIZE)
+
+/*
+ * Our preferred I/O size.
+ */
+extern unsigned int tarfs_ioshift;
+#define TARFS_IOSHIFT_MIN TARFS_BSHIFT
+#define TARFS_IOSHIFT_DEFAULT PAGE_SHIFT
+#define TARFS_IOSHIFT_MAX PAGE_SHIFT
+
+#define TARFS_ROOTINO ((ino_t)3)
+#define TARFS_ZIOINO ((ino_t)4)
+#define TARFS_MININO ((ino_t)65535)
+
+#define TARFS_COOKIE_DOT 0
+#define TARFS_COOKIE_DOTDOT 1
+#define TARFS_COOKIE_EOF OFF_MAX
+
+#define TARFS_ZIO_NAME ".tar"
+#define TARFS_ZIO_NAMELEN (sizeof(TARFS_ZIO_NAME) - 1)
+
+extern struct vop_vector tarfs_vnodeops;
+
+static inline
+struct tarfs_mount *
+MP_TO_TARFS_MOUNT(struct mount *mp)
+{
+
+ MPASS(mp != NULL && mp->mnt_data != NULL); /* only valid on a mount whose private data is set */
+ return (mp->mnt_data); /* mnt_data is handed back as the struct tarfs_mount */
+}
+
+static inline
+struct tarfs_node *
+VP_TO_TARFS_NODE(struct vnode *vp)
+{
+
+ MPASS(vp != NULL && vp->v_data != NULL); /* only valid on a vnode whose private data is set */
+ return (vp->v_data); /* v_data is handed back as the struct tarfs_node */
+}
+
+int tarfs_alloc_node(struct tarfs_mount *tmp, const char *name,
+ size_t namelen, enum vtype type, off_t off, size_t sz,
+ time_t mtime, uid_t uid, gid_t gid, mode_t mode,
+ unsigned int flags, const char *linkname, dev_t rdev,
+ struct tarfs_node *parent, struct tarfs_node **node);
+int tarfs_load_blockmap(struct tarfs_node *tnp, size_t realsize);
+void tarfs_dump_tree(struct tarfs_node *tnp);
+void tarfs_free_node(struct tarfs_node *tnp);
+struct tarfs_node *
+ tarfs_lookup_dir(struct tarfs_node *tnp, off_t cookie);
+struct tarfs_node *
+ tarfs_lookup_node(struct tarfs_node *tnp, struct tarfs_node *f,
+ struct componentname *cnp);
+void tarfs_print_node(struct tarfs_node *tnp);
+int tarfs_read_file(struct tarfs_node *tnp, size_t len, struct uio *uiop);
+
+int tarfs_io_init(struct tarfs_mount *tmp);
+int tarfs_io_fini(struct tarfs_mount *tmp);
+int tarfs_io_read(struct tarfs_mount *tmp, bool raw,
+ struct uio *uiop);
+ssize_t tarfs_io_read_buf(struct tarfs_mount *tmp, bool raw,
+ void *buf, off_t off, size_t len);
+unsigned int
+ tarfs_strtofflags(const char *str, char **end);
+
+#endif /* _FS_TARFS_TARFS_H_ */
diff --git a/sys/fs/tarfs/tarfs_dbg.h b/sys/fs/tarfs/tarfs_dbg.h
new file mode 100644
index 000000000000..45d11d679719
--- /dev/null
+++ b/sys/fs/tarfs/tarfs_dbg.h
@@ -0,0 +1,65 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _FS_TARFS_TARFS_DBG_H_
+#define _FS_TARFS_TARFS_DBG_H_
+
+#ifndef _KERNEL
+#error Should only be included by kernel
+#endif
+
+#ifdef TARFS_DEBUG /* debug output is compiled in only with this kernel option */
+extern int tarfs_debug; /* bitmask of TARFS_DEBUG_* categories, tested by TARFS_DPF() */
+
+#define TARFS_DEBUG_ALLOC 0x01 /* memory allocations */
+#define TARFS_DEBUG_CHECKSUM 0x02 /* checksum calculations */
+#define TARFS_DEBUG_FS 0x04 /* filesystem operations (vfsops) */
+#define TARFS_DEBUG_LOOKUP 0x08 /* path lookups */
+#define TARFS_DEBUG_VNODE 0x10 /* file operations (vnops) */
+#define TARFS_DEBUG_IO 0x20 /* general I/O */
+#define TARFS_DEBUG_ZIO 0x40 /* decompression */
+#define TARFS_DEBUG_ZIDX 0x80 /* decompression index */
+#define TARFS_DEBUG_MAP 0x100 /* sparse file mapping */
+
+#define TARFS_DPF(category, fmt, ...) \
+ do { \
+ if ((tarfs_debug & TARFS_DEBUG_##category) != 0) \
+ printf(fmt, ## __VA_ARGS__); \
+ } while (0)
+#define TARFS_DPF_IFF(category, cond, fmt, ...) \
+ do { \
+ if ((cond) \
+ && (tarfs_debug & TARFS_DEBUG_##category) != 0) \
+ printf(fmt, ## __VA_ARGS__); \
+ } while (0)
+#else /* !TARFS_DEBUG: both macros expand to nothing, arguments are not evaluated */
+#define TARFS_DPF(category, fmt, ...)
+#define TARFS_DPF_IFF(category, cond, fmt, ...)
+#endif /* TARFS_DEBUG */
+
+#endif /* _FS_TARFS_TARFS_DBG_H_ */
diff --git a/sys/fs/tarfs/tarfs_io.c b/sys/fs/tarfs/tarfs_io.c
new file mode 100644
index 000000000000..b957ac11ff51
--- /dev/null
+++ b/sys/fs/tarfs/tarfs_io.c
@@ -0,0 +1,727 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022-2023 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_tarfs.h"
+#include "opt_zstdio.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+
+#ifdef ZSTDIO
+#define ZSTD_STATIC_LINKING_ONLY
+#include <contrib/zstd/lib/zstd.h>
+#endif
+
+#include <fs/tarfs/tarfs.h>
+#include <fs/tarfs/tarfs_dbg.h>
+
+#ifdef TARFS_DEBUG
+SYSCTL_NODE(_vfs_tarfs, OID_AUTO, zio, CTLFLAG_RD, 0,
+ "Tar filesystem decompression layer");
+COUNTER_U64_DEFINE_EARLY(tarfs_zio_inflated);
+SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, inflated, CTLFLAG_RD,
+ &tarfs_zio_inflated, "Amount of compressed data inflated.");
+COUNTER_U64_DEFINE_EARLY(tarfs_zio_consumed);
+SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, consumed, CTLFLAG_RD,
+ &tarfs_zio_consumed, "Amount of compressed data consumed.");
+COUNTER_U64_DEFINE_EARLY(tarfs_zio_bounced);
+SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, bounced, CTLFLAG_RD,
+ &tarfs_zio_bounced, "Amount of decompressed data bounced.");
+
+static int
+tarfs_sysctl_handle_zio_reset(SYSCTL_HANDLER_ARGS)
+{
+ unsigned int tmp;
+ int error;
+
+ tmp = 0; /* reading this OID always reports 0 */
+ if ((error = SYSCTL_OUT(req, &tmp, sizeof(tmp))) != 0)
+ return (error);
+ if (req->newptr != NULL) { /* a new value was supplied: any write means "reset" */
+ if ((error = SYSCTL_IN(req, &tmp, sizeof(tmp))) != 0) /* copied in, otherwise ignored */
+ return (error);
+ counter_u64_zero(tarfs_zio_inflated);
+ counter_u64_zero(tarfs_zio_consumed);
+ counter_u64_zero(tarfs_zio_bounced);
+ }
+ return (0);
+}
+
+SYSCTL_PROC(_vfs_tarfs_zio, OID_AUTO, reset,
+ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW,
+ NULL, 0, tarfs_sysctl_handle_zio_reset, "IU",
+ "Reset compression counters.");
+#endif
+
+MALLOC_DEFINE(M_TARFSZSTATE, "tarfs zstate", "tarfs decompression state");
+MALLOC_DEFINE(M_TARFSZBUF, "tarfs zbuf", "tarfs decompression buffers");
+
+#define XZ_MAGIC (uint8_t[]){ 0xfd, 0x37, 0x7a, 0x58, 0x5a }
+#define ZLIB_MAGIC (uint8_t[]){ 0x1f, 0x8b, 0x08 }
+#define ZSTD_MAGIC (uint8_t[]){ 0x28, 0xb5, 0x2f, 0xfd }
+
+#ifdef ZSTDIO
+struct tarfs_zstd {
+ ZSTD_DStream *zds;
+};
+#endif
+
+/* XXX review use of curthread / uio_td / td_cred */
+
+/*
+ * Reads from the tar file according to the provided uio. If the archive
+ * is compressed and raw is false, reads the decompressed stream;
+ * otherwise, reads directly from the original file. Returns 0 on success
+ * and a positive errno value on failure.
+ *
+ * The direct path (raw requested, or no zio vnode exists) holds a read
+ * range lock over [offset, offset + resid) plus a shared vnode lock on
+ * the tarball for the duration of the read.  The decompressed path takes
+ * an exclusive lock on the zio vnode instead.  Both paths read with the
+ * credentials of the thread recorded in the uio, so uiop->uio_td must be
+ * set by the caller.
+ */
+int
+tarfs_io_read(struct tarfs_mount *tmp, bool raw, struct uio *uiop)
+{
+ void *rl = NULL;
+ off_t off = uiop->uio_offset; /* captured up front for the range lock and the trace below */
+ size_t len = uiop->uio_resid;
+ int error;
+
+ if (raw || tmp->znode == NULL) {
+ rl = vn_rangelock_rlock(tmp->vp, off, off + len);
+ error = vn_lock(tmp->vp, LK_SHARED);
+ if (error == 0) {
+ error = VOP_READ(tmp->vp, uiop,
+ IO_DIRECT|IO_NODELOCKED,
+ uiop->uio_td->td_ucred);
+ VOP_UNLOCK(tmp->vp);
+ }
+ vn_rangelock_unlock(tmp->vp, rl); /* dropped whether or not the vnode lock succeeded */
+ } else {
+ error = vn_lock(tmp->znode, LK_EXCLUSIVE);
+ if (error == 0) {
+ error = VOP_READ(tmp->znode, uiop,
+ IO_DIRECT | IO_NODELOCKED,
+ uiop->uio_td->td_ucred);
+ VOP_UNLOCK(tmp->znode);
+ }
+ }
+ TARFS_DPF(IO, "%s(%zu, %zu) = %d (resid %zd)\n", __func__,
+ (size_t)off, len, error, uiop->uio_resid);
+ return (error);
+}
+
+/*
+ * Reads from the tar file into the provided buffer. If the archive is
+ * compressed and raw is false, reads the decompressed stream; otherwise,
+ * reads directly from the original file. Returns the number of bytes
+ * read on success, 0 on EOF, and a negative errno value on failure.
+ *
+ * This is a convenience wrapper around tarfs_io_read(): it wraps buf in
+ * a single-iovec kernel-space uio owned by curthread.  A zero-length
+ * request returns 0 immediately without touching the file.
+ */
+ssize_t
+tarfs_io_read_buf(struct tarfs_mount *tmp, bool raw,
+ void *buf, off_t off, size_t len)
+{
+ struct uio auio;
+ struct iovec aiov;
+ ssize_t res;
+ int error;
+
+ if (len == 0) {
+ TARFS_DPF(IO, "%s(%zu, %zu) null\n", __func__,
+ (size_t)off, len);
+ return (0);
+ }
+ aiov.iov_base = buf;
+ aiov.iov_len = len;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = off;
+ auio.uio_segflg = UIO_SYSSPACE; /* buf is a kernel address */
+ auio.uio_rw = UIO_READ;
+ auio.uio_resid = len;
+ auio.uio_td = curthread; /* tarfs_io_read() takes its credentials from here */
+ error = tarfs_io_read(tmp, raw, &auio);
+ if (error != 0) {
+ TARFS_DPF(IO, "%s(%zu, %zu) error %d\n", __func__,
+ (size_t)off, len, error);
+ return (-error); /* errno is positive; this function reports it negated */
+ }
+ res = len - auio.uio_resid; /* bytes actually transferred */
+ if (res == 0 && len != 0) {
+ TARFS_DPF(IO, "%s(%zu, %zu) eof\n", __func__,
+ (size_t)off, len);
+ } else {
+ TARFS_DPF(IO, "%s(%zu, %zu) read %zd | %*D\n", __func__,
+ (size_t)off, len, res,
+ (int)(res > 8 ? 8 : res), (uint8_t *)buf, " "); /* trace shows at most the first 8 bytes */
+ }
+ return (res);
+}
+
+#ifdef ZSTDIO
+static void *
+tarfs_zstate_alloc(void *opaque, size_t size)
+{
+
+ (void)opaque; /* allocator cookie from tarfs_zstd_mem; not used */
+ return (malloc(size, M_TARFSZSTATE, M_WAITOK)); /* zstd state is accounted to M_TARFSZSTATE */
+}
+#endif
+
+#ifdef ZSTDIO
+static void
+tarfs_zstate_free(void *opaque, void *address)
+{
+
+ (void)opaque; /* allocator cookie from tarfs_zstd_mem; not used */
+ free(address, M_TARFSZSTATE); /* counterpart of tarfs_zstate_alloc() */
+}
+#endif
+
+#ifdef ZSTDIO
+static ZSTD_customMem tarfs_zstd_mem = { /* passed to ZSTD_createDStream_advanced() in tarfs_io_init() */
+ tarfs_zstate_alloc,
+ tarfs_zstate_free,
+ NULL, /* value handed to the callbacks as "opaque"; both ignore it */
+};
+#endif
+
+/*
+ * Updates the decompression frame index, recording the current input and
+ * output offsets in a new index entry, and growing the index if
+ * necessary.
+ *
+ * The current index position is advanced first.  If it now lies past the
+ * last known entry, a new entry (i, o) is appended, doubling the index
+ * capacity when it is exhausted.  If the entry already exists, nothing is
+ * written; in either case the assertions at the end check that the entry
+ * at the current position equals (i, o).
+ */
+static void
+tarfs_zio_update_index(struct tarfs_zio *zio, off_t i, off_t o)
+{
+
+ if (++zio->curidx >= zio->nidx) {
+ if (++zio->nidx > zio->szidx) {
+ zio->szidx *= 2;
+ zio->idx = realloc(zio->idx,
+ zio->szidx * sizeof(*zio->idx),
+ M_TARFSZSTATE, M_ZERO | M_WAITOK);
+ TARFS_DPF(ALLOC, "%s: resized zio index\n", __func__);
+ }
+ zio->idx[zio->curidx].i = i;
+ zio->idx[zio->curidx].o = o;
+ TARFS_DPF(ZIDX, "%s: index %u = i %zu o %zu\n", __func__,
+ zio->curidx, (size_t)zio->idx[zio->curidx].i,
+ (size_t)zio->idx[zio->curidx].o);
+ }
+ MPASS(zio->idx[zio->curidx].i == i); /* a revisited frame boundary must match what was recorded */
+ MPASS(zio->idx[zio->curidx].o == o);
+}
+
+/*
+ * VOP_ACCESS for zio node.
+ *
+ * Only a request for exactly VREAD is considered; it is answered by the
+ * backing tarball vnode, queried under a shared lock.  Any other access
+ * mode is refused with EPERM.
+ */
+static int
+tarfs_zaccess(struct vop_access_args *ap)
+{
+ struct tarfs_zio *zio = ap->a_vp->v_data;
+ struct tarfs_mount *mnt = zio->tmp;
+ accmode_t accmode = ap->a_accmode;
+ int rc;
+
+ if (accmode != VREAD) {
+ /* nothing but plain reads is ever permitted */
+ rc = EPERM;
+ } else if ((rc = vn_lock(mnt->vp, LK_SHARED)) == 0) {
+ /* defer the decision to the tarball itself */
+ rc = VOP_ACCESS(mnt->vp, accmode, ap->a_cred, ap->a_td);
+ VOP_UNLOCK(mnt->vp);
+ }
+ TARFS_DPF(ZIO, "%s(%d) = %d\n", __func__, accmode, rc);
+ return (rc);
+}
+
+/*
+ * VOP_GETATTR for zio node.
+ *
+ * Mode, ownership, access / change / modification times and va_bytes are
+ * copied from the backing tarball vnode, which is queried under a shared
+ * lock.  The remaining fields are synthesized: the node is reported as a
+ * regular file with one link and inode TARFS_ZIOINO, its birth time is
+ * that of the root node, and its size is the output offset stored in the
+ * last frame index entry (so it reflects only what has been indexed).
+ * Returns 0 on success and a positive errno value on failure.
+ */
+static int
+tarfs_zgetattr(struct vop_getattr_args *ap)
+{
+ struct vattr va;
+ struct vnode *vp = ap->a_vp;
+ struct tarfs_zio *zio = vp->v_data;
+ struct tarfs_mount *tmp = zio->tmp;
+ struct vattr *vap = ap->a_vap;
+ int error = 0;
+
+ VATTR_NULL(vap); /* fields not set below stay at their "no value" default */
+ error = vn_lock(tmp->vp, LK_SHARED);
+ if (error == 0) {
+ error = VOP_GETATTR(tmp->vp, &va, ap->a_cred);
+ VOP_UNLOCK(tmp->vp);
+ if (error == 0) {
+ vap->va_type = VREG;
+ vap->va_mode = va.va_mode;
+ vap->va_nlink = 1;
+ vap->va_gid = va.va_gid;
+ vap->va_uid = va.va_uid;
+ vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+ vap->va_fileid = TARFS_ZIOINO;
+ vap->va_size = zio->idx[zio->nidx - 1].o; /* output offset of the last indexed frame boundary */
+ vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
+ vap->va_atime = va.va_atime;
+ vap->va_ctime = va.va_ctime;
+ vap->va_mtime = va.va_mtime;
+ vap->va_birthtime = tmp->root->birthtime;
+ vap->va_bytes = va.va_bytes;
+ }
+ }
+ TARFS_DPF(ZIO, "%s() = %d\n", __func__, error);
+ return (error);
+}
+
+#ifdef ZSTDIO
+/*
+ * VOP_READ for zio node, zstd edition.
+ *
+ * Decompresses from the tarball into the caller's uio, which must carry
+ * exactly one iovec (asserted below).  The frame index is used to pick
+ * the closest known restart point at or before the requested offset; the
+ * stream is then decompressed forward, discarding output until the
+ * requested offset is reached.  Kernel-space targets are written in
+ * place; any other target goes through a temporary bounce buffer.
+ *
+ * Returns 0 on success, including short reads at end of file or end of
+ * stream, and a positive errno value on failure.  After a failure the
+ * decompressor is rewound to the first index entry.
+ */
+static int
+tarfs_zread_zstd(struct tarfs_zio *zio, struct uio *uiop)
+{
+ void *ibuf = NULL, *obuf = NULL, *rl = NULL;
+ struct uio auio;
+ struct iovec aiov;
+ struct tarfs_mount *tmp = zio->tmp;
+ struct tarfs_zstd *zstd = zio->zstd;
+ struct thread *td = curthread;
+ ZSTD_inBuffer zib;
+ ZSTD_outBuffer zob;
+ off_t zsize;
+ off_t ipos, opos;
+ size_t ilen, olen;
+ size_t zerror;
+ off_t off = uiop->uio_offset;
+ size_t len = uiop->uio_resid;
+ size_t resid = uiop->uio_resid;
+ size_t bsize;
+ int error;
+ bool reset = false;
+
+ /* do we have to rewind? if so, step back to the last entry at or before off */
+ if (off < zio->opos) {
+ while (zio->curidx > 0 && off < zio->idx[zio->curidx].o)
+ zio->curidx--;
+ reset = true; /* the stream cannot run backwards, so always restart */
+ }
+ /* advance to the nearest index entry */
+ if (off > zio->opos) {
+ // XXX maybe do a binary search instead
+ while (zio->curidx < zio->nidx - 1 &&
+ off >= zio->idx[zio->curidx + 1].o) {
+ zio->curidx++;
+ reset = true; /* only restart if a later entry actually helps */
+ }
+ }
+ /* reset the decompression stream if needed */
+ if (reset) {
+ zio->ipos = zio->idx[zio->curidx].i;
+ zio->opos = zio->idx[zio->curidx].o;
+ ZSTD_resetDStream(zstd->zds);
+ TARFS_DPF(ZIDX, "%s: skipping to index %u = i %zu o %zu\n", __func__,
+ zio->curidx, (size_t)zio->ipos, (size_t)zio->opos);
+ } else {
+ TARFS_DPF(ZIDX, "%s: continuing at i %zu o %zu\n", __func__,
+ (size_t)zio->ipos, (size_t)zio->opos);
+ }
+
+ /*
+ * Set up a temporary buffer for compressed data. Use the size
+ * recommended by the zstd library; this is usually 128 kB, but
+ * just in case, make sure it's a multiple of the page size and no
+ * larger than MAXBSIZE.
+ */
+ bsize = roundup(ZSTD_CStreamOutSize(), PAGE_SIZE);
+ if (bsize > MAXBSIZE)
+ bsize = MAXBSIZE;
+ ibuf = malloc(bsize, M_TEMP, M_WAITOK);
+ zib.src = NULL;
+ zib.size = 0; /* size == pos forces a read from the tarball on the first pass */
+ zib.pos = 0;
+
+ /*
+ * Set up the decompression buffer. If the target is not in
+ * kernel space, we will have to set up a bounce buffer.
+ *
+ * TODO: to avoid using a bounce buffer, map destination pages
+ * using vm_fault_quick_hold_pages().
+ */
+ MPASS(zio->opos <= off);
+ MPASS(uiop->uio_iovcnt == 1);
+ MPASS(uiop->uio_iov->iov_len >= len);
+ if (uiop->uio_segflg == UIO_SYSSPACE) {
+ zob.dst = uiop->uio_iov->iov_base;
+ } else {
+ TARFS_DPF(ALLOC, "%s: allocating %zu-byte bounce buffer\n",
+ __func__, len);
+ zob.dst = obuf = malloc(len, M_TEMP, M_WAITOK);
+ }
+ zob.size = len;
+ zob.pos = 0;
+
+ /* lock tarball: range lock from the current input position to OFF_MAX, then the vnode */
+ rl = vn_rangelock_rlock(tmp->vp, zio->ipos, OFF_MAX);
+ error = vn_lock(tmp->vp, LK_SHARED);
+ if (error != 0) {
+ goto fail_unlocked;
+ }
+ /* check size */
+ error = vn_getsize_locked(tmp->vp, &zsize, td->td_ucred);
+ if (error != 0) {
+ goto fail;
+ }
+ if (zio->ipos >= zsize) {
+ /* beyond EOF: error is 0 here, so this ends as a zero-length read */
+ goto fail;
+ }
+
+ while (resid > 0) {
+ if (zib.pos == zib.size) {
+ /* request data from the underlying file */
+ aiov.iov_base = ibuf;
+ aiov.iov_len = bsize;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = zio->ipos;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_resid = aiov.iov_len;
+ auio.uio_td = td;
+ error = VOP_READ(tmp->vp, &auio,
+ IO_DIRECT | IO_NODELOCKED,
+ td->td_ucred);
+ if (error != 0)
+ goto fail;
+ TARFS_DPF(ZIO, "%s: req %zu+%zu got %zu+%zu\n", __func__,
+ (size_t)zio->ipos, bsize,
+ (size_t)zio->ipos, bsize - auio.uio_resid);
+ zib.src = ibuf;
+ zib.size = bsize - auio.uio_resid;
+ zib.pos = 0;
+ }
+ MPASS(zib.pos <= zib.size);
+ if (zib.pos == zib.size) { /* the refill above returned no data */
+ TARFS_DPF(ZIO, "%s: end of file after i %zu o %zu\n", __func__,
+ (size_t)zio->ipos, (size_t)zio->opos);
+ goto fail;
+ }
+ if (zio->opos < off) {
+ /* to be discarded: written at the start of the buffer and overwritten later */
+ zob.size = min(off - zio->opos, len);
+ zob.pos = 0;
+ } else {
+ zob.size = len;
+ zob.pos = zio->opos - off;
+ }
+ ipos = zib.pos;
+ opos = zob.pos;
+ /* decompress as much as possible */
+ zerror = ZSTD_decompressStream(zstd->zds, &zob, &zib);
+ zio->ipos += ilen = zib.pos - ipos; /* bytes of input consumed by this call */
+ zio->opos += olen = zob.pos - opos; /* bytes of output produced by this call */
+ if (zio->opos > off)
+ resid -= olen; /* output past the requested offset counts towards the request */
+ if (ZSTD_isError(zerror)) {
+ TARFS_DPF(ZIO, "%s: inflate failed after i %zu o %zu: %s\n", __func__,
+ (size_t)zio->ipos, (size_t)zio->opos, ZSTD_getErrorName(zerror));
+ error = EIO;
+ goto fail;
+ }
+ if (zerror == 0 && olen == 0) {
+ TARFS_DPF(ZIO, "%s: end of stream after i %zu o %zu\n", __func__,
+ (size_t)zio->ipos, (size_t)zio->opos);
+ break;
+ }
+ if (zerror == 0) {
+ TARFS_DPF(ZIO, "%s: end of frame after i %zu o %zu\n", __func__,
+ (size_t)zio->ipos, (size_t)zio->opos);
+ tarfs_zio_update_index(zio, zio->ipos, zio->opos); /* remember this boundary as a restart point */
+ }
+ TARFS_DPF(ZIO, "%s: inflated %zu\n", __func__, olen);
+#ifdef TARFS_DEBUG
+ counter_u64_add(tarfs_zio_inflated, olen);
+#endif
+ }
+fail: /* also reached on success, via loop exit */
+ VOP_UNLOCK(tmp->vp);
+fail_unlocked:
+ if (error == 0) {
+ if (uiop->uio_segflg == UIO_SYSSPACE) {
+ uiop->uio_resid = resid; /* data already sits in the caller's buffer */
+ } else if (len > resid) {
+ TARFS_DPF(ALLOC, "%s: bounced %zu bytes\n", __func__,
+ len - resid);
+ error = uiomove(obuf, len - resid, uiop);
+#ifdef TARFS_DEBUG
+ counter_u64_add(tarfs_zio_bounced, len - resid);
+#endif
+ }
+ }
+ if (obuf != NULL) {
+ TARFS_DPF(ALLOC, "%s: freeing bounce buffer\n", __func__);
+ free(obuf, M_TEMP);
+ }
+ if (rl != NULL)
+ vn_rangelock_unlock(tmp->vp, rl);
+ if (ibuf != NULL)
+ free(ibuf, M_TEMP);
+ TARFS_DPF(ZIO, "%s(%zu, %zu) = %d (resid %zd)\n", __func__,
+ (size_t)off, len, error, uiop->uio_resid);
+#ifdef TARFS_DEBUG
+ counter_u64_add(tarfs_zio_consumed, len - uiop->uio_resid);
+#endif
+ if (error != 0) { /* positions are no longer trustworthy: start over from the first entry */
+ zio->curidx = 0;
+ zio->ipos = zio->idx[0].i;
+ zio->opos = zio->idx[0].o;
+ ZSTD_resetDStream(zstd->zds);
+ }
+ return (error);
+}
+#endif
+
+/*
+ * VOP_READ for zio node.
+ *
+ * Dispatches to the zstd reader when the kernel was built with ZSTDIO
+ * and the zio carries zstd state; in every other case the read fails
+ * with EFTYPE.  Returns 0 on success and a positive errno value on
+ * failure.
+ */
+static int
+tarfs_zread(struct vop_read_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct tarfs_zio *zio = vp->v_data;
+ struct uio *uiop = ap->a_uio;
+#ifdef TARFS_DEBUG
+ off_t off = uiop->uio_offset; /* only referenced by the debug traces below */
+ size_t len = uiop->uio_resid;
+#endif
+ int error;
+
+ TARFS_DPF(ZIO, "%s(%zu, %zu)\n", __func__,
+ (size_t)off, len);
+#ifdef ZSTDIO
+ if (zio->zstd != NULL) {
+ error = tarfs_zread_zstd(zio, uiop);
+ } else
+#endif
+ error = EFTYPE; /* the else branch above when ZSTDIO is set, unconditional otherwise */
+ TARFS_DPF(ZIO, "%s(%zu, %zu) = %d (resid %zd)\n", __func__,
+ (size_t)off, len, error, uiop->uio_resid);
+ return (error);
+}
+
+/*
+ * VOP_RECLAIM for zio node.
+ *
+ * Detaches the zio from the vnode and drops the vnode's VM object and
+ * name cache entries.  The zio itself is not freed here; that is done by
+ * tarfs_zio_fini().  Always returns 0.
+ */
+static int
+tarfs_zreclaim(struct vop_reclaim_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+
+ TARFS_DPF(ZIO, "%s(%p)\n", __func__, vp);
+ vp->v_data = NULL;
+ vnode_destroy_vobject(vp);
+ cache_purge(vp);
+ return (0);
+}
+
+/*
+ * VOP_STRATEGY for zio node.
+ *
+ * Services the buffer by wrapping its data area in a single-iovec
+ * kernel-space uio (b_bcount bytes at offset b_iooffset) and issuing a
+ * VOP_READ on the zio vnode with the buffer's read credentials.  The
+ * buffer is then flagged B_DONE; a read failure is recorded in the
+ * buffer (BIO_ERROR, b_error) rather than returned, so this function
+ * always returns 0.
+ *
+ * NOTE(review): b_resid is set to the full length before the read and is
+ * not refreshed from auio.uio_resid afterwards -- confirm this is
+ * intended.
+ */
+static int
+tarfs_zstrategy(struct vop_strategy_args *ap)
+{
+ struct uio auio;
+ struct iovec iov;
+ struct vnode *vp = ap->a_vp;
+ struct buf *bp = ap->a_bp;
+ off_t off;
+ size_t len;
+ int error;
+
+ iov.iov_base = bp->b_data;
+ iov.iov_len = bp->b_bcount;
+ off = bp->b_iooffset;
+ len = bp->b_bcount;
+ bp->b_resid = len;
+ auio.uio_iov = &iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = off;
+ auio.uio_resid = len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_READ; /* reads only: the request direction in bp is not examined */
+ auio.uio_td = curthread;
+ error = VOP_READ(vp, &auio, IO_DIRECT | IO_NODELOCKED, bp->b_rcred);
+ bp->b_flags |= B_DONE;
+ if (error != 0) {
+ bp->b_ioflags |= BIO_ERROR;
+ bp->b_error = error;
+ }
+ return (0);
+}
+
+static struct vop_vector tarfs_znodeops = { /* vnode operations of the zio vnode created by tarfs_zio_init() */
+ .vop_default = &default_vnodeops, /* every operation not listed below */
+
+ .vop_access = tarfs_zaccess,
+ .vop_getattr = tarfs_zgetattr,
+ .vop_read = tarfs_zread,
+ .vop_reclaim = tarfs_zreclaim,
+ .vop_strategy = tarfs_zstrategy,
+};
+VFS_VOP_VECTOR_REGISTER(tarfs_znodeops);
+
+/*
+ * Initializes the decompression layer.
+ *
+ * Allocates the zio and a 128-entry frame index whose first entry is
+ * (i, o), which also become the current input and output positions.
+ * Then creates a regular-file vnode ("tarfsz") backed by the zio and
+ * publishes both through tmp->zio and tmp->znode.  Returns the new zio.
+ *
+ * NOTE(review): the return value of getnewvnode() is not checked before
+ * zvp is used -- confirm it cannot fail here.
+ */
+static struct tarfs_zio *
+tarfs_zio_init(struct tarfs_mount *tmp, off_t i, off_t o)
+{
+ struct tarfs_zio *zio;
+ struct vnode *zvp;
+
+ zio = malloc(sizeof(*zio), M_TARFSZSTATE, M_ZERO | M_WAITOK);
+ TARFS_DPF(ALLOC, "%s: allocated zio\n", __func__);
+ zio->tmp = tmp;
+ zio->szidx = 128; /* initial capacity; tarfs_zio_update_index() doubles it on demand */
+ zio->idx = malloc(zio->szidx * sizeof(*zio->idx), M_TARFSZSTATE,
+ M_ZERO | M_WAITOK);
+ zio->curidx = 0;
+ zio->nidx = 1; /* entry 0 is filled in right below */
+ zio->idx[zio->curidx].i = zio->ipos = i;
+ zio->idx[zio->curidx].o = zio->opos = o;
+ tmp->zio = zio;
+ TARFS_DPF(ALLOC, "%s: allocated zio index\n", __func__);
+ getnewvnode("tarfsz", tmp->vfs, &tarfs_znodeops, &zvp);
+ zvp->v_data = zio;
+ zvp->v_type = VREG;
+ zvp->v_mount = tmp->vfs;
+ vn_set_state(zvp, VSTATE_CONSTRUCTED);
+ tmp->znode = zvp;
+ TARFS_DPF(ZIO, "%s: created zio node\n", __func__);
+ return (zio);
+}
+
+/*
+ * Initializes the I/O layer, including decompression if the signature of
+ * a supported compression format is detected. Returns 0 on success and a
+ * positive errno value on failure.
+ *
+ * The first tmp->iosize bytes of the tarball are read raw into a zeroed
+ * scratch block and compared against the known magic numbers.  xz and
+ * zlib archives are rejected with EOPNOTSUPP, as are zstd archives when
+ * the kernel lacks ZSTDIO.  Anything unrecognized is treated as an
+ * uncompressed archive and needs no further setup.  The scratch block is
+ * released on every path, including a failed probe read.
+ */
+int
+tarfs_io_init(struct tarfs_mount *tmp)
+{
+ uint8_t *block;
+ struct tarfs_zio *zio = NULL;
+ ssize_t res;
+ int error = 0;
+
+ block = malloc(tmp->iosize, M_TEMP, M_ZERO | M_WAITOK);
+ res = tarfs_io_read_buf(tmp, true, block, 0, tmp->iosize);
+ if (res < 0) {
+ /* res is a negated errno; leave through "bad" so block is freed */
+ error = -res;
+ goto bad;
+ }
+ /* a short read is harmless: the block was zeroed and will not match */
+ if (memcmp(block, XZ_MAGIC, sizeof(XZ_MAGIC)) == 0) {
+ printf("xz compression not supported\n");
+ error = EOPNOTSUPP;
+ goto bad;
+ } else if (memcmp(block, ZLIB_MAGIC, sizeof(ZLIB_MAGIC)) == 0) {
+ printf("zlib compression not supported\n");
+ error = EOPNOTSUPP;
+ goto bad;
+ } else if (memcmp(block, ZSTD_MAGIC, sizeof(ZSTD_MAGIC)) == 0) {
+#ifdef ZSTDIO
+ zio = tarfs_zio_init(tmp, 0, 0);
+ zio->zstd = malloc(sizeof(*zio->zstd), M_TARFSZSTATE, M_WAITOK);
+ zio->zstd->zds = ZSTD_createDStream_advanced(tarfs_zstd_mem);
+ (void)ZSTD_initDStream(zio->zstd->zds);
+#else
+ printf("zstd compression not supported\n");
+ error = EOPNOTSUPP;
+ goto bad;
+#endif
+ }
+bad: /* common exit, also taken on success */
+ free(block, M_TEMP);
+ return (error);
+}
+
+/*
+ * Tears down the decompression layer.
+ *
+ * If a zio vnode exists it is locked exclusively, detached from the
+ * mount, destroyed with vgone() and released.  The zstd state (if any),
+ * the frame index and the zio itself are then freed and tmp->zio is
+ * cleared.  Returns 0 on success; if the zio vnode cannot be locked the
+ * error is returned and nothing is freed.
+ */
+static int
+tarfs_zio_fini(struct tarfs_mount *tmp)
+{
+ struct tarfs_zio *zio = tmp->zio;
+ int error = 0;
+
+ if (tmp->znode != NULL) {
+ error = vn_lock(tmp->znode, LK_EXCLUSIVE);
+ if (error != 0) {
+ TARFS_DPF(ALLOC, "%s: failed to lock znode", __func__);
+ return (error);
+ }
+ tmp->znode->v_mount = NULL;
+ vgone(tmp->znode);
+ vput(tmp->znode); /* drops the lock taken above along with the reference */
+ tmp->znode = NULL;
+ }
+#ifdef ZSTDIO
+ if (zio->zstd != NULL) {
+ TARFS_DPF(ALLOC, "%s: freeing zstd state\n", __func__);
+ ZSTD_freeDStream(zio->zstd->zds);
+ free(zio->zstd, M_TARFSZSTATE);
+ }
+#endif
+ if (zio->idx != NULL) {
+ TARFS_DPF(ALLOC, "%s: freeing index\n", __func__);
+ free(zio->idx, M_TARFSZSTATE);
+ }
+ TARFS_DPF(ALLOC, "%s: freeing zio\n", __func__);
+ free(zio, M_TARFSZSTATE);
+ tmp->zio = NULL;
+ return (error);
+}
+
+/*
+ * Shuts the I/O layer down.  Only the decompression layer holds any
+ * state, so a mount without one has nothing to release and reports
+ * success; otherwise the result of tearing that layer down is returned.
+ */
+int
+tarfs_io_fini(struct tarfs_mount *tmp)
+{
+
+ if (tmp->zio == NULL)
+ return (0);
+ return (tarfs_zio_fini(tmp));
+}
diff --git a/sys/fs/tarfs/tarfs_subr.c b/sys/fs/tarfs/tarfs_subr.c
new file mode 100644
index 000000000000..d4bd4e702e08
--- /dev/null
+++ b/sys/fs/tarfs/tarfs_subr.c
@@ -0,0 +1,603 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022-2023 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_tarfs.h"
+
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/fcntl.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <vm/vm_param.h>
+
+#include <fs/tarfs/tarfs.h>
+#include <fs/tarfs/tarfs_dbg.h>
+
+/* Malloc types for node / link names and for per-file block maps. */
+MALLOC_DEFINE(M_TARFSNAME, "tarfs name", "tarfs file names");
+MALLOC_DEFINE(M_TARFSBLK, "tarfs blk", "tarfs block maps");
+
+/* Root of the vfs.tarfs sysctl tree. */
+SYSCTL_NODE(_vfs, OID_AUTO, tarfs, CTLFLAG_RW, 0, "Tar filesystem");
+
+/* Preferred I/O size (log 2); exported and clamped via vfs.tarfs.ioshift. */
+unsigned int tarfs_ioshift = TARFS_IOSHIFT_DEFAULT;
+
+/*
+ * Sysctl handler for an I/O shift value stored in the unsigned int that
+ * arg1 points to.  The current value is always copied out.  If a new
+ * value is supplied, 0 selects TARFS_IOSHIFT_DEFAULT and the result is
+ * clamped to [TARFS_IOSHIFT_MIN, TARFS_IOSHIFT_MAX] before being stored.
+ */
+static int
+tarfs_sysctl_handle_ioshift(SYSCTL_HANDLER_ARGS)
+{
+	unsigned int *valp = arg1;
+	unsigned int val;
+	int error;
+
+	val = *valp;
+	error = SYSCTL_OUT(req, &val, sizeof(val));
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	error = SYSCTL_IN(req, &val, sizeof(val));
+	if (error != 0)
+		return (error);
+
+	/* zero means "use the default" */
+	if (val == 0)
+		val = TARFS_IOSHIFT_DEFAULT;
+	if (val < TARFS_IOSHIFT_MIN)
+		val = TARFS_IOSHIFT_MIN;
+	if (val > TARFS_IOSHIFT_MAX)
+		val = TARFS_IOSHIFT_MAX;
+	*valp = val;
+	return (0);
+}
+
+/* vfs.tarfs.ioshift: read/write tunable backed by tarfs_ioshift. */
+SYSCTL_PROC(_vfs_tarfs, OID_AUTO, ioshift,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW | CTLFLAG_TUN,
+ &tarfs_ioshift, 0, tarfs_sysctl_handle_ioshift, "IU",
+ "Tar filesystem preferred I/O size (log 2)");
+
+#ifdef TARFS_DEBUG
+/* vfs.tarfs.debug: debug mask, only present in TARFS_DEBUG builds. */
+int tarfs_debug;
+SYSCTL_INT(_vfs_tarfs, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_TUN,
+ &tarfs_debug, 0, "Tar filesystem debug mask");
+#endif /* TARFS_DEBUG */
+
+/*
+ * Recursively prints the names of all entries below the directory tnp,
+ * one per line, indented by four columns per level.  Does nothing if
+ * tnp is not a directory.
+ */
+static void
+tarfs_dump_tree_internal(struct tarfs_node *tnp, int indent)
+{
+	struct tarfs_node *child;
+
+	if (tnp->type != VDIR)
+		return;
+
+	TAILQ_FOREACH(child, &tnp->dir.dirhead, dirents) {
+		printf("%*s%s\n", indent * 4, "",
+		    child->name != NULL ? child->name : "<<root>>");
+		if (child->type == VDIR)
+			tarfs_dump_tree_internal(child, indent + 1);
+	}
+}
+
+/*
+ * Prints the name of tnp followed by the tree below it.  A node without
+ * a name is shown as "<<root>>".  A NULL tnp is silently ignored.
+ */
+void
+tarfs_dump_tree(struct tarfs_node *tnp)
+{
+
+	if (tnp == NULL)
+		return;
+
+	printf("%s\n", tnp->name != NULL ? tnp->name : "<<root>>");
+	tarfs_dump_tree_internal(tnp, 1);
+}
+
+/*
+ * Prints the fields of a tarfs node, one per line, for debugging.  A
+ * NULL node is silently ignored.  Type-specific fields are printed for
+ * directories (readdir cache) and for block / character devices (rdev).
+ */
+void
+tarfs_print_node(struct tarfs_node *tnp)
+{
+
+ if (tnp == NULL)
+ return;
+
+ printf("%s: node %p\n", __func__, tnp);
+ printf("\tvnode %p\n", tnp->vnode);
+ printf("\ttmp %p\n", tnp->tmp);
+ printf("\ttype %d\n", tnp->type);
+ printf("\tino %lu\n", tnp->ino);
+ printf("\tsize %zu\n", tnp->size);
+ printf("\tname %s\n",
+ (tnp->name == NULL) ? "<<root>>" : tnp->name);
+ printf("\tnamelen %zu\n", tnp->namelen);
+ printf("\tuid %d\n", tnp->uid);
+ printf("\tgid %d\n", tnp->gid);
+ printf("\tmode o%o\n", tnp->mode);
+ printf("\tflags %u\n", tnp->flags);
+ printf("\tnlink %lu\n", tnp->nlink);
+ /* NOTE(review): timestamps are truncated to int for printing */
+ printf("\tatime %d\n", (int)tnp->atime.tv_sec);
+ printf("\tmtime %d\n", (int)tnp->mtime.tv_sec);
+ printf("\tctime %d\n", (int)tnp->ctime.tv_sec);
+ printf("\tbirthtime %d\n", (int)tnp->birthtime.tv_sec);
+ printf("\tgen %lu\n", tnp->gen);
+ printf("\tparent %p\n", tnp->parent);
+
+ switch (tnp->type) {
+ case VDIR:
+ printf("\tdir.lastcookie %jd\n",
+ tnp->dir.lastcookie);
+ printf("\tdir.lastnode %p\n", tnp->dir.lastnode);
+ break;
+ case VBLK:
+ case VCHR:
+ printf("\trdev %lu\n", tnp->rdev);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * Looks up the name described by cnp among the entries of directory tnp.
+ *
+ * If f is not NULL, only that specific entry is considered, i.e. the
+ * function checks whether f is a child of tnp with the requested name.
+ *
+ * If the matching entry is a regular file that refers to another node
+ * (a hard link), the node it refers to is returned instead.
+ *
+ * Returns the node found, or NULL if there is no match.
+ */
+struct tarfs_node *
+tarfs_lookup_node(struct tarfs_node *tnp, struct tarfs_node *f,
+    struct componentname *cnp)
+{
+	struct tarfs_node *entry;
+
+	TARFS_DPF(LOOKUP, "%s: name: %.*s\n", __func__, (int)cnp->cn_namelen,
+	    cnp->cn_nameptr);
+
+	/* TAILQ_FOREACH leaves entry NULL if the loop runs to completion */
+	TAILQ_FOREACH(entry, &tnp->dir.dirhead, dirents) {
+		if (f != NULL && entry != f)
+			continue;
+
+		if (entry->namelen == cnp->cn_namelen &&
+		    bcmp(entry->name, cnp->cn_nameptr,
+		    entry->namelen) == 0)
+			break;
+	}
+
+	if (entry == NULL) {
+		TARFS_DPF(LOOKUP, "%s: no match found\n", __func__);
+		return (NULL);
+	}
+
+	if (entry->type == VREG && entry->other != NULL) {
+		/*
+		 * Unconditional trace: TARFS_DPF_IFF() expects a
+		 * condition as its second argument, which this call
+		 * never supplied.
+		 */
+		TARFS_DPF(LOOKUP, "%s: following hard link %p\n",
+		    __func__, entry);
+		entry = entry->other;
+	}
+	TARFS_DPF(LOOKUP, "%s: found tarfs_node %p\n", __func__,
+	    entry);
+	return (entry);
+}
+
+/*
+ * Finds the entry of directory tnp whose inode number equals cookie.
+ * The directory's cached (lastcookie, lastnode) pair is consulted
+ * first; this function does not update it.  Returns NULL if no entry
+ * matches.
+ */
+struct tarfs_node *
+tarfs_lookup_dir(struct tarfs_node *tnp, off_t cookie)
+{
+	struct tarfs_node *child;
+
+	TARFS_DPF(LOOKUP, "%s: tarfs_node %p, cookie %jd\n", __func__, tnp,
+	    cookie);
+	TARFS_DPF(LOOKUP, "%s: name: %s\n", __func__,
+	    (tnp->name == NULL) ? "<<root>>" : tnp->name);
+
+	/* fast path: same cookie as last time */
+	if (tnp->dir.lastnode != NULL &&
+	    tnp->dir.lastcookie == cookie) {
+		TARFS_DPF(LOOKUP, "%s: Using cached entry: tarfs_node %p, "
+		    "cookie %jd\n", __func__, tnp->dir.lastnode,
+		    tnp->dir.lastcookie);
+		return (tnp->dir.lastnode);
+	}
+
+	/* slow path: linear scan of the directory */
+	TAILQ_FOREACH(child, &tnp->dir.dirhead, dirents) {
+		TARFS_DPF(LOOKUP, "%s: tarfs_node %p, current %p, ino %lu\n",
+		    __func__, tnp, child, child->ino);
+		TARFS_DPF_IFF(LOOKUP, child->name != NULL,
+		    "%s: name: %s\n", __func__, child->name);
+		if (child->ino != cookie)
+			continue;
+		TARFS_DPF(LOOKUP, "%s: Found entry: tarfs_node %p, "
+		    "cookie %lu\n", __func__, child,
+		    child->ino);
+		return (child);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Allocates and initializes a tarfs node and links it into the tree.
+ *
+ * name / namelen give the node's own name (copied and NUL-terminated;
+ * may be empty, in which case the node has no name).  off and sz are
+ * stored as the node's data offset and size.  For VLNK, sz is also the
+ * length of linkname, which is copied.  For VREG, a single-entry block
+ * map covering the whole file is created.  For VBLK / VCHR, rdev is
+ * recorded.  Any other type except VDIR and VFIFO panics.
+ *
+ * If parent is not NULL, the node gets an inode number from the mount's
+ * unit number allocator and is appended to the parent's entry list under
+ * the parent's node lock.  If parent is NULL, the node becomes its own
+ * parent; only a VDIR gets an inode number (TARFS_ROOTINO) in that case,
+ * so anything else trips the final MPASS.
+ *
+ * All allocations use M_WAITOK.  The new node is stored in *retnode and
+ * the function always returns 0.
+ */
+int
+tarfs_alloc_node(struct tarfs_mount *tmp, const char *name, size_t namelen,
+ enum vtype type, off_t off, size_t sz, time_t mtime, uid_t uid, gid_t gid,
+ mode_t mode, unsigned int flags, const char *linkname, dev_t rdev,
+ struct tarfs_node *parent, struct tarfs_node **retnode)
+{
+ struct tarfs_node *tnp;
+
+ TARFS_DPF(ALLOC, "%s(%.*s)\n", __func__, (int)namelen, name);
+
+ tnp = malloc(sizeof(struct tarfs_node), M_TARFSNODE, M_WAITOK | M_ZERO);
+ mtx_init(&tnp->lock, "tarfs node lock", NULL, MTX_DEF);
+ tnp->gen = arc4random();
+ tnp->tmp = tmp;
+ if (namelen > 0) {
+ tnp->name = malloc(namelen + 1, M_TARFSNAME, M_WAITOK);
+ tnp->namelen = namelen;
+ memcpy(tnp->name, name, namelen);
+ tnp->name[namelen] = '\0';
+ }
+ tnp->type = type;
+ tnp->uid = uid;
+ tnp->gid = gid;
+ tnp->mode = mode;
+ tnp->nlink = 1;
+ /* atime / birthtime are "now"; mtime / ctime come from the caller */
+ vfs_timestamp(&tnp->atime);
+ tnp->mtime.tv_sec = mtime;
+ tnp->birthtime = tnp->atime;
+ tnp->ctime = tnp->mtime;
+ if (parent != NULL) {
+ tnp->ino = alloc_unr(tmp->ino_unr);
+ }
+ tnp->offset = off;
+ tnp->size = tnp->physize = sz;
+ switch (type) {
+ case VDIR:
+ MPASS(parent != tnp);
+ MPASS(parent != NULL || tmp->root == NULL);
+ TAILQ_INIT(&tnp->dir.dirhead);
+ tnp->nlink++;
+ if (parent == NULL) {
+ tnp->ino = TARFS_ROOTINO;
+ }
+ tnp->physize = 0;
+ break;
+ case VLNK:
+ tnp->link.name = malloc(sz + 1, M_TARFSNAME,
+ M_WAITOK);
+ tnp->link.namelen = sz;
+ memcpy(tnp->link.name, linkname, sz);
+ tnp->link.name[sz] = '\0';
+ break;
+ case VREG:
+ /* create dummy block map */
+ tnp->nblk = 1;
+ tnp->blk = malloc(sizeof(*tnp->blk), M_TARFSBLK, M_WAITOK);
+ tnp->blk[0].i = 0;
+ tnp->blk[0].o = 0;
+ tnp->blk[0].l = tnp->physize;
+ break;
+ case VFIFO:
+ /* Nothing extra to do */
+ break;
+ case VBLK:
+ case VCHR:
+ tnp->rdev = rdev;
+ tnp->physize = 0;
+ break;
+ default:
+ panic("%s: type %d not allowed", __func__, type);
+ }
+ if (parent != NULL) {
+ MPASS(parent->type == VDIR);
+ TARFS_NODE_LOCK(parent);
+ TAILQ_INSERT_TAIL(&parent->dir.dirhead, tnp, dirents);
+ parent->size += sizeof(struct tarfs_node);
+ tnp->parent = parent;
+ if (type == VDIR) {
+ parent->nlink++;
+ }
+ TARFS_NODE_UNLOCK(parent);
+ } else {
+ tnp->parent = tnp;
+ }
+ MPASS(tnp->ino != 0);
+
+ TARFS_ALLNODES_LOCK(tmp);
+ TAILQ_INSERT_TAIL(&tmp->allnodes, tnp, entries);
+ TARFS_ALLNODES_UNLOCK(tmp);
+
+ *retnode = tnp;
+ tmp->nfiles++;
+ return (0);
+}
+
+#define is09(ch) ((ch) >= '0' && (ch) <= '9')
+
+/*
+ * Loads the block map stored at the start of a sparse file's data and
+ * installs it in the node.
+ *
+ * The map is text: a decimal block count on the first line, followed by
+ * one (offset, length) pair of lines per block.  It is read one
+ * TARFS_BLOCKSIZE block at a time until enough newlines have been seen.
+ * Each entry is checked for alignment, ordering, and for fitting within
+ * both the physical data and realsize.
+ *
+ * On success the node's previous block map is replaced, its size is set
+ * to realsize, and 0 is returned.  On failure the node is left
+ * untouched and a positive errno value is returned: the I/O error from
+ * tarfs_io_read_buf(), EIO on a short read, or EINVAL for a malformed
+ * or inconsistent map.
+ */
+int
+tarfs_load_blockmap(struct tarfs_node *tnp, size_t realsize)
+{
+	struct tarfs_blk *blk = NULL;
+	char *map = NULL;
+	size_t nmap = 0, nblk = 0;
+	char *p, *q;
+	ssize_t res;
+	unsigned int i;
+	long n;
+	int error;
+
+	/* anything that is not an I/O failure is a malformed map */
+	error = EINVAL;
+
+	/*
+	 * Load the entire map into memory.  We don't know how big it is,
+	 * but as soon as we start reading it we will know how many
+	 * entries it contains, and then we can count newlines.
+	 */
+	do {
+		nmap++;
+		if (tnp->size < nmap * TARFS_BLOCKSIZE) {
+			TARFS_DPF(MAP, "%s: map too large\n", __func__);
+			goto bad;
+		}
+		/* grow the map */
+		map = realloc(map, nmap * TARFS_BLOCKSIZE + 1, M_TARFSBLK,
+		    M_ZERO | M_WAITOK);
+		/* read an additional block */
+		res = tarfs_io_read_buf(tnp->tmp, false,
+		    map + (nmap - 1) * TARFS_BLOCKSIZE,
+		    tnp->offset + (nmap - 1) * TARFS_BLOCKSIZE,
+		    TARFS_BLOCKSIZE);
+		if (res < 0) {
+			/* go through the common exit so map is released */
+			error = -res;
+			goto bad;
+		} else if (res < TARFS_BLOCKSIZE) {
+			error = EIO;
+			goto bad;
+		}
+		map[nmap * TARFS_BLOCKSIZE] = '\0'; /* sentinel */
+		if (nblk == 0) {
+			n = strtol(p = map, &q, 10);
+			if (q == p || *q != '\n' || n < 1)
+				goto syntax;
+			nblk = n;
+		}
+		for (n = 0, p = map; *p != '\0'; ++p) {
+			if (*p == '\n') {
+				++n;
+			}
+		}
+		TARFS_DPF(MAP, "%s: %ld newlines in map\n", __func__, n);
+	} while (n < nblk * 2 + 1);
+	TARFS_DPF(MAP, "%s: block map length %zu\n", __func__, nblk);
+	blk = malloc(sizeof(*blk) * nblk, M_TARFSBLK, M_WAITOK | M_ZERO);
+	/* skip the count line; its newline was validated above */
+	p = strchr(map, '\n') + 1;
+	for (i = 0; i < nblk; i++) {
+		/* physical data starts right after the map and is contiguous */
+		if (i == 0)
+			blk[i].i = nmap * TARFS_BLOCKSIZE;
+		else
+			blk[i].i = blk[i - 1].i + blk[i - 1].l;
+		n = strtol(p, &q, 10);
+		if (q == p || *q != '\n' || n < 0)
+			goto syntax;
+		p = q + 1;
+		blk[i].o = n;
+		n = strtol(p, &q, 10);
+		if (q == p || *q != '\n' || n < 0)
+			goto syntax;
+		p = q + 1;
+		blk[i].l = n;
+		TARFS_DPF(MAP, "%s: %3d %12zu %12zu %12zu\n", __func__,
+		    i, blk[i].i, blk[i].o, blk[i].l);
+		/*
+		 * Check block alignment if the block is of non-zero
+		 * length (a zero-length block indicates the end of a
+		 * trailing hole).  Checking i indirectly checks the
+		 * previous block's l.  It's ok for the final block to
+		 * have an uneven length.
+		 */
+		if (blk[i].l == 0) {
+			TARFS_DPF(MAP, "%s: zero-length block\n", __func__);
+		} else if (blk[i].i % TARFS_BLOCKSIZE != 0 ||
+		    blk[i].o % TARFS_BLOCKSIZE != 0) {
+			TARFS_DPF(MAP, "%s: misaligned map entry\n", __func__);
+			goto bad;
+		}
+		/*
+		 * Check that this block starts after the end of the
+		 * previous one.
+		 */
+		if (i > 0 && blk[i].o < blk[i - 1].o + blk[i - 1].l) {
+			TARFS_DPF(MAP, "%s: overlapping map entries\n", __func__);
+			goto bad;
+		}
+		/*
+		 * Check that the block is within the file, both
+		 * physically and logically.
+		 */
+		if (blk[i].i + blk[i].l > tnp->physize ||
+		    blk[i].o + blk[i].l > realsize) {
+			TARFS_DPF(MAP, "%s: map overflow\n", __func__);
+			goto bad;
+		}
+	}
+	free(map, M_TARFSBLK);
+
+	/* store in node */
+	free(tnp->blk, M_TARFSBLK);
+	tnp->nblk = nblk;
+	tnp->blk = blk;
+	tnp->size = realsize;
+	return (0);
+syntax:
+	TARFS_DPF(MAP, "%s: syntax error in block map\n", __func__);
+bad:
+	/* either pointer may still be NULL here */
+	free(map, M_TARFSBLK);
+	free(blk, M_TARFSBLK);
+	return (error);
+}
+
+/*
+ * Releases a tarfs node and everything it owns: the symlink target (for
+ * VLNK), the name, the block map, the inode number (if it came from the
+ * mount's unit number allocator) and the node lock.  The mount's file
+ * count is decremented.
+ *
+ * The node is not unlinked from any list here; the caller is expected
+ * to have removed it from the mount's node list already.
+ */
+void
+tarfs_free_node(struct tarfs_node *tnp)
+{
+	struct tarfs_mount *tmp;
+
+	MPASS(tnp != NULL);
+	tmp = tnp->tmp;
+
+	switch (tnp->type) {
+	case VLNK:
+		if (tnp->link.name)
+			free(tnp->link.name, M_TARFSNAME);
+		break;
+	default:
+		break;
+	}
+	if (tnp->name != NULL)
+		free(tnp->name, M_TARFSNAME);
+	if (tnp->blk != NULL)
+		free(tnp->blk, M_TARFSBLK);
+	if (tnp->ino >= TARFS_MININO)
+		free_unr(tmp->ino_unr, tnp->ino);
+	/* pairs with the mtx_init() done when the node was allocated */
+	mtx_destroy(&tnp->lock);
+	free(tnp, M_TARFSNODE);
+	tmp->nfiles--;
+}
+
+/*
+ * Reads up to len bytes of file data into uiop, starting at
+ * uiop->uio_offset, by walking the node's block map.
+ *
+ * For each block that has not been passed yet, the gap between the
+ * current offset and the start of the block (a hole) is filled from
+ * zero_region, then data is read from the backing store up to the end
+ * of the block.  uiop's offset and residual count are advanced by the
+ * amount actually transferred.
+ *
+ * Returns 0 on success or the error from uiomove() / tarfs_io_read().
+ */
+int
+tarfs_read_file(struct tarfs_node *tnp, size_t len, struct uio *uiop)
+{
+	struct uio auio;
+	size_t resid = len;
+	size_t copylen;
+	unsigned int i;
+	int error;
+
+	TARFS_DPF(VNODE, "%s(%s, %zu, %zu)\n", __func__,
+	    tnp->name, uiop->uio_offset, resid);
+	for (i = 0; i < tnp->nblk && resid > 0; ++i) {
+		if (uiop->uio_offset > tnp->blk[i].o + tnp->blk[i].l) {
+			/* skip this block */
+			continue;
+		}
+		while (resid > 0 &&
+		    uiop->uio_offset < tnp->blk[i].o) {
+			/* move out some zeroes... */
+			copylen = tnp->blk[i].o - uiop->uio_offset;
+			if (copylen > resid)
+				copylen = resid;
+			if (copylen > ZERO_REGION_SIZE)
+				copylen = ZERO_REGION_SIZE;
+			/* work on a copy so the source offset can differ */
+			auio = *uiop;
+			auio.uio_offset = 0;
+			auio.uio_resid = copylen;
+			error = uiomove(__DECONST(void *, zero_region),
+			    copylen, &auio);
+			if (error != 0)
+				return (error);
+			TARFS_DPF(MAP, "%s(%s) = zero %zu\n", __func__,
+			    tnp->name, copylen - auio.uio_resid);
+			uiop->uio_offset += copylen - auio.uio_resid;
+			uiop->uio_resid -= copylen - auio.uio_resid;
+			resid -= copylen - auio.uio_resid;
+		}
+		while (resid > 0 &&
+		    uiop->uio_offset < tnp->blk[i].o + tnp->blk[i].l) {
+			/*
+			 * Now actual data.  Only what is left of this
+			 * block may be read: the offset can be in the
+			 * middle of the block, and whatever follows it
+			 * in the backing store belongs to the next
+			 * block, not to this part of the file.
+			 */
+			copylen = tnp->blk[i].o + tnp->blk[i].l -
+			    uiop->uio_offset;
+			if (copylen > resid)
+				copylen = resid;
+			auio = *uiop;
+			/* translate the file offset to a backing store offset */
+			auio.uio_offset = tnp->offset + tnp->blk[i].i +
+			    uiop->uio_offset - tnp->blk[i].o;
+			auio.uio_resid = copylen;
+			error = tarfs_io_read(tnp->tmp, false, &auio);
+			if (error != 0)
+				return (error);
+			TARFS_DPF(MAP, "%s(%s) = data %zu\n", __func__,
+			    tnp->name, copylen - auio.uio_resid);
+			uiop->uio_offset += copylen - auio.uio_resid;
+			uiop->uio_resid -= copylen - auio.uio_resid;
+			resid -= copylen - auio.uio_resid;
+		}
+	}
+	TARFS_DPF(VNODE, "%s(%s) = %zu\n", __func__,
+	    tnp->name, len - resid);
+	return (0);
+}
+
+/*
+ * XXX ugly file flag parser which could easily be a finite state machine
+ * driven by a small precomputed table.
+ *
+ * Note that unlike strtofflags(3), we make no attempt to handle negated
+ * flags, since they shouldn't appear in tar files.
+ *
+ * The table below maps each flag name recognized by tarfs_strtofflags()
+ * to the corresponding UF_* / SF_* bit.  It is terminated by an entry
+ * with a NULL name.
+ */
+static const struct tarfs_flag {
+ const char *name;
+ unsigned int flag;
+} tarfs_flags[] = {
+ { "nodump", UF_NODUMP },
+ { "uchg", UF_IMMUTABLE },
+ { "uappnd", UF_APPEND },
+ { "opaque", UF_OPAQUE },
+ { "uunlnk", UF_NOUNLINK },
+ { "arch", SF_ARCHIVED },
+ { "schg", SF_IMMUTABLE },
+ { "sappnd", SF_APPEND },
+ { "sunlnk", SF_NOUNLINK },
+ { NULL, 0 }, /* sentinel */
+};
+
+/*
+ * Parses a comma-separated list of file flag names and returns the
+ * corresponding flag bits.
+ *
+ * Parsing stops at the first character that is neither a lower-case
+ * letter nor a comma, or after the first name that is not found in
+ * tarfs_flags.  If end is not NULL, *end is set to the position where
+ * parsing stopped; the caller can compare it with the expected end of
+ * the string to detect errors.
+ */
+unsigned int
+tarfs_strtofflags(const char *str, char **end)
+{
+	const struct tarfs_flag *tf;
+	const char *p, *q;
+	unsigned int ret;
+
+	ret = 0;
+	for (p = q = str; *q != '\0'; p = q + 1) {
+		/* find the end of the current name */
+		for (q = p; *q != '\0' && *q != ','; ++q) {
+			if (*q < 'a' || *q > 'z') {
+				goto end;
+			}
+			/* nothing */
+		}
+		/* exact match: same prefix and same length */
+		for (tf = tarfs_flags; tf->name != NULL; tf++) {
+			if (strncmp(tf->name, p, q - p) == 0 &&
+			    tf->name[q - p] == '\0') {
+				TARFS_DPF(ALLOC, "%s: %.*s = 0x%06x\n", __func__,
+				    (int)(q - p), p, tf->flag);
+				ret |= tf->flag;
+				break;
+			}
+		}
+		if (tf->name == NULL) {
+			TARFS_DPF(ALLOC, "%s: %.*s = 0x??????\n",
+			    __func__, (int)(q - p), p);
+			goto end;
+		}
+	}
+end:
+	/* test the out-pointer itself, not what it currently points to */
+	if (end != NULL) {
+		*end = __DECONST(char *, q);
+	}
+	return (ret);
+}
diff --git a/sys/fs/tarfs/tarfs_vfsops.c b/sys/fs/tarfs/tarfs_vfsops.c
new file mode 100644
index 000000000000..fe135116c985
--- /dev/null
+++ b/sys/fs/tarfs/tarfs_vfsops.c
@@ -0,0 +1,1173 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022-2023 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_tarfs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sbuf.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+
+#include <vm/vm_param.h>
+
+#include <geom/geom.h>
+#include <geom/geom_vfs.h>
+
+#include <fs/tarfs/tarfs.h>
+#include <fs/tarfs/tarfs_dbg.h>
+
+CTASSERT(ZERO_REGION_SIZE > TARFS_BLOCKSIZE);
+
+/*
+ * Layout of a tar header.  All fields are fixed-width character arrays;
+ * together they occupy 500 bytes.  tarfs_alloc_one() overlays this
+ * structure on a TARFS_BLOCKSIZE-byte buffer.
+ */
+struct ustar_header {
+ char name[100]; /* File name */
+ char mode[8]; /* Mode flags */
+ char uid[8]; /* User id */
+ char gid[8]; /* Group id */
+ char size[12]; /* Size */
+ char mtime[12]; /* Modified time */
+ char checksum[8]; /* Checksum */
+ char typeflag[1]; /* Type */
+ char linkname[100]; /* "old format" stops here */
+ char magic[6]; /* POSIX UStar "ustar\0" indicator */
+ char version[2]; /* POSIX UStar version "00" */
+ char uname[32]; /* User name */
+ char gname[32]; /* Group name */
+ char major[8]; /* Device major number */
+ char minor[8]; /* Device minor number */
+ char prefix[155]; /* Path prefix */
+};
+
+#define TAR_EOF ((off_t)-1)
+
+#define TAR_TYPE_FILE '0'
+#define TAR_TYPE_HARDLINK '1'
+#define TAR_TYPE_SYMLINK '2'
+#define TAR_TYPE_CHAR '3'
+#define TAR_TYPE_BLOCK '4'
+#define TAR_TYPE_DIRECTORY '5'
+#define TAR_TYPE_FIFO '6'
+#define TAR_TYPE_CONTIG '7'
+#define TAR_TYPE_GLOBAL_EXTHDR 'g'
+#define TAR_TYPE_EXTHDR 'x'
+#define TAR_TYPE_GNU_SPARSE 'S'
+
+#define USTAR_MAGIC (uint8_t []){ 'u', 's', 't', 'a', 'r', 0 }
+#define USTAR_VERSION (uint8_t []){ '0', '0' }
+#define GNUTAR_MAGIC (uint8_t []){ 'u', 's', 't', 'a', 'r', ' ' }
+#define GNUTAR_VERSION (uint8_t []){ ' ', '\x0' }
+
+#define DEFDIRMODE (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)
+
+MALLOC_DEFINE(M_TARFSMNT, "tarfs mount", "tarfs mount structures");
+MALLOC_DEFINE(M_TARFSNODE, "tarfs node", "tarfs node structures");
+
+static vfs_mount_t tarfs_mount;
+static vfs_unmount_t tarfs_unmount;
+static vfs_root_t tarfs_root;
+static vfs_statfs_t tarfs_statfs;
+static vfs_fhtovp_t tarfs_fhtovp;
+
+static const char *tarfs_opts[] = {
+ "from", "gid", "mode", "uid", "verify",
+ NULL
+};
+
+/*
+ * Reads a len-width signed octal number from strp.  Returns the value.
+ *
+ * Leading spaces and tabs are skipped, an optional '-' is accepted, and
+ * parsing stops at the first character that is not an octal digit.  A
+ * field that is empty or all blanks yields 0.  Magnitudes that do not
+ * fit in an int64_t are clamped to INT64_MAX.
+ *
+ * XXX Does not report errors.
+ */
+static int64_t
+tarfs_str2octal(const char *strp, size_t len)
+{
+	int64_t val;
+	size_t idx;
+	int sign;
+
+	/*
+	 * Skip leading spaces or tabs.
+	 * XXX why?  POSIX requires numeric fields to be 0-padded.
+	 */
+	for (idx = 0; idx < len; idx++)
+		if (strp[idx] != ' ' && strp[idx] != '\t')
+			break;
+
+	if (idx == len)
+		return (0);
+
+	if (strp[idx] == '-') {
+		sign = -1;
+		idx++;
+	} else
+		sign = 1;
+
+	val = 0;
+	for (; idx < len; idx++) {
+		if (strp[idx] < '0' || strp[idx] > '7')
+			break;
+
+		/*
+		 * Truncate on overflow.  This is checked before shifting
+		 * in the next digit, so that every value up to and
+		 * including INT64_MAX is still represented exactly.
+		 */
+		if (val > INT64_MAX / 8) {
+			val = INT64_MAX;
+			break;
+		}
+		val <<= 3;
+		val += (strp[idx] - '0');
+	}
+
+	return (sign > 0) ? val : -val;
+}
+
+/*
+ * Reads a len-byte extended numeric value from strp.  The first byte has
+ * bit 7 set to indicate the format; the remaining 7 bits + the (len - 1)
+ * bytes that follow form a big-endian signed two's complement binary
+ * number.  Returns the value, clamped to INT64_MAX / INT64_MIN if it
+ * does not fit in an int64_t.  XXX Does not report errors.
+ */
+static int64_t
+tarfs_str2base256(const char *strp, size_t len)
+{
+	int64_t val;
+	size_t idx;
+
+	KASSERT(strp[0] & 0x80, ("not an extended numeric value"));
+
+	/*
+	 * Sign-extend the first byte: bit 6 is the sign bit of the
+	 * 7-bit two's complement quantity, bits 5..0 are its low bits.
+	 */
+	val = (strp[0] & 0x3f);
+	if ((strp[0] & 0x40) != 0)
+		val -= 64;
+
+	/* Read subsequent bytes */
+	for (idx = 1; idx < len; idx++) {
+		/*
+		 * Truncate on overflow and underflow.  The check is made
+		 * before the next byte is shifted in, so the arithmetic
+		 * below can never leave the range of int64_t.
+		 */
+		if (val > INT64_MAX / 256) {
+			val = INT64_MAX;
+			break;
+		} else if (val < INT64_MIN / 256) {
+			val = INT64_MIN;
+			break;
+		}
+		val = val * 256 + (unsigned char)strp[idx];
+	}
+
+	return (val);
+}
+
+/*
+ * Read a len-byte numeric field from strp.  If bit 7 of the first byte
+ * is set, the field is decoded as an extended numeric value (signed
+ * two's complement); otherwise it is decoded as a signed octal value.
+ * A zero-length field yields 0.
+ *
+ * XXX practically no error checking or handling
+ */
+static int64_t
+tarfs_str2int64(const char *strp, size_t len)
+{
+
+	if (len < 1)
+		return (0);
+
+	return ((strp[0] & 0x80) != 0 ?
+	    tarfs_str2base256(strp, len) : tarfs_str2octal(strp, len));
+}
+
+/*
+ * Verifies the checksum of a header.  Returns true if the checksum is
+ * valid, false otherwise.
+ *
+ * The sum covers every byte of the header, with each byte of the
+ * checksum field itself counted as a space (0x20).  The stored value is
+ * accepted if it matches the sum of the bytes taken either as unsigned
+ * or as signed values; some older formats use the latter, broken form
+ * of the calculation.
+ */
+static boolean_t
+tarfs_checksum(struct ustar_header *hdrp)
+{
+	const unsigned char *bp, *hdr_end, *sum_begin, *sum_end;
+	int64_t hdrsum, ssum, usum;
+
+	hdrsum = tarfs_str2int64(hdrp->checksum, sizeof(hdrp->checksum));
+	TARFS_DPF(CHECKSUM, "%s: header checksum %lx\n", __func__, hdrsum);
+
+	/* the checksum field ends where the typeflag field begins */
+	sum_begin = (const unsigned char *)hdrp->checksum;
+	sum_end = (const unsigned char *)hdrp->typeflag;
+	hdr_end = (const unsigned char *)(hdrp + 1);
+
+	/* compute both variants in a single pass */
+	usum = ssum = 0;
+	for (bp = (const unsigned char *)hdrp; bp < hdr_end; bp++) {
+		if (bp >= sum_begin && bp < sum_end) {
+			usum += 0x20;
+			ssum += 0x20;
+		} else {
+			usum += *bp;
+			ssum += *((const signed char *)bp);
+		}
+	}
+
+	TARFS_DPF(CHECKSUM, "%s: calc unsigned checksum %lx\n", __func__,
+	    usum);
+	if (hdrsum == usum)
+		return (true);
+
+	TARFS_DPF(CHECKSUM, "%s: calc signed checksum %lx\n", __func__,
+	    ssum);
+	return (hdrsum == ssum ? true : false);
+}
+
+
+/*
+ * Looks up a path in the tarfs node tree.
+ *
+ * - If the path exists, stores a pointer to the corresponding tarfs_node
+ *   in retnode and a pointer to its parent in retparent.
+ *
+ * - If the path does not exist, but create_dirs is true, creates ancestor
+ *   directories and returns NULL in retnode and the parent in retparent.
+ *
+ * - If the path does not exist and create_dirs is false, stops at the
+ *   first missing path name component.
+ *
+ * - In all cases, on return, endp and sepp point to the beginning and
+ *   end, respectively, of the last-processed path name component.
+ *
+ * - A ".." component that would climb above the root, or that follows a
+ *   component which does not exist, is rejected with EINVAL.
+ *
+ * - Returns 0 if the node was found, ENOENT if it was not, and some other
+ *   positive errno value on failure.
+ */
+static int
+tarfs_lookup_path(struct tarfs_mount *tmp, char *name, size_t namelen,
+    char **endp, char **sepp, struct tarfs_node **retparent,
+    struct tarfs_node **retnode, boolean_t create_dirs)
+{
+	struct componentname cn;
+	struct tarfs_node *parent, *tnp;
+	char *sep;
+	size_t len;
+	int error;
+	boolean_t do_lookup;
+
+	MPASS(name != NULL && namelen != 0);
+
+	do_lookup = true;
+	error = 0;
+	parent = tnp = tmp->root;
+	if (tnp == NULL)
+		panic("%s: root node not yet created", __func__);
+
+	bzero(&cn, sizeof(cn));
+
+	TARFS_DPF(LOOKUP, "%s: Full path: %.*s\n", __func__, (int)namelen,
+	    name);
+
+	sep = NULL;
+	for (;;) {
+		/*
+		 * Skip leading slash(es).  The length is checked first
+		 * so that we never look at a byte past the end of a
+		 * name that is not NUL-terminated.
+		 */
+		while (namelen > 0 && name[0] == '/')
+			name++, namelen--;
+
+		/* did we reach the end? */
+		if (namelen == 0 || name[0] == '\0') {
+			name = do_lookup ? NULL : cn.cn_nameptr;
+			namelen = do_lookup ? 0 : cn.cn_namelen;
+			break;
+		}
+
+		/* locate the next separator */
+		for (sep = name, len = 0;
+		    *sep != '\0' && *sep != '/' && len < namelen;
+		    sep++, len++)
+			/* nothing */ ;
+
+		/* check for . and .. */
+		if (name[0] == '.' && len <= 2) {
+			if (len == 1) {
+				/* . */
+				name += len;
+				namelen -= len;
+				continue;
+			} else if (name[1] == '.') {
+				/*
+				 * ..  There is nothing to go back to if
+				 * we are at the root, or if the previous
+				 * component was not found (tnp is NULL).
+				 */
+				if (tnp == NULL || tnp == tmp->root) {
+					error = EINVAL;
+					break;
+				}
+				tnp = tnp->parent;
+				parent = tnp->parent;
+				name += len;
+				namelen -= len;
+				continue;
+			}
+		}
+
+		/*
+		 * Create parent if necessary: once a lookup has failed,
+		 * cn still describes the missing component, which turns
+		 * out to be a directory since another component follows.
+		 */
+		if (!do_lookup) {
+			TARFS_DPF(ALLOC, "%s: creating %.*s\n", __func__,
+			    (int)cn.cn_namelen, cn.cn_nameptr);
+			error = tarfs_alloc_node(tmp, cn.cn_nameptr,
+			    cn.cn_namelen, VDIR, -1, 0, tmp->mtime, 0, 0,
+			    DEFDIRMODE, 0, NULL, NODEV, parent, &tnp);
+			if (error != 0)
+				break;
+		}
+
+		parent = tnp;
+		tnp = NULL;
+		cn.cn_nameptr = name;
+		cn.cn_namelen = len;
+		TARFS_DPF(LOOKUP, "%s: Search: %.*s\n", __func__,
+		    (int)cn.cn_namelen, cn.cn_nameptr);
+		if (do_lookup) {
+			tnp = tarfs_lookup_node(parent, NULL, &cn);
+			if (tnp == NULL) {
+				do_lookup = false;
+				if (!create_dirs)
+					break;
+			}
+		}
+		name += cn.cn_namelen;
+		namelen -= cn.cn_namelen;
+	}
+
+	TARFS_DPF(LOOKUP, "%s: Parent %p, node %p\n", __func__, parent, tnp);
+
+	if (retparent)
+		*retparent = parent;
+	if (retnode)
+		*retnode = tnp;
+	if (endp) {
+		if (namelen > 0)
+			*endp = name;
+		else
+			*endp = NULL;
+	}
+	if (sepp)
+		*sepp = sep;
+	return (error);
+}
+
+/*
+ * Releases a tarfs_mount structure together with everything hanging off
+ * it: all nodes, the I/O layer and the inode number allocator.  The
+ * mount point's private data pointer is cleared before the structure
+ * itself is freed.
+ */
+static void
+tarfs_free_mount(struct tarfs_mount *tmp)
+{
+	struct tarfs_node *tnp;
+
+	MPASS(tmp != NULL);
+
+	TARFS_DPF(ALLOC, "%s: Freeing mount structure %p\n", __func__, tmp);
+
+	TARFS_DPF(ALLOC, "%s: freeing tarfs_node structures\n", __func__);
+	/* drain the node list from the front */
+	while ((tnp = TAILQ_FIRST(&tmp->allnodes)) != NULL) {
+		TAILQ_REMOVE(&tmp->allnodes, tnp, entries);
+		tarfs_free_node(tnp);
+	}
+
+	(void)tarfs_io_fini(tmp);
+
+	TARFS_DPF(ALLOC, "%s: deleting unr header\n", __func__);
+	delete_unrhdr(tmp->ino_unr);
+	tmp->vfs->mnt_data = NULL;
+
+	TARFS_DPF(ALLOC, "%s: freeing structure\n", __func__);
+	free(tmp, M_TARFSMNT);
+}
+
+/*
+ * Processes the tar file header at block offset blknump and allocates and
+ * populates a tarfs_node structure for the file it describes. Updates
+ * blknump to point to the next unread tar file block, or TAR_EOF if EOF
+ * is reached. Returns 0 on success or EOF and a positive errno value on
+ * failure.
+ */
+static int
+tarfs_alloc_one(struct tarfs_mount *tmp, off_t *blknump)
+{
+ char block[TARFS_BLOCKSIZE];
+ struct ustar_header *hdrp = (struct ustar_header *)block;
+ struct sbuf *namebuf = NULL;
+ char *exthdr = NULL, *name = NULL, *link = NULL;
+ off_t blknum = *blknump;
+ int endmarker = 0;
+ char *namep, *sep;
+ struct tarfs_node *parent, *tnp;
+ size_t namelen = 0, linklen = 0, realsize = 0, sz;
+ ssize_t res;
+ dev_t rdev;
+ gid_t gid;
+ mode_t mode;
+ time_t mtime;
+ uid_t uid;
+ long major = -1, minor = -1;
+ unsigned int flags = 0;
+ int error;
+ boolean_t sparse = false;
+
+again:
+ /* read next header */
+ res = tarfs_io_read_buf(tmp, false, block,
+ TARFS_BLOCKSIZE * blknum, TARFS_BLOCKSIZE);
+ if (res < 0) {
+ error = -res;
+ goto bad;
+ } else if (res < TARFS_BLOCKSIZE) {
+ goto eof;
+ }
+ blknum++;
+
+ /* check for end marker */
+ if (memcmp(block, zero_region, TARFS_BLOCKSIZE) == 0) {
+ if (endmarker++) {
+ if (exthdr != NULL) {
+ TARFS_DPF(IO, "%s: orphaned extended header at %zu\n",
+ __func__, TARFS_BLOCKSIZE * (blknum - 1));
+ free(exthdr, M_TEMP);
+ }
+ TARFS_DPF(IO, "%s: end of archive at %zu\n", __func__,
+ TARFS_BLOCKSIZE * blknum);
+ tmp->nblocks = blknum;
+ *blknump = TAR_EOF;
+ return (0);
+ }
+ goto again;
+ }
+
+ /* verify magic */
+ if (memcmp(hdrp->magic, USTAR_MAGIC, sizeof(USTAR_MAGIC)) == 0 &&
+ memcmp(hdrp->version, USTAR_VERSION, sizeof(USTAR_VERSION)) == 0) {
+ /* POSIX */
+ } else if (memcmp(hdrp->magic, GNUTAR_MAGIC, sizeof(GNUTAR_MAGIC)) == 0 &&
+ memcmp(hdrp->magic, GNUTAR_MAGIC, sizeof(GNUTAR_MAGIC)) == 0) {
+ TARFS_DPF(ALLOC, "%s: GNU tar format at %zu\n", __func__,
+ TARFS_BLOCKSIZE * (blknum - 1));
+ error = EFTYPE;
+ goto bad;
+ } else {
+ TARFS_DPF(ALLOC, "%s: unsupported TAR format at %zu\n",
+ __func__, TARFS_BLOCKSIZE * (blknum - 1));
+ error = EINVAL;
+ goto bad;
+ }
+
+ /* verify checksum */
+ if (!tarfs_checksum(hdrp)) {
+ TARFS_DPF(ALLOC, "%s: header checksum failed at %zu\n",
+ __func__, TARFS_BLOCKSIZE * (blknum - 1));
+ error = EINVAL;
+ goto bad;
+ }
+
+ /* get standard attributes */
+ mode = tarfs_str2int64(hdrp->mode, sizeof(hdrp->mode));
+ uid = tarfs_str2int64(hdrp->uid, sizeof(hdrp->uid));
+ gid = tarfs_str2int64(hdrp->gid, sizeof(hdrp->gid));
+ sz = tarfs_str2int64(hdrp->size, sizeof(hdrp->size));
+ mtime = tarfs_str2int64(hdrp->mtime, sizeof(hdrp->mtime));
+ rdev = NODEV;
+ TARFS_DPF(ALLOC, "%s: [%c] %zu @%jd %o %d:%d\n", __func__,
+ hdrp->typeflag[0], sz, (intmax_t)mtime, mode, uid, gid);
+
+ /* extended header? */
+ if (hdrp->typeflag[0] == TAR_TYPE_GLOBAL_EXTHDR) {
+ printf("%s: unsupported global extended header at %zd\n",
+ __func__, TARFS_BLOCKSIZE * (blknum - 1));
+ error = EFTYPE;
+ goto bad;
+ }
+ if (hdrp->typeflag[0] == TAR_TYPE_EXTHDR) {
+ if (exthdr != NULL) {
+ TARFS_DPF(IO, "%s: multiple extended headers at %zu\n",
+ __func__, TARFS_BLOCKSIZE * (blknum - 1));
+ error = EFTYPE;
+ goto bad;
+ }
+ /* read the contents of the exthdr */
+ TARFS_DPF(ALLOC, "%s: %zu-byte extended header at %zd\n",
+ __func__, sz, TARFS_BLOCKSIZE * (blknum - 1));
+ exthdr = malloc(sz, M_TEMP, M_WAITOK);
+ res = tarfs_io_read_buf(tmp, false, exthdr,
+ TARFS_BLOCKSIZE * blknum, sz);
+ if (res < 0) {
+ error = -res;
+ goto bad;
+ }
+ if (res < sz) {
+ goto eof;
+ }
+ blknum += TARFS_SZ2BLKS(res);
+ /* XXX TODO: refactor this parser */
+ char *line = exthdr;
+ while (line < exthdr + sz) {
+ char *eol, *key, *value, *sep;
+ size_t len = strtoul(line, &sep, 10);
+ if (len == 0 || sep == line || *sep != ' ') {
+ TARFS_DPF(ALLOC, "%s: exthdr syntax error\n",
+ __func__);
+ error = EINVAL;
+ goto bad;
+ }
+ if (line + len > exthdr + sz) {
+ TARFS_DPF(ALLOC, "%s: exthdr overflow\n",
+ __func__);
+ error = EINVAL;
+ goto bad;
+ }
+ eol = line + len - 1;
+ *eol = '\0';
+ line += len;
+ key = sep + 1;
+ sep = strchr(key, '=');
+ if (sep == NULL) {
+ TARFS_DPF(ALLOC, "%s: exthdr syntax error\n",
+ __func__);
+ error = EINVAL;
+ goto bad;
+ }
+ *sep = '\0';
+ value = sep + 1;
+ TARFS_DPF(ALLOC, "%s: exthdr %s=%s\n", __func__,
+ key, value);
+ if (strcmp(key, "linkpath") == 0) {
+ link = value;
+ linklen = eol - value;
+ } else if (strcmp(key, "GNU.sparse.major") == 0) {
+ sparse = true;
+ major = strtol(value, &sep, 10);
+ if (sep != eol) {
+ printf("exthdr syntax error\n");
+ error = EINVAL;
+ goto bad;
+ }
+ } else if (strcmp(key, "GNU.sparse.minor") == 0) {
+ sparse = true;
+ minor = strtol(value, &sep, 10);
+ if (sep != eol) {
+ printf("exthdr syntax error\n");
+ error = EINVAL;
+ goto bad;
+ }
+ } else if (strcmp(key, "GNU.sparse.name") == 0) {
+ sparse = true;
+ name = value;
+ namelen = eol - value;
+ if (namelen == 0) {
+ printf("exthdr syntax error\n");
+ error = EINVAL;
+ goto bad;
+ }
+ } else if (strcmp(key, "GNU.sparse.realsize") == 0) {
+ sparse = true;
+ realsize = strtoul(value, &sep, 10);
+ if (sep != eol) {
+ printf("exthdr syntax error\n");
+ error = EINVAL;
+ goto bad;
+ }
+ } else if (strcmp(key, "SCHILY.fflags") == 0) {
+ flags |= tarfs_strtofflags(value, &sep);
+ if (sep != eol) {
+ printf("exthdr syntax error\n");
+ error = EINVAL;
+ goto bad;
+ }
+ }
+ }
+ goto again;
+ }
+
+ /* sparse file consistency checks */
+ if (sparse) {
+ TARFS_DPF(ALLOC, "%s: %s: sparse %ld.%ld (%zu bytes)\n", __func__,
+ name, major, minor, realsize);
+ if (major != 1 || minor != 0 || name == NULL || realsize == 0 ||
+ hdrp->typeflag[0] != TAR_TYPE_FILE) {
+ TARFS_DPF(ALLOC, "%s: invalid sparse format\n", __func__);
+ error = EINVAL;
+ goto bad;
+ }
+ }
+
+ /* file name */
+ if (name == NULL) {
+ if (hdrp->prefix[0] != '\0') {
+ namebuf = sbuf_new_auto();
+ sbuf_printf(namebuf, "%.*s/%.*s",
+ (int)sizeof(hdrp->prefix), hdrp->prefix,
+ (int)sizeof(hdrp->name), hdrp->name);
+ sbuf_finish(namebuf);
+ name = sbuf_data(namebuf);
+ namelen = sbuf_len(namebuf);
+ } else {
+ name = hdrp->name;
+ namelen = strnlen(hdrp->name, sizeof(hdrp->name));
+ }
+ }
+
+ error = tarfs_lookup_path(tmp, name, namelen, &namep,
+ &sep, &parent, &tnp, true);
+ if (error != 0)
+ goto bad;
+ if (tnp != NULL) {
+ if (hdrp->typeflag[0] == TAR_TYPE_DIRECTORY) {
+ /* XXX set attributes? */
+ goto skip;
+ }
+ TARFS_DPF(ALLOC, "%s: duplicate file %.*s\n", __func__,
+ (int)namelen, name);
+ error = EINVAL;
+ goto bad;
+ }
+ switch (hdrp->typeflag[0]) {
+ case TAR_TYPE_DIRECTORY:
+ error = tarfs_alloc_node(tmp, namep, sep - namep, VDIR,
+ 0, 0, mtime, uid, gid, mode, flags, NULL, 0,
+ parent, &tnp);
+ break;
+ case TAR_TYPE_FILE:
+ error = tarfs_alloc_node(tmp, namep, sep - namep, VREG,
+ blknum * TARFS_BLOCKSIZE, sz, mtime, uid, gid, mode,
+ flags, NULL, 0, parent, &tnp);
+ if (error == 0 && sparse) {
+ error = tarfs_load_blockmap(tnp, realsize);
+ }
+ break;
+ case TAR_TYPE_HARDLINK:
+ if (link == NULL) {
+ link = hdrp->linkname;
+ linklen = strnlen(link, sizeof(hdrp->linkname));
+ }
+ error = tarfs_alloc_node(tmp, namep, sep - namep, VREG,
+ 0, 0, 0, 0, 0, 0, 0, NULL, 0, parent, &tnp);
+ if (error != 0) {
+ goto bad;
+ }
+ error = tarfs_lookup_path(tmp, link, linklen, NULL,
+ NULL, NULL, &tnp->other, false);
+ if (tnp->other == NULL ||
+ tnp->other->type != VREG ||
+ tnp->other->other != NULL) {
+ TARFS_DPF(ALLOC, "%s: %.*s: dead hard link to %.*s\n",
+ __func__, (int)namelen, name, (int)linklen, link);
+ error = EINVAL;
+ goto bad;
+ }
+ break;
+ case TAR_TYPE_SYMLINK:
+ if (link == NULL) {
+ link = hdrp->linkname;
+ linklen = strnlen(link, sizeof(hdrp->linkname));
+ }
+ error = tarfs_alloc_node(tmp, namep, sep - namep, VLNK,
+ 0, linklen, mtime, uid, gid, mode, flags, link, 0,
+ parent, &tnp);
+ break;
+ case TAR_TYPE_BLOCK:
+ major = tarfs_str2int64(hdrp->major, sizeof(hdrp->major));
+ minor = tarfs_str2int64(hdrp->minor, sizeof(hdrp->minor));
+ rdev = makedev(major, minor);
+ error = tarfs_alloc_node(tmp, namep, sep - namep, VBLK,
+ 0, 0, mtime, uid, gid, mode, flags, NULL, rdev,
+ parent, &tnp);
+ break;
+ case TAR_TYPE_CHAR:
+ major = tarfs_str2int64(hdrp->major, sizeof(hdrp->major));
+ minor = tarfs_str2int64(hdrp->minor, sizeof(hdrp->minor));
+ rdev = makedev(major, minor);
+ error = tarfs_alloc_node(tmp, namep, sep - namep, VCHR,
+ 0, 0, mtime, uid, gid, mode, flags, NULL, rdev,
+ parent, &tnp);
+ break;
+ default:
+ TARFS_DPF(ALLOC, "%s: unsupported type %c for %.*s\n",
+ __func__, hdrp->typeflag[0], (int)namelen, name);
+ error = EINVAL;
+ break;
+ }
+ if (error != 0)
+ goto bad;
+
+skip:
+ blknum += TARFS_SZ2BLKS(sz);
+ tmp->nblocks = blknum;
+ *blknump = blknum;
+ if (exthdr != NULL) {
+ free(exthdr, M_TEMP);
+ }
+ if (namebuf != NULL) {
+ sbuf_delete(namebuf);
+ }
+ return (0);
+eof:
+ TARFS_DPF(IO, "%s: premature end of file\n", __func__);
+ error = EIO;
+ goto bad;
+bad:
+ if (exthdr != NULL) {
+ free(exthdr, M_TEMP);
+ }
+ if (namebuf != NULL) {
+ sbuf_delete(namebuf);
+ }
+ return (error);
+}
+
+/*
+ * Allocates and populates the metadata structures for the tar file
+ * referenced by vp.  On success, a pointer to the tarfs_mount structure
+ * is stored in tmpp.  Returns 0 on success or a positive errno value on
+ * failure.
+ *
+ * vp must be locked on entry.  It is unlocked on every return path,
+ * successful or not, because the caller's error handling treats vp as
+ * unlocked once this function has been called.
+ */
+static int
+tarfs_alloc_mount(struct mount *mp, struct vnode *vp,
+    uid_t root_uid, gid_t root_gid, mode_t root_mode,
+    struct tarfs_mount **tmpp)
+{
+	struct vattr va;
+	struct thread *td = curthread;
+	struct tarfs_mount *tmp;
+	struct tarfs_node *root;
+	off_t blknum;
+	time_t mtime;
+	int error;
+
+	KASSERT(tmpp != NULL, ("tarfs mount return is NULL"));
+	ASSERT_VOP_LOCKED(vp, __func__);
+
+	tmp = NULL;
+	root = NULL;
+
+	TARFS_DPF(ALLOC, "%s: Allocating tarfs mount structure for vp %p\n",
+	    __func__, vp);
+
+	/*
+	 * Get source metadata.  Drop the vnode lock before looking at the
+	 * result so that the failure path also returns with vp unlocked.
+	 */
+	error = VOP_GETATTR(vp, &va, td->td_ucred);
+	VOP_UNLOCK(vp);
+	if (error != 0) {
+		return (error);
+	}
+	mtime = va.va_mtime.tv_sec;
+
+	/* Allocate and initialize tarfs mount structure */
+	tmp = (struct tarfs_mount *)malloc(sizeof(struct tarfs_mount),
+	    M_TARFSMNT, M_WAITOK | M_ZERO);
+	TARFS_DPF(ALLOC, "%s: Allocated mount structure\n", __func__);
+	mp->mnt_data = tmp;
+
+	mtx_init(&tmp->allnode_lock, "tarfs allnode lock", NULL,
+	    MTX_DEF);
+	TAILQ_INIT(&tmp->allnodes);
+	tmp->ino_unr = new_unrhdr(TARFS_MININO, INT_MAX, &tmp->allnode_lock);
+	tmp->vp = vp;
+	tmp->vfs = mp;
+	tmp->mtime = mtime;
+
+	/*
+	 * XXX The decompression layer passes everything through the
+	 * buffer cache, and the buffer cache wants to know our blocksize,
+	 * but mnt_stat normally isn't populated until after we return, so
+	 * we have to cheat a bit.
+	 */
+	tmp->iosize = 1U << tarfs_ioshift;
+	mp->mnt_stat.f_iosize = tmp->iosize;
+
+	/* Initialize decompression layer */
+	error = tarfs_io_init(tmp);
+	if (error != 0)
+		goto bad;
+
+	/* Create the root directory node; it has no name and no parent */
+	error = tarfs_alloc_node(tmp, NULL, 0, VDIR, 0, 0, mtime, root_uid,
+	    root_gid, root_mode & ALLPERMS, 0, NULL, NODEV, NULL, &root);
+	if (error == 0 && root == NULL) {
+		/*
+		 * Never report success from the failure path.
+		 * NOTE(review): ENOMEM is a placeholder for "no node was
+		 * produced"; confirm whether tarfs_alloc_node() can
+		 * actually return 0 without setting the node.
+		 */
+		error = ENOMEM;
+	}
+	if (error != 0)
+		goto bad;
+	tmp->root = root;
+
+	/* Walk the archive one entry at a time until the end marker */
+	blknum = 0;
+	do {
+		if ((error = tarfs_alloc_one(tmp, &blknum)) != 0) {
+			goto bad;
+		}
+	} while (blknum != TAR_EOF);
+
+	*tmpp = tmp;
+
+	TARFS_DPF(ALLOC, "%s: pfsmnt_root %p\n", __func__, tmp->root);
+	return (0);
+
+bad:
+	if (tmp != NULL)
+		tarfs_free_mount(tmp);
+	return (error);
+}
+
+/*
+ * VFS Operations.
+ */
+
+/*
+ * Mounts the tarball named by the "from" option on mp.  Updates of an
+ * existing mount are not supported.  The root node's uid, gid and mode
+ * default to those of the covered vnode; the "uid", "gid" and "mode"
+ * options override them only when the mounting credential's real uid
+ * is 0.  Returns 0 on success or a positive errno value on failure.
+ */
+static int
+tarfs_mount(struct mount *mp)
+{
+	struct nameidata nd;
+	struct vattr va;
+	struct tarfs_mount *tmp = NULL;
+	struct thread *td = curthread;
+	struct vnode *vp;
+	char *from;
+	uid_t root_uid;
+	gid_t root_gid;
+	mode_t root_mode;
+	int error, flags, len;
+
+	if (mp->mnt_flag & MNT_UPDATE)
+		return (EOPNOTSUPP);
+
+	if (vfs_filteropt(mp->mnt_optnew, tarfs_opts))
+		return (EINVAL);
+
+	/* Attributes of the covered vnode provide the root defaults */
+	vn_lock(mp->mnt_vnodecovered, LK_SHARED | LK_RETRY);
+	error = VOP_GETATTR(mp->mnt_vnodecovered, &va, mp->mnt_cred);
+	VOP_UNLOCK(mp->mnt_vnodecovered);
+	if (error)
+		return (error);
+
+	if (mp->mnt_cred->cr_ruid != 0 ||
+	    vfs_scanopt(mp->mnt_optnew, "gid", "%d", &root_gid) != 1)
+		root_gid = va.va_gid;
+	if (mp->mnt_cred->cr_ruid != 0 ||
+	    vfs_scanopt(mp->mnt_optnew, "uid", "%d", &root_uid) != 1)
+		root_uid = va.va_uid;
+	if (mp->mnt_cred->cr_ruid != 0 ||
+	    vfs_scanopt(mp->mnt_optnew, "mode", "%ho", &root_mode) != 1)
+		root_mode = va.va_mode;
+
+	/*
+	 * The source path must be a non-empty, NUL-terminated string.
+	 * Check the length before indexing so that a zero-length option
+	 * cannot make us read from[-1].
+	 */
+	error = vfs_getopt(mp->mnt_optnew, "from", (void **)&from, &len);
+	if (error != 0 || from == NULL || len < 1 || from[len - 1] != '\0')
+		return (EINVAL);
+
+	/* Find the source tarball */
+	TARFS_DPF(FS, "%s(%s, uid=%u, gid=%u, mode=%o)\n", __func__,
+	    from, root_uid, root_gid, root_mode);
+	flags = FREAD;
+	if (vfs_flagopt(mp->mnt_optnew, "verify", NULL, 0)) {
+		flags |= O_VERIFY;
+	}
+	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF, UIO_SYSSPACE, from);
+	error = namei(&nd);
+	if (error != 0)
+		return (error);
+	NDFREE_PNBUF(&nd);
+	vp = nd.ni_vp;
+	TARFS_DPF(FS, "%s: N: hold %u use %u lock 0x%x\n", __func__,
+	    vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp));
+	/* vp is now held and locked */
+
+	/* Open the source tarball */
+	error = vn_open_vnode(vp, flags, td->td_ucred, td, NULL);
+	if (error != 0) {
+		TARFS_DPF(FS, "%s: failed to open %s: %d\n", __func__,
+		    from, error);
+		vput(vp);
+		goto bad;
+	}
+	TARFS_DPF(FS, "%s: O: hold %u use %u lock 0x%x\n", __func__,
+	    vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp));
+	if (vp->v_type != VREG) {
+		TARFS_DPF(FS, "%s: not a regular file\n", __func__);
+		error = EOPNOTSUPP;
+		goto bad_open_locked;
+	}
+	error = priv_check(td, PRIV_VFS_MOUNT_PERM);
+	if (error != 0) {
+		TARFS_DPF(FS, "%s: not permitted to mount\n", __func__);
+		goto bad_open_locked;
+	}
+	if (flags & O_VERIFY) {
+		mp->mnt_flag |= MNT_VERIFIED;
+	}
+
+	/* Allocate the tarfs mount */
+	error = tarfs_alloc_mount(mp, vp, root_uid, root_gid, root_mode, &tmp);
+	/* vp is now held but unlocked */
+	if (error != 0) {
+		TARFS_DPF(FS, "%s: failed to mount %s: %d\n", __func__,
+		    from, error);
+		goto bad_open_unlocked;
+	}
+	TARFS_DPF(FS, "%s: M: hold %u use %u lock 0x%x\n", __func__,
+	    vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp));
+
+	/* Unconditionally mount as read-only */
+	MNT_ILOCK(mp);
+	mp->mnt_flag |= (MNT_LOCAL | MNT_RDONLY);
+	MNT_IUNLOCK(mp);
+
+	vfs_getnewfsid(mp);
+	vfs_mountedfrom(mp, "tarfs");
+	TARFS_DPF(FS, "%s: success\n", __func__);
+
+	return (0);
+
+bad_open_locked:
+	/* vp must be held and locked */
+	TARFS_DPF(FS, "%s: L: hold %u use %u lock 0x%x\n", __func__,
+	    vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp));
+	VOP_UNLOCK(vp);
+bad_open_unlocked:
+	/* vp must be held and unlocked */
+	TARFS_DPF(FS, "%s: E: hold %u use %u lock 0x%x\n", __func__,
+	    vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp));
+	(void)vn_close(vp, flags, td->td_ucred, td);
+bad:
+	/*
+	 * vp must be released and unlocked.
+	 * NOTE(review): the trace below still reads vp after our
+	 * reference has been dropped; confirm this is acceptable for
+	 * debug builds.
+	 */
+	TARFS_DPF(FS, "%s: X: hold %u use %u lock 0x%x\n", __func__,
+	    vp->v_holdcnt, vp->v_usecount, VOP_ISLOCKED(vp));
+	return (error);
+}
+
+/*
+ * Tears down a tarfs mount: flushes the mount's vnodes, closes the
+ * backing tarball vnode and frees the tarfs_mount structure.  When
+ * MNT_FORCE is set in mntflags, FORCECLOSE is passed to vflush().
+ * Returns 0 on success or the error reported by vflush().
+ */
+static int
+tarfs_unmount(struct mount *mp, int mntflags)
+{
+	struct thread *td = curthread;
+	struct tarfs_mount *tmp;
+	struct vnode *tarvp;
+	int error, flushflags;
+
+	TARFS_DPF(FS, "%s: Unmounting %p\n", __func__, mp);
+
+	/* A forced unmount closes vnodes that are still in use */
+	flushflags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
+
+	/* Finalize all pending I/O */
+	error = vflush(mp, 0, flushflags, td);
+	if (error != 0)
+		return (error);
+
+	tmp = MP_TO_TARFS_MOUNT(mp);
+	tarvp = tmp->vp;
+	MPASS(tarvp != NULL);
+
+	TARFS_DPF(FS, "%s: U: hold %u use %u lock 0x%x\n", __func__,
+	    tarvp->v_holdcnt, tarvp->v_usecount, VOP_ISLOCKED(tarvp));
+	vn_close(tarvp, FREAD, td->td_ucred, td);
+	TARFS_DPF(FS, "%s: C: hold %u use %u lock 0x%x\n", __func__,
+	    tarvp->v_holdcnt, tarvp->v_usecount, VOP_ISLOCKED(tarvp));
+
+	tarfs_free_mount(tmp);
+	return (0);
+}
+
+/*
+ * Looks up the root vnode of a tarfs mount via VFS_VGET() with an
+ * exclusive lock, marks it VV_ROOT and stores it in vpp.  Returns 0 on
+ * success or the positive errno value from VFS_VGET(); vpp is left
+ * untouched on failure.
+ */
+static int
+tarfs_root(struct mount *mp, int flags, struct vnode **vpp)
+{
+	struct vnode *rootvp;
+	int error;
+
+	TARFS_DPF(FS, "%s: Getting root vnode\n", __func__);
+
+	error = VFS_VGET(mp, TARFS_ROOTINO, LK_EXCLUSIVE, &rootvp);
+	if (error == 0) {
+		rootvp->v_vflag |= VV_ROOT;
+		*vpp = rootvp;
+	}
+	return (error);
+}
+
+/*
+ * Reports filesystem statistics for a tarfs mount.  Block and file
+ * totals are taken from the tarfs_mount; the free and available counts
+ * are always reported as zero.  Always returns 0.
+ */
+static int
+tarfs_statfs(struct mount *mp, struct statfs *sbp)
+{
+	struct tarfs_mount *tmp = MP_TO_TARFS_MOUNT(mp);
+
+	/* sizes */
+	sbp->f_bsize = TARFS_BLOCKSIZE;
+	sbp->f_iosize = tmp->iosize;
+
+	/* totals recorded in the mount structure */
+	sbp->f_blocks = tmp->nblocks;
+	sbp->f_files = tmp->nfiles;
+
+	/* nothing is reported as free */
+	sbp->f_bfree = 0;
+	sbp->f_bavail = 0;
+	sbp->f_ffree = 0;
+
+	return (0);
+}
+
+/*
+ * Gets a vnode for the given inode. On success, a pointer to the vnode
+ * is stored in vpp. Returns 0 on success or a positive errno value on
+ * failure.
+ *
+ * The VFS hash is consulted first. On a miss, TARFS_ZIOINO is served
+ * from tmp->znode; any other inode number is looked up by a linear scan
+ * of the mount's node list and a new vnode is created for it. ENOENT is
+ * returned when no node carries the requested inode number.
+ */
+static int
+tarfs_vget(struct mount *mp, ino_t ino, int lkflags, struct vnode **vpp)
+{
+ struct tarfs_mount *tmp;
+ struct tarfs_node *tnp;
+ struct thread *td;
+ struct vnode *vp;
+ int error;
+
+ TARFS_DPF(FS, "%s: mp %p, ino %lu, lkflags %d\n", __func__, mp, ino,
+ lkflags);
+
+ /* Fast path: the vnode may already be in the VFS hash */
+ td = curthread;
+ error = vfs_hash_get(mp, ino, lkflags, td, vpp, NULL, NULL);
+ if (error != 0)
+ return (error);
+
+ if (*vpp != NULL) {
+ TARFS_DPF(FS, "%s: found hashed vnode %p\n", __func__, *vpp);
+ return (error);
+ }
+
+ TARFS_DPF(FS, "%s: no hashed vnode for inode %lu\n", __func__, ino);
+
+ tmp = MP_TO_TARFS_MOUNT(mp);
+
+ /*
+ * Reserved inode number: hand out the mount's znode.
+ * NOTE(review): tmp->znode is used without a NULL check here;
+ * confirm it is always set when this inode can be requested.
+ */
+ if (ino == TARFS_ZIOINO) {
+ error = vget(tmp->znode, lkflags);
+ if (error != 0)
+ return (error);
+ *vpp = tmp->znode;
+ return (0);
+ }
+
+ /* XXX Should use hash instead? */
+ TAILQ_FOREACH(tnp, &tmp->allnodes, entries) {
+ if (tnp->ino == ino)
+ break;
+ }
+ /* tnp is NULL here if the loop ran off the end of the list */
+ TARFS_DPF(FS, "%s: search of all nodes found %p\n", __func__, tnp);
+ if (tnp == NULL)
+ return (ENOENT);
+
+ /* Create a fresh vnode and tie it to the tarfs node in both directions */
+ error = getnewvnode("tarfs", mp, &tarfs_vnodeops, &vp);
+ if (error != 0)
+ goto bad;
+ TARFS_DPF(FS, "%s: allocated vnode\n", __func__);
+ vp->v_data = tnp;
+ vp->v_type = tnp->type;
+ tnp->vnode = vp;
+
+ /* Lock the new vnode as requested before making it visible on the mount */
+ lockmgr(vp->v_vnlock, lkflags, NULL);
+ error = insmntque(vp, mp);
+ if (error != 0)
+ goto bad;
+ TARFS_DPF(FS, "%s: inserting entry into VFS hash\n", __func__);
+ error = vfs_hash_insert(vp, ino, lkflags, td, vpp, NULL, NULL);
+ /*
+ * Either the insert failed, or *vpp was filled in for us --
+ * presumably with a vnode another thread hashed first; confirm
+ * against vfs_hash_insert().
+ */
+ if (error != 0 || *vpp != NULL)
+ return (error);
+
+ vn_set_state(vp, VSTATE_CONSTRUCTED);
+ *vpp = vp;
+ return (0);
+
+bad:
+ *vpp = NULLVP;
+ return (error);
+}
+
+/*
+ * Translates a file handle into a locked vnode.  The handle's inode
+ * number must lie between TARFS_ROOTINO and INT_MAX, and the node it
+ * resolves to must have a non-zero mode, a matching generation number
+ * and a positive link count; otherwise ESTALE is returned.  Errors from
+ * VFS_VGET() are passed through.  On success the vnode is stored in
+ * vpp and 0 is returned.
+ */
+static int
+tarfs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
+{
+	struct tarfs_fid *tfp = (struct tarfs_fid *)fhp;
+	struct tarfs_node *tnp;
+	struct vnode *nvp;
+	int error;
+
+	MP_TO_TARFS_MOUNT(mp);
+
+	/* reject inode numbers outside the valid range */
+	if (!(tfp->ino >= TARFS_ROOTINO && tfp->ino <= INT_MAX))
+		return (ESTALE);
+
+	error = VFS_VGET(mp, tfp->ino, LK_EXCLUSIVE, &nvp);
+	if (error != 0) {
+		*vpp = NULLVP;
+		return (error);
+	}
+
+	tnp = VP_TO_TARFS_NODE(nvp);
+	if (tnp->mode != 0 && tnp->gen == tfp->gen && tnp->nlink > 0) {
+		*vpp = nvp;
+		return (0);
+	}
+
+	/* the handle does not describe this node: drop the vnode */
+	vput(nvp);
+	*vpp = NULLVP;
+	return (ESTALE);
+}
+
+/*
+ * VFS operations vector for tarfs. Only the operations listed here are
+ * provided by this file. The filesystem is registered under the name
+ * "tarfs" with the VFCF_READONLY flag, and the module declares a
+ * dependency on the "xz" module.
+ */
+static struct vfsops tarfs_vfsops = {
+ .vfs_fhtovp = tarfs_fhtovp,
+ .vfs_mount = tarfs_mount,
+ .vfs_root = tarfs_root,
+ .vfs_statfs = tarfs_statfs,
+ .vfs_unmount = tarfs_unmount,
+ .vfs_vget = tarfs_vget,
+};
+VFS_SET(tarfs_vfsops, tarfs, VFCF_READONLY);
+MODULE_VERSION(tarfs, 1);
+MODULE_DEPEND(tarfs, xz, 1, 1, 1);
diff --git a/sys/fs/tarfs/tarfs_vnops.c b/sys/fs/tarfs/tarfs_vnops.c
new file mode 100644
index 000000000000..a40499982229
--- /dev/null
+++ b/sys/fs/tarfs/tarfs_vnops.c
@@ -0,0 +1,642 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022-2023 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_tarfs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/dirent.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+
+#include <fs/tarfs/tarfs.h>
+#include <fs/tarfs/tarfs_dbg.h>
+
+/*
+ * Opens a tarfs vnode.  Only regular files and directories can be
+ * opened; any other vnode type yields EOPNOTSUPP.  A VM object sized to
+ * the node is created for the vnode on success.
+ */
+static int
+tarfs_open(struct vop_open_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct tarfs_node *tnp;
+
+	MPASS(VOP_ISLOCKED(vp));
+	tnp = VP_TO_TARFS_NODE(vp);
+
+	TARFS_DPF(VNODE, "%s(%p=%s, %o)\n", __func__,
+	    tnp, tnp->name, ap->a_mode);
+
+	switch (vp->v_type) {
+	case VREG:
+	case VDIR:
+		vnode_create_vobject(vp, tnp->size, ap->a_td);
+		return (0);
+	default:
+		return (EOPNOTSUPP);
+	}
+}
+
+/*
+ * Closes a tarfs vnode.  There is nothing to release; with TARFS_DEBUG
+ * the call is traced and the vnode lock is asserted.  Always returns 0.
+ */
+static int
+tarfs_close(struct vop_close_args *ap)
+{
+#ifndef TARFS_DEBUG
+	(void)ap;
+#else
+	struct vnode *vp = ap->a_vp;
+	struct tarfs_node *tnp;
+
+	MPASS(VOP_ISLOCKED(vp));
+	tnp = VP_TO_TARFS_NODE(vp);
+
+	TARFS_DPF(VNODE, "%s(%p=%s)\n", __func__,
+	    tnp, tnp->name);
+#endif
+	return (0);
+}
+
+/*
+ * Checks whether cred may access the vnode with the given mode.  Write
+ * access is always refused: EROFS for directories, symlinks and regular
+ * files, EPERM for block, character and FIFO nodes.  Vnode types other
+ * than these yield EINVAL.  Everything else is decided by vaccess()
+ * using the node's mode, uid and gid.
+ */
+static int
+tarfs_access(struct vop_access_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	accmode_t accmode = ap->a_accmode;
+	struct ucred *cred = ap->a_cred;
+	struct tarfs_node *tnp;
+	int wants_write;
+
+	MPASS(VOP_ISLOCKED(vp));
+	tnp = VP_TO_TARFS_NODE(vp);
+
+	TARFS_DPF(VNODE, "%s(%p=%s, %o)\n", __func__,
+	    tnp, tnp->name, accmode);
+
+	wants_write = (accmode & VWRITE) != 0;
+	switch (vp->v_type) {
+	case VBLK:
+	case VCHR:
+	case VFIFO:
+		if (wants_write)
+			return (EPERM);
+		break;
+	case VDIR:
+	case VLNK:
+	case VREG:
+		if (wants_write)
+			return (EROFS);
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	return (vaccess(vp->v_type, tnp->mode, tnp->uid,
+	    tnp->gid, accmode, cred));
+}
+
+/*
+ * Fills in the vattr supplied by the caller from the tarfs node behind
+ * the vnode.  va_rdev is taken from the node only for block and
+ * character devices and is NODEV otherwise.  Always returns 0.
+ */
+static int
+tarfs_getattr(struct vop_getattr_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct vattr *vap = ap->a_vap;
+	struct tarfs_node *tnp = VP_TO_TARFS_NODE(vp);
+
+	TARFS_DPF(VNODE, "%s(%p=%s)\n", __func__,
+	    tnp, tnp->name);
+
+	/* identity */
+	vap->va_type = vp->v_type;
+	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+	vap->va_fileid = tnp->ino;
+	vap->va_gen = tnp->gen;
+	vap->va_filerev = 0;
+
+	/* ownership and permissions */
+	vap->va_mode = tnp->mode;
+	vap->va_uid = tnp->uid;
+	vap->va_gid = tnp->gid;
+	vap->va_flags = tnp->flags;
+	vap->va_nlink = tnp->nlink;
+
+	/* sizes */
+	vap->va_size = tnp->size;
+	vap->va_bytes = round_page(tnp->physize);
+	vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
+
+	/* timestamps */
+	vap->va_atime = tnp->atime;
+	vap->va_mtime = tnp->mtime;
+	vap->va_ctime = tnp->ctime;
+	vap->va_birthtime = tnp->birthtime;
+
+	/* device number is only meaningful for device nodes */
+	if (vp->v_type == VBLK || vp->v_type == VCHR)
+		vap->va_rdev = tnp->rdev;
+	else
+		vap->va_rdev = NODEV;
+
+	return (0);
+}
+
+/*
+ * Cached-lookup handler: resolves the single path component in
+ * ap->a_cnp within directory ap->a_dvp and stores the resulting vnode
+ * in *ap->a_vpp. Handles "..", ".", (with TARFS_DEBUG) the special
+ * TARFS_ZIO_NAME entry in the root directory, and ordinary entries.
+ * Returns 0 on success, ENOENT if the name does not exist, ENOTDIR if a
+ * non-final component is neither a directory nor a symlink, or the
+ * error from VOP_ACCESS() / vn_vget_ino() / vn_lock().
+ */
+static int
+tarfs_lookup(struct vop_cachedlookup_args *ap)
+{
+ struct tarfs_node *dirnode, *parent, *tnp;
+ struct componentname *cnp;
+ struct vnode *dvp, **vpp;
+#ifdef TARFS_DEBUG
+ struct vnode *vp;
+#endif
+ int error;
+
+ dvp = ap->a_dvp;
+ vpp = ap->a_vpp;
+ cnp = ap->a_cnp;
+
+ *vpp = NULLVP;
+ dirnode = VP_TO_TARFS_NODE(dvp);
+ parent = dirnode->parent;
+ tnp = NULL;
+
+ TARFS_DPF(LOOKUP, "%s(%p=%s, %.*s)\n", __func__,
+ dirnode, dirnode->name,
+ (int)cnp->cn_namelen, cnp->cn_nameptr);
+
+ /* The caller needs search permission on the directory */
+ error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, curthread);
+ if (error != 0)
+ return (error);
+
+ if (cnp->cn_flags & ISDOTDOT) {
+ /* Do not allow .. on the root node */
+ if (parent == NULL || parent == dirnode)
+ return (ENOENT);
+
+ /* Allocate a new vnode on the matching entry */
+ error = vn_vget_ino(dvp, parent->ino, cnp->cn_lkflags,
+ vpp);
+ if (error != 0)
+ return (error);
+ } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
+ /* "." is the directory itself: take another reference */
+ VREF(dvp);
+ *vpp = dvp;
+#ifdef TARFS_DEBUG
+ } else if (dirnode == dirnode->tmp->root &&
+ (vp = dirnode->tmp->znode) != NULL &&
+ cnp->cn_namelen == TARFS_ZIO_NAMELEN &&
+ memcmp(cnp->cn_nameptr, TARFS_ZIO_NAME, TARFS_ZIO_NAMELEN) == 0) {
+ /* Debug only: expose the mount's znode by name in the root */
+ error = vn_lock(vp, cnp->cn_lkflags);
+ if (error != 0)
+ return (error);
+ vref(vp);
+ *vpp = vp;
+ /* returns directly, so this result is never entered in the cache */
+ return (0);
+#endif
+ } else {
+ tnp = tarfs_lookup_node(dirnode, NULL, cnp);
+ if (tnp == NULL) {
+ TARFS_DPF(LOOKUP, "%s(%p=%s, %.*s): file not found\n", __func__,
+ dirnode, dirnode->name,
+ (int)cnp->cn_namelen, cnp->cn_nameptr);
+ return (ENOENT);
+ }
+
+ /* An intermediate component must be a directory or a symlink */
+ if ((cnp->cn_flags & ISLASTCN) == 0 &&
+ (tnp->type != VDIR && tnp->type != VLNK))
+ return (ENOTDIR);
+
+ error = vn_vget_ino(dvp, tnp->ino, cnp->cn_lkflags, vpp);
+ if (error != 0)
+ return (error);
+ }
+
+#ifdef TARFS_DEBUG
+ if (tnp == NULL)
+ tnp = VP_TO_TARFS_NODE(*vpp);
+ TARFS_DPF(LOOKUP, "%s: found vnode %p, tarfs_node %p\n", __func__,
+ *vpp, tnp);
+#endif /* TARFS_DEBUG */
+
+ /* Store the result in the cache if MAKEENTRY is specified in flags */
+ if ((cnp->cn_flags & MAKEENTRY) != 0 && cnp->cn_nameiop != CREATE)
+ cache_enter(dvp, *vpp, cnp);
+
+ return (error);
+}
+
+/*
+ * Reads directory entries from a tarfs directory into the caller's uio.
+ *
+ * The uio offset is used as a cookie: TARFS_COOKIE_DOT and
+ * TARFS_COOKIE_DOTDOT select the synthesized "." and ".." entries,
+ * TARFS_COOKIE_EOF means the listing is exhausted, and any other value
+ * is the inode number of the next child to emit.
+ *
+ * Returns ENOTDIR if the vnode is not a directory, EINVAL if the cookie
+ * does not match a child or if not even one entry fits in the buffer,
+ * or the error from uiomove().  If the caller supplied a_eofflag it is
+ * set to indicate whether the end of the directory was reached, and if
+ * a_cookies / a_ncookies were supplied a cookie array is allocated from
+ * M_TEMP with one cookie per entry written.
+ */
+static int
+tarfs_readdir(struct vop_readdir_args *ap)
+{
+	struct dirent cde;
+	struct tarfs_node *current, *tnp;
+	struct vnode *vp;
+	struct uio *uio;
+	int *eofflag;
+	u_long **cookies;
+	int *ncookies;
+	off_t off;
+	u_int idx, ndirents;
+	int error;
+
+	vp = ap->a_vp;
+	uio = ap->a_uio;
+	eofflag = ap->a_eofflag;
+	cookies = ap->a_cookies;
+	ncookies = ap->a_ncookies;
+
+	if (vp->v_type != VDIR)
+		return (ENOTDIR);
+
+	tnp = VP_TO_TARFS_NODE(vp);
+	/* remember the starting cookie; needed to rebuild the cookie list */
+	off = uio->uio_offset;
+	current = NULL;
+	ndirents = 0;
+
+	TARFS_DPF(VNODE, "%s(%p=%s, %zu, %zd)\n", __func__,
+	    tnp, tnp->name, uio->uio_offset, uio->uio_resid);
+
+	if (uio->uio_offset == TARFS_COOKIE_EOF) {
+		TARFS_DPF(VNODE, "%s: EOF\n", __func__);
+		return (0);
+	}
+
+	if (uio->uio_offset == TARFS_COOKIE_DOT) {
+		TARFS_DPF(VNODE, "%s: Generating . entry\n", __func__);
+		/* fake . entry */
+		cde.d_fileno = tnp->ino;
+		cde.d_type = DT_DIR;
+		cde.d_namlen = 1;
+		cde.d_name[0] = '.';
+		cde.d_name[1] = '\0';
+		cde.d_reclen = GENERIC_DIRSIZ(&cde);
+		if (cde.d_reclen > uio->uio_resid)
+			goto full;
+		error = uiomove(&cde, cde.d_reclen, uio);
+		if (error)
+			return (error);
+		/* next is .. */
+		uio->uio_offset = TARFS_COOKIE_DOTDOT;
+		ndirents++;
+	}
+
+	if (uio->uio_offset == TARFS_COOKIE_DOTDOT) {
+		TARFS_DPF(VNODE, "%s: Generating .. entry\n", __func__);
+		/* fake .. entry */
+		MPASS(tnp->parent != NULL);
+		TARFS_NODE_LOCK(tnp->parent);
+		cde.d_fileno = tnp->parent->ino;
+		TARFS_NODE_UNLOCK(tnp->parent);
+		cde.d_type = DT_DIR;
+		cde.d_namlen = 2;
+		cde.d_name[0] = '.';
+		cde.d_name[1] = '.';
+		cde.d_name[2] = '\0';
+		cde.d_reclen = GENERIC_DIRSIZ(&cde);
+		if (cde.d_reclen > uio->uio_resid)
+			goto full;
+		error = uiomove(&cde, cde.d_reclen, uio);
+		if (error)
+			return (error);
+		/* next is first child */
+		current = TAILQ_FIRST(&tnp->dir.dirhead);
+		if (current == NULL)
+			goto done;
+		uio->uio_offset = current->ino;
+		TARFS_DPF(VNODE, "%s: [%u] setting current node to %p=%s\n",
+		    __func__, ndirents, current, current->name);
+		ndirents++;
+	}
+
+	/* resuming previous call */
+	if (current == NULL) {
+		current = tarfs_lookup_dir(tnp, uio->uio_offset);
+		if (current == NULL) {
+			error = EINVAL;
+			goto done;
+		}
+		uio->uio_offset = current->ino;
+		TARFS_DPF(VNODE, "%s: [%u] setting current node to %p=%s\n",
+		    __func__, ndirents, current, current->name);
+	}
+
+	for (;;) {
+		cde.d_fileno = current->ino;
+		switch (current->type) {
+		case VBLK:
+			cde.d_type = DT_BLK;
+			break;
+		case VCHR:
+			cde.d_type = DT_CHR;
+			break;
+		case VDIR:
+			cde.d_type = DT_DIR;
+			break;
+		case VFIFO:
+			cde.d_type = DT_FIFO;
+			break;
+		case VLNK:
+			cde.d_type = DT_LNK;
+			break;
+		case VREG:
+			cde.d_type = DT_REG;
+			break;
+		default:
+			panic("%s: tarfs_node %p, type %d\n", __func__,
+			    current, current->type);
+		}
+		cde.d_namlen = current->namelen;
+		/*
+		 * The name copied below belongs to the entry being emitted,
+		 * so that is the length which must fit in d_name (including
+		 * the terminating NUL), not the directory's own name.
+		 */
+		MPASS(current->namelen < sizeof(cde.d_name));
+		(void)memcpy(cde.d_name, current->name, current->namelen);
+		cde.d_name[current->namelen] = '\0';
+		cde.d_reclen = GENERIC_DIRSIZ(&cde);
+		if (cde.d_reclen > uio->uio_resid)
+			goto full;
+		error = uiomove(&cde, cde.d_reclen, uio);
+		if (error != 0)
+			goto done;
+		ndirents++;
+		/* next sibling */
+		current = TAILQ_NEXT(current, dirents);
+		if (current == NULL)
+			goto done;
+		uio->uio_offset = current->ino;
+		TARFS_DPF(VNODE, "%s: [%u] setting current node to %p=%s\n",
+		    __func__, ndirents, current, current->name);
+	}
+full:
+	/* the pending entry did not fit; fail only if nothing was written */
+	if (cde.d_reclen > uio->uio_resid) {
+		TARFS_DPF(VNODE, "%s: out of space, returning\n",
+		    __func__);
+		error = (ndirents == 0) ? EINVAL : 0;
+	}
+done:
+	TARFS_DPF(VNODE, "%s: %u entries written\n", __func__, ndirents);
+	TARFS_DPF(VNODE, "%s: saving cache information\n", __func__);
+	if (current == NULL) {
+		uio->uio_offset = TARFS_COOKIE_EOF;
+		tnp->dir.lastcookie = 0;
+		tnp->dir.lastnode = NULL;
+	} else {
+		/* remember where to resume */
+		tnp->dir.lastcookie = current->ino;
+		tnp->dir.lastnode = current;
+	}
+
+	if (eofflag != NULL) {
+		TARFS_DPF(VNODE, "%s: Setting EOF flag\n", __func__);
+		*eofflag = (error == 0 && current == NULL);
+	}
+
+	/* Update for NFS */
+	if (error == 0 && cookies != NULL && ncookies != NULL) {
+		TARFS_DPF(VNODE, "%s: Updating NFS cookies\n", __func__);
+		/*
+		 * Replay the walk from the starting cookie, recording for
+		 * each entry written the cookie of the entry after it.
+		 */
+		current = NULL;
+		*cookies = malloc(ndirents * sizeof(off_t), M_TEMP, M_WAITOK);
+		*ncookies = ndirents;
+		for (idx = 0; idx < ndirents; idx++) {
+			if (off == TARFS_COOKIE_DOT)
+				off = TARFS_COOKIE_DOTDOT;
+			else {
+				if (off == TARFS_COOKIE_DOTDOT) {
+					current = TAILQ_FIRST(&tnp->dir.dirhead);
+				} else if (current != NULL) {
+					current = TAILQ_NEXT(current, dirents);
+				} else {
+					current = tarfs_lookup_dir(tnp, off);
+					current = TAILQ_NEXT(current, dirents);
+				}
+				if (current == NULL)
+					off = TARFS_COOKIE_EOF;
+				else
+					off = current->ino;
+			}
+
+			TARFS_DPF(VNODE, "%s: [%u] offset %zu\n", __func__,
+			    idx, off);
+			(*cookies)[idx] = off;
+		}
+		MPASS(uio->uio_offset == off);
+	}
+
+	return (error);
+}
+
+/*
+ * Reads file data into the caller's uio.  Only regular files can be
+ * read: device nodes yield EOPNOTSUPP and every other type EISDIR.  A
+ * negative offset yields EINVAL.  Reading stops at end of file, when
+ * the uio is satisfied, on error, or when a pass makes no progress.
+ */
+static int
+tarfs_read(struct vop_read_args *ap)
+{
+	struct uio *uiop = ap->a_uio;
+	struct vnode *vp = ap->a_vp;
+	struct tarfs_node *tnp;
+	size_t len;
+	off_t before;
+	int error;
+
+	switch (vp->v_type) {
+	case VREG:
+		break;
+	case VCHR:
+	case VBLK:
+		return (EOPNOTSUPP);
+	default:
+		return (EISDIR);
+	}
+
+	if (uiop->uio_offset < 0)
+		return (EINVAL);
+
+	tnp = VP_TO_TARFS_NODE(vp);
+	error = 0;
+
+	TARFS_DPF(VNODE, "%s(%p=%s, %zu, %zd)\n", __func__,
+	    tnp, tnp->name, uiop->uio_offset, uiop->uio_resid);
+
+	for (;;) {
+		before = uiop->uio_resid;
+		if (before <= 0)
+			break;
+		/* at or past end of file */
+		if (tnp->size <= uiop->uio_offset)
+			break;
+		len = MIN(tnp->size - uiop->uio_offset, before);
+		if (len == 0)
+			break;
+
+		error = tarfs_read_file(tnp, len, uiop);
+		/* stop on error or if nothing was transferred */
+		if (error != 0 || before == uiop->uio_resid)
+			break;
+	}
+
+	return (error);
+}
+
+/*
+ * Copies the target of a symbolic link into the caller's uio, limited
+ * to the smaller of the node's size and the space left in the uio.
+ * Returns the result of uiomove().
+ */
+static int
+tarfs_readlink(struct vop_readlink_args *ap)
+{
+	struct uio *uiop = ap->a_uio;
+	struct vnode *vp = ap->a_vp;
+	struct tarfs_node *tnp;
+
+	MPASS(uiop->uio_offset == 0);
+	MPASS(vp->v_type == VLNK);
+
+	tnp = VP_TO_TARFS_NODE(vp);
+
+	TARFS_DPF(VNODE, "%s(%p=%s)\n", __func__,
+	    tnp, tnp->name);
+
+	return (uiomove(tnp->link.name,
+	    MIN(tnp->size, uiop->uio_resid), uiop));
+}
+
+/*
+ * Detaches a vnode from its tarfs node when the vnode is reclaimed:
+ * removes it from the VFS hash, destroys its VM object, purges its name
+ * cache entries and clears the vnode/node back-pointers.  The tarfs
+ * node itself is not freed here.  Always returns 0.
+ */
+static int
+tarfs_reclaim(struct vop_reclaim_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct tarfs_node *tnp = VP_TO_TARFS_NODE(vp);
+
+	vfs_hash_remove(vp);
+	vnode_destroy_vobject(vp);
+	cache_purge(vp);
+
+	/* sever the links in both directions under the node lock */
+	TARFS_NODE_LOCK(tnp);
+	tnp->vnode = NULLVP;
+	vp->v_data = NULL;
+	TARFS_NODE_UNLOCK(tnp);
+
+	return (0);
+}
+
+/*
+ * Prints a short description of the tarfs node behind a vnode: link
+ * count, mode, owner, group and size, plus FIFO details for FIFO
+ * vnodes.  Always returns 0.
+ */
+static int
+tarfs_print(struct vop_print_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct tarfs_node *tnp = VP_TO_TARFS_NODE(vp);
+
+	printf("tag tarfs, tarfs_node %p, links %lu\n",
+	    tnp, tnp->nlink);
+	printf("\tmode 0%o, owner %d, group %d, size %zd\n",
+	    tnp->mode, tnp->uid, tnp->gid,
+	    tnp->size);
+
+	if (vp->v_type == VFIFO)
+		fifo_printinfo(vp);
+
+	printf("\n");
+
+	return (0);
+}
+
+/*
+ * Strategy routine: services a read request described by a struct buf
+ * by reading from the tarfs node into the buffer's data area. Only
+ * BIO_READ is expected (asserted). A request that starts beyond the end
+ * of the file fails with EIO; a request that extends past the end is
+ * clipped to the file length. Errors are reported through the buffer
+ * (BIO_ERROR / b_error); the function itself always returns 0.
+ */
+static int
+tarfs_strategy(struct vop_strategy_args *ap)
+{
+ struct uio auio;
+ struct iovec iov;
+ struct tarfs_node *tnp;
+ struct buf *bp;
+ off_t off;
+ size_t len;
+ int error;
+
+ tnp = VP_TO_TARFS_NODE(ap->a_vp);
+ bp = ap->a_bp;
+ MPASS(bp->b_iocmd == BIO_READ);
+ MPASS(bp->b_iooffset >= 0);
+ MPASS(bp->b_bcount > 0);
+ MPASS(bp->b_bufsize >= bp->b_bcount);
+ TARFS_DPF(VNODE, "%s(%p=%s, %zu, %ld/%ld)\n", __func__, tnp,
+ tnp->name, (size_t)bp->b_iooffset, bp->b_bcount, bp->b_bufsize);
+ iov.iov_base = bp->b_data;
+ iov.iov_len = bp->b_bcount;
+ off = bp->b_iooffset;
+ len = bp->b_bcount;
+ /* start with the full request outstanding; reduced below by what is read */
+ bp->b_resid = len;
+ if (off > tnp->size) {
+ /* XXX read beyond EOF - figure out correct handling */
+ error = EIO;
+ goto out;
+ }
+ if (off + len > tnp->size) {
+ /* clip to file length */
+ len = tnp->size - off;
+ }
+ /* describe the (possibly clipped) transfer as a kernel-space read */
+ auio.uio_iov = &iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = off;
+ auio.uio_resid = len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = curthread;
+ error = tarfs_read_file(tnp, len, &auio);
+ /* len - auio.uio_resid is the number of bytes actually transferred */
+ bp->b_resid -= len - auio.uio_resid;
+out:
+ if (error != 0) {
+ bp->b_ioflags |= BIO_ERROR;
+ bp->b_error = error;
+ }
+ bp->b_flags |= B_DONE;
+ return (0);
+}
+
+/*
+ * Builds a file handle for a vnode from the inode and generation
+ * numbers of its tarfs node.  Always returns 0.
+ */
+static int
+tarfs_vptofh(struct vop_vptofh_args *ap)
+{
+	struct tarfs_node *tnp = VP_TO_TARFS_NODE(ap->a_vp);
+	struct tarfs_fid *tfp = (struct tarfs_fid *)ap->a_fhp;
+
+	tfp->len = sizeof(*tfp);
+	tfp->ino = tnp->ino;
+	tfp->gen = tnp->gen;
+
+	return (0);
+}
+
+/*
+ * Vnode operations vector for tarfs. Operations not listed here fall
+ * through to default_vnodeops. Name lookups go through
+ * vfs_cache_lookup, with tarfs_lookup installed as the cached-lookup
+ * handler.
+ */
+struct vop_vector tarfs_vnodeops = {
+ .vop_default = &default_vnodeops,
+
+ .vop_access = tarfs_access,
+ .vop_cachedlookup = tarfs_lookup,
+ .vop_close = tarfs_close,
+ .vop_getattr = tarfs_getattr,
+ .vop_lookup = vfs_cache_lookup,
+ .vop_open = tarfs_open,
+ .vop_print = tarfs_print,
+ .vop_read = tarfs_read,
+ .vop_readdir = tarfs_readdir,
+ .vop_readlink = tarfs_readlink,
+ .vop_reclaim = tarfs_reclaim,
+ .vop_strategy = tarfs_strategy,
+ .vop_vptofh = tarfs_vptofh,
+};
+VFS_VOP_VECTOR_REGISTER(tarfs_vnodeops);
diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c
index 33b1b506f85f..97f68c812a76 100644
--- a/sys/kern/subr_witness.c
+++ b/sys/kern/subr_witness.c
@@ -740,6 +740,12 @@ static struct witness_blessed blessed_list[] = {
* parent directory vnode is locked.
*/
{ "ufs", "bufwait" },
+
+ /*
+ * The tarfs decompression stream vnode may be locked while a
+ * buffer belonging to a tarfs data vnode is locked.
+ */
+ { "tarfs", "bufwait" },
};
/*
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 8c39c357ec5a..61bbdb2341a1 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -369,6 +369,7 @@ SUBDIR= \
sym \
${_syscons} \
sysvipc \
+ tarfs \
tcp \
${_ti} \
tmpfs \
diff --git a/sys/modules/tarfs/Makefile b/sys/modules/tarfs/Makefile
new file mode 100644
index 000000000000..369f17b3f643
--- /dev/null
+++ b/sys/modules/tarfs/Makefile
@@ -0,0 +1,23 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR:H:H}/fs/tarfs
+
+KMOD= tarfs
+SRCS= opt_tarfs.h \
+ vnode_if.h \
+ tarfs_io.c \
+ tarfs_subr.c \
+ tarfs_vnops.c \
+ tarfs_vfsops.c
+
+.if !defined(KERNBUILDDIR)
+CFLAGS+= -DZSTDIO
+.ifdef TARFS_DEBUG
+CFLAGS+= -DTARFS_DEBUG
+.endif
+.endif
+
+SRCS+= opt_zstdio.h
+CFLAGS+= -I${SRCTOP}/sys/contrib/zstd/lib/freebsd
+
+.include <bsd.kmod.mk>
diff --git a/tests/sys/fs/Makefile b/tests/sys/fs/Makefile
index 6769f2182e79..88822c640d8a 100644
--- a/tests/sys/fs/Makefile
+++ b/tests/sys/fs/Makefile
@@ -14,6 +14,7 @@ TESTSRC= ${SRCTOP}/contrib/netbsd-tests/fs
.if ${COMPILER_FEATURES:Mc++14} && ${MK_GOOGLETEST} != "no"
TESTS_SUBDIRS+= fusefs
.endif
+TESTS_SUBDIRS+= tarfs
TESTS_SUBDIRS+= tmpfs
${PACKAGE}FILES+= h_funcs.subr
diff --git a/tests/sys/fs/tarfs/Makefile b/tests/sys/fs/tarfs/Makefile
new file mode 100644
index 000000000000..b16c6544d33f
--- /dev/null
+++ b/tests/sys/fs/tarfs/Makefile
@@ -0,0 +1,10 @@
+PACKAGE= tests
+
+# BINDIR is set to TESTSDIR so that mktar ends up in the same directory
+# as the test script, which locates it relative to its own path.
+TESTSDIR= ${TESTSBASE}/sys/fs/tarfs
+BINDIR= ${TESTSDIR}
+
+# Helper program that generates the tarball used by the test.
+PROGS+= mktar
+
+# The ATF shell test itself (tarfs_test.sh).
+ATF_TESTS_SH+= tarfs_test
+
+.include <bsd.test.mk>
diff --git a/tests/sys/fs/tarfs/mktar.c b/tests/sys/fs/tarfs/mktar.c
new file mode 100644
index 000000000000..e1b1183af114
--- /dev/null
+++ b/tests/sys/fs/tarfs/mktar.c
@@ -0,0 +1,238 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+#include <err.h>
+#include <fcntl.h>
+#include <paths.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#define PROGNAME "mktar"
+
+#define SUBDIRNAME "directory"
+#define SPARSEFILENAME "sparse_file"
+#define HARDLINKNAME "hard_link"
+#define SHORTLINKNAME "short_link"
+#define LONGLINKNAME "long_link"
+
+static bool opt_v;
+
+/*
+ * Print a printf-style diagnostic to stderr, prefixed with the program
+ * name and followed by a newline.  Does nothing unless opt_v is set.
+ */
+static void
+verbose(const char *fmt, ...)
+{
+	va_list args;
+
+	if (opt_v) {
+		fprintf(stderr, "%s: ", PROGNAME);
+		va_start(args, fmt);
+		vfprintf(stderr, fmt, args);
+		va_end(args);
+		fprintf(stderr, "\n");
+	}
+}
+
+/*
+ * Create (or truncate) `filename' with permissions `mode' and write 94
+ * short runs of data into it, leaving unwritten gaps in between: for
+ * every character value c from 33 to 126, 511 bytes of value c are
+ * written at offset (c - 32) * 1048576.
+ *
+ * Exits via err() / errx() if any step fails, including close(), since
+ * some write errors are only reported when the descriptor is closed.
+ *
+ * NOTE(review): the test script compares the resulting file against a
+ * fixed SHA256 checksum, so the sizes and offsets used here presumably
+ * must not change without updating that checksum.
+ */
+static void
+mksparsefile(const char *filename, mode_t mode)
+{
+	char buf[511];
+	ssize_t res;
+	int fd;
+
+	if ((fd = open(filename, O_RDWR|O_CREAT|O_TRUNC, mode)) < 0)
+		err(1, "%s", filename);
+	for (unsigned int i = 33; i <= 126; i++) {
+		memset(buf, i, sizeof(buf));
+		if (lseek(fd, 1048576LU * (i - 32), SEEK_SET) < 0)
+			err(1, "%s", filename);
+		res = write(fd, buf, sizeof(buf));
+		if (res < 0)
+			err(1, "%s", filename);
+		/* res is known to be non-negative here */
+		if ((size_t)res != sizeof(buf))
+			errx(1, "%s: short write", filename);
+	}
+	if (close(fd) != 0)
+		err(1, "%s", filename);
+}
+
+/*
+ * Build a deliberately long path which still resolves to `filename':
+ * the sequence "<dirname>/../" repeated 32 times, followed by
+ * `filename'.  The result is allocated by asprintf() and must be
+ * released by the caller with free().  Exits via err() if allocation
+ * fails.
+ */
+static char *
+mklonglinktarget(const char *dirname, const char *filename)
+{
+	char *segment, *result;
+	int len;
+
+	/* one segment goes down into the directory and back up four times */
+	len = asprintf(&segment, "%1$s/../%1$s/../%1$s/../%1$s/../", dirname);
+	if (len < 0)
+		err(1, "asprintf()");
+
+	/* eight segments, then the name of the file itself */
+	len = asprintf(&result, "%1$s%1$s%1$s%1$s%1$s%1$s%1$s%1$s%2$s",
+	    segment, filename);
+	if (len < 0)
+		err(1, "asprintf()");
+
+	free(segment);
+	return (result);
+}
+
+/*
+ * Populate the current working directory with the files that go into
+ * the test tarball: a subdirectory, a sparse file, a hard link to the
+ * sparse file, and two symbolic links to it (one with a short target
+ * and one with a long target).
+ *
+ * Exits via err() if any object cannot be created.  Setting the nodump
+ * flag on the sparse file is best-effort: a failure is only reported in
+ * verbose mode and does not abort the run.
+ */
+static void
+mktar(void)
+{
+	char *linktarget;
+
+	/* create a subdirectory */
+	verbose("mkdir %s", SUBDIRNAME);
+	if (mkdir(SUBDIRNAME, 0755) != 0)
+		err(1, "%s", SUBDIRNAME);
+
+	/* create a sparse file */
+	verbose("creating %s", SPARSEFILENAME);
+	mksparsefile(SPARSEFILENAME, 0644);
+
+	/* try to set a file flag on it, but carry on if that fails */
+	verbose("chflags nodump %s", SPARSEFILENAME);
+	if (chflags(SPARSEFILENAME, UF_NODUMP) != 0)
+		verbose("chflags %s failed (ignored)", SPARSEFILENAME);
+
+	/* create a hard link */
+	verbose("link %s %s", SPARSEFILENAME, HARDLINKNAME);
+	if (link(SPARSEFILENAME, HARDLINKNAME) != 0)
+		err(1, "%s", HARDLINKNAME);
+
+	/* create a symbolic link with a short target */
+	verbose("symlink %s %s", SPARSEFILENAME, SHORTLINKNAME);
+	if (symlink(SPARSEFILENAME, SHORTLINKNAME) != 0)
+		err(1, "%s", SHORTLINKNAME);
+
+	/* create a symbolic link with a long target */
+	linktarget = mklonglinktarget(SUBDIRNAME, SPARSEFILENAME);
+	verbose("symlink %s %s", linktarget, LONGLINKNAME);
+	if (symlink(linktarget, LONGLINKNAME) != 0)
+		err(1, "%s", LONGLINKNAME);
+	free(linktarget);
+}
+
+/*
+ * Print a one-line usage summary to stderr and terminate the program
+ * with EXIT_FAILURE.  Never returns.
+ */
+static void
+usage(void)
+{
+	fprintf(stderr, "usage: %s [-v] tarfile\n", PROGNAME);
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Wait for the child process `pid' to terminate.  Exits via err() if
+ * waitpid() fails, and via errx() unless the child exited normally with
+ * a status of zero.
+ */
+static void
+waitchild(pid_t pid)
+{
+	int wstatus;
+
+	if (waitpid(pid, &wstatus, 0) < 0)
+		err(1, "waitpid()");
+	if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0)
+		errx(1, "child failed");
+}
+
+/*
+ * usage: mktar [-v] tarfile
+ *
+ * Creates a temporary directory, fills it with test files, archives it
+ * into the zstd-compressed tarball `tarfile' using tar(1), and finally
+ * removes the temporary files and directory again.
+ *
+ * Each phase runs in its own child process.  The children that create
+ * and delete the files change into the temporary directory, while this
+ * process never changes its working directory; `tarfile' is passed to
+ * tar unmodified.  If any child fails, the program exits with status 1
+ * without running the remaining phases.
+ */
+int
+main(int argc, char *argv[])
+{
+	const char *tarfilename;
+	char *dirname;
+	int opt;
+	pid_t pid;
+
+	while ((opt = getopt(argc, argv, "v")) != -1)
+		switch (opt) {
+		case 'v':
+			opt_v = true;
+			break;
+		default:
+			usage();
+		}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc != 1)
+		usage();
+	tarfilename = *argv;
+
+	if (asprintf(&dirname, "%s%s.XXXXXXXX", _PATH_TMP, PROGNAME) < 0)
+		err(1, "asprintf()");
+	if (mkdtemp(dirname) == NULL)
+		err(1, "%s", dirname);
+	verbose("mkdir %s", dirname);
+
+	/* fork a child to create the files */
+	if ((pid = fork()) < 0)
+		err(1, "fork()");
+	if (pid == 0) {
+		verbose("cd %s", dirname);
+		if (chdir(dirname) != 0)
+			err(1, "%s", dirname);
+		verbose("umask 022");
+		umask(022);
+		mktar();
+		verbose("cd -");
+		exit(0);
+	}
+	waitchild(pid);
+
+	/* fork a child to create the tarball */
+	if ((pid = fork()) < 0)
+		err(1, "fork()");
+	if (pid == 0) {
+		verbose("creating tarball");
+		/*
+		 * The terminator of a variadic argument list must have
+		 * pointer type, hence the explicit cast.
+		 */
+		execlp("tar", "tar",
+		    "-c",
+		    "-f", tarfilename,
+		    "-C", dirname,
+		    "--zstd",
+#if 0
+		    "--options", "zstd:frame-per-file",
+#endif
+		    ".",
+		    (char *)NULL);
+		/* only reached if execlp() failed */
+		err(1, "execlp()");
+	}
+	waitchild(pid);
+
+	/* fork a child to delete everything */
+	if ((pid = fork()) < 0)
+		err(1, "fork()");
+	if (pid == 0) {
+		verbose("cd %s", dirname);
+		if (chdir(dirname) != 0)
+			err(1, "%s", dirname);
+		verbose("rm %s", LONGLINKNAME);
+		(void)unlink(LONGLINKNAME);
+		verbose("rm %s", SHORTLINKNAME);
+		(void)unlink(SHORTLINKNAME);
+		verbose("rm %s", HARDLINKNAME);
+		(void)unlink(HARDLINKNAME);
+		verbose("rm %s", SPARSEFILENAME);
+		(void)unlink(SPARSEFILENAME);
+		verbose("rm %s", SUBDIRNAME);
+		(void)rmdir(SUBDIRNAME);
+		verbose("cd -");
+		exit(0);
+	}
+	waitchild(pid);
+	verbose("rmdir %s", dirname);
+	(void)rmdir(dirname);
+
+	exit(0);
+}
diff --git a/tests/sys/fs/tarfs/tarfs_test.sh b/tests/sys/fs/tarfs/tarfs_test.sh
new file mode 100644
index 000000000000..d812ced80bbb
--- /dev/null
+++ b/tests/sys/fs/tarfs/tarfs_test.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+#-
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# Copyright (c) 2023 Klara, Inc.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+
+mktar="$(dirname $(realpath "$0"))"/mktar
+mnt="$(realpath ${TMPDIR:-/tmp})/mnt.$$"
+
+# expected SHA256 checksum of file contained in test tarball
+sum=4da2143234486307bb44eaa610375301781a577d1172f362b88bb4b1643dee62
+
+atf_test_case tarfs_test
+tarfs_test_head() {
+ atf_set "require.user" "root"
+}
+# Create a test tarball with mktar, mount it read-only using tarfs, and
+# verify that
+#  - the hard link and both symbolic links resolve to the same device
+#    and inode as the sparse file, and
+#  - the contents of the sparse file match the expected checksum.
+tarfs_test_body() {
+	# fail right away, with a meaningful message, if the set-up steps
+	# do not succeed
+	mkdir "${mnt}" ||
+	    atf_fail "failed to create mount point ${mnt}"
+	"${mktar}" tarfs_test.tar.zst ||
+	    atf_fail "failed to create test tarball"
+	atf_check mount -rt tarfs tarfs_test.tar.zst "${mnt}"
+	atf_check_equal "$(stat -f%d,%i "${mnt}"/sparse_file)" "$(stat -f%d,%i "${mnt}"/hard_link)"
+	atf_check_equal "$(stat -f%d,%i "${mnt}"/sparse_file)" "$(stat -L -f%d,%i "${mnt}"/short_link)"
+	atf_check_equal "$(stat -f%d,%i "${mnt}"/sparse_file)" "$(stat -L -f%d,%i "${mnt}"/long_link)"
+	atf_check_equal "$(sha256 -q "${mnt}"/sparse_file)" "${sum}"
+}
+# Unmount the test file system.  The body may have failed before the
+# file system was mounted, in which case umount fails; that must not
+# make the cleanup routine itself fail.
+tarfs_test_cleanup() {
+	umount "${mnt}" || true
+}
+
+atf_init_test_cases() {
+ atf_add_test_case tarfs_test
+}