aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKirk McKusick <mckusick@FreeBSD.org>2022-03-01 00:36:08 +0000
committerKirk McKusick <mckusick@FreeBSD.org>2022-03-01 00:36:08 +0000
commitc7996ddf8000cfb19a9e91a636f56747860d03d7 (patch)
treecc60ad9cc603a0af013c9fb24e5fc9b4f1562c73
parent2062ce996ddd39ba7a36c2caf8c898686d9cb2fe (diff)
downloadsrc-c7996ddf8000cfb19a9e91a636f56747860d03d7.tar.gz
src-c7996ddf8000cfb19a9e91a636f56747860d03d7.zip
Create a new GEOM utility, gunion(8).
The gunion(8) utility is used to track changes to a read-only disk on a writable disk. Logically, a writable disk is placed over a read-only disk. Write requests are intercepted and stored on the writable disk. Read requests are first checked to see if they have been written on the top (writable disk) and if found are returned. If they have not been written on the top disk, then they are read from the lower disk. The gunion(8) utility can be especially useful if you have a large disk with a corrupted filesystem that you are unsure of how to repair. You can use gunion(8) to place another disk over the corrupted disk and then attempt to repair the filesystem. If the repair fails, you can revert all the changes in the upper disk and be back to the unchanged state of the lower disk thus allowing you to try another approach to repairing it. If the repair is successful you can commit all the writes recorded on the top disk to the lower disk. Another use of the gunion(8) utility is to try out upgrades to your system. Place the upper disk over the disk holding your filesystem that is to be upgraded and then run the upgrade on it. If it works, commit it; if it fails, revert the upgrade. Further details can be found in the gunion(8) manual page. Reviewed by: Chuck Silvers, kib (earlier version) tested by: Peter Holm Differential Revision: https://reviews.freebsd.org/D32697
-rw-r--r--etc/mtree/BSD.include.dist2
-rw-r--r--include/Makefile2
-rw-r--r--lib/geom/Makefile.classes1
-rw-r--r--lib/geom/union/Makefile8
-rw-r--r--lib/geom/union/Makefile.depend19
-rw-r--r--lib/geom/union/geom_union.c83
-rw-r--r--lib/geom/union/gunion.8320
-rw-r--r--sbin/geom/core/geom.85
-rw-r--r--sys/conf/files1
-rw-r--r--sys/geom/union/g_union.c1389
-rw-r--r--sys/geom/union/g_union.h144
-rw-r--r--sys/modules/geom/Makefile1
-rw-r--r--sys/modules/geom/geom_union/Makefile8
13 files changed, 1981 insertions, 2 deletions
diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist
index 833618b190fd..7679fd528918 100644
--- a/etc/mtree/BSD.include.dist
+++ b/etc/mtree/BSD.include.dist
@@ -182,6 +182,8 @@
..
stripe
..
+ union
+ ..
virstor
..
..
diff --git a/include/Makefile b/include/Makefile
index 76f713daf62c..42590b900442 100644
--- a/include/Makefile
+++ b/include/Makefile
@@ -51,7 +51,7 @@ LSUBDIRS= dev/acpica dev/agp dev/ciss dev/filemon dev/firewire \
fs/procfs fs/smbfs fs/udf fs/unionfs \
geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \
geom/mirror geom/mountver geom/multipath geom/nop \
- geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \
+ geom/raid geom/raid3 geom/shsec geom/stripe geom/union geom/virstor \
net/altq \
net/route \
netgraph/atm netgraph/netflow \
diff --git a/lib/geom/Makefile.classes b/lib/geom/Makefile.classes
index fcaa748825ee..d4e6f52e65ae 100644
--- a/lib/geom/Makefile.classes
+++ b/lib/geom/Makefile.classes
@@ -22,4 +22,5 @@ GEOM_CLASSES+= raid
GEOM_CLASSES+= raid3
GEOM_CLASSES+= shsec
GEOM_CLASSES+= stripe
+GEOM_CLASSES+= union
GEOM_CLASSES+= virstor
diff --git a/lib/geom/union/Makefile b/lib/geom/union/Makefile
new file mode 100644
index 000000000000..cb8b09dc7eca
--- /dev/null
+++ b/lib/geom/union/Makefile
@@ -0,0 +1,8 @@
+# $FreeBSD$
+
+PACKAGE=runtime
+.PATH: ${.CURDIR:H:H}/misc
+
+GEOM_CLASS= union
+
+.include <bsd.lib.mk>
diff --git a/lib/geom/union/Makefile.depend b/lib/geom/union/Makefile.depend
new file mode 100644
index 000000000000..fb5f86e931fb
--- /dev/null
+++ b/lib/geom/union/Makefile.depend
@@ -0,0 +1,19 @@
+# $FreeBSD$
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+ gnu/lib/csu \
+ include \
+ include/xlocale \
+ lib/${CSU_DIR} \
+ lib/libc \
+ lib/libcompiler_rt \
+ lib/libgeom \
+ sbin/geom/core \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif
diff --git a/lib/geom/union/geom_union.c b/lib/geom/union/geom_union.c
new file mode 100644
index 000000000000..2e0843d35899
--- /dev/null
+++ b/lib/geom/union/geom_union.c
@@ -0,0 +1,83 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Marshall Kirk McKusick <mckusick@mckusick.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <libgeom.h>
+#include <geom/union/g_union.h>
+
+#include "core/geom.h"
+
+/* Interface version of the geom(8) class-library framework (G_LIB_VERSION). */
+uint32_t lib_version = G_LIB_VERSION;
+/*
+ * Version of the union class (G_UNION_VERSION).
+ * NOTE(review): presumably this is what reaches the kernel as the "version"
+ * request parameter that g_union_config() compares against its own
+ * G_UNION_VERSION -- confirm against the geom(8) core.
+ */
+uint32_t version = G_UNION_VERSION;
+
+/*
+ * Table of the gunion(8) sub-commands.  Each entry gives the verb, its
+ * flags, an optional handler (NULL for every verb here), the accepted
+ * options as { short letter, long name, default, type } tuples ending in
+ * G_OPT_SENTINEL, and the usage string.  The table ends in G_CMD_SENTINEL.
+ */
+struct g_command class_commands[] = {
+ /*
+ * create: the only verb flagged G_FLAG_LOADKLD; gunion(8) documents
+ * that geom_union.ko is loaded if it is not loaded already.
+ */
+ { "create", G_FLAG_LOADKLD, NULL,
+ {
+ { 'o', "offset", "0", G_TYPE_NUMBER },
+ { 's', "size", "0", G_TYPE_NUMBER },
+ { 'S', "secsize", "0", G_TYPE_NUMBER },
+ { 'v', "verbose", NULL, G_TYPE_BOOL },
+ { 'Z', "gunionname", G_VAL_OPTIONAL, G_TYPE_STRING },
+ G_OPT_SENTINEL
+ },
+ "[-v] [-o offset] [-s size] [-S secsize] [-Z gunionname] "
+ "upperdev lowerdev"
+ },
+ /* destroy: tear down the named union providers; -f forces removal. */
+ { "destroy", 0, NULL,
+ {
+ { 'f', "force", NULL, G_TYPE_BOOL },
+ { 'v', "verbose", NULL, G_TYPE_BOOL },
+ G_OPT_SENTINEL
+ },
+ "[-fv] prov ..."
+ },
+ /* reset: zero the statistics of the named union providers. */
+ { "reset", 0, NULL,
+ {
+ { 'v', "verbose", NULL, G_TYPE_BOOL },
+ G_OPT_SENTINEL
+ },
+ "[-v] prov ..."
+ },
+ /* commit: write the upper-layer changes down to the lower device. */
+ { "commit", 0, NULL,
+ {
+ { 'f', "force", NULL, G_TYPE_BOOL },
+ { 'r', "reboot", NULL, G_TYPE_BOOL },
+ { 'v', "verbose", NULL, G_TYPE_BOOL },
+ G_OPT_SENTINEL
+ },
+ "[-frv] prov ..."
+ },
+ /* revert: discard the changes recorded in the upper layer. */
+ { "revert", 0, NULL,
+ {
+ { 'v', "verbose", NULL, G_TYPE_BOOL },
+ G_OPT_SENTINEL
+ },
+ "[-v] prov ..."
+ },
+ G_CMD_SENTINEL
+};
diff --git a/lib/geom/union/gunion.8 b/lib/geom/union/gunion.8
new file mode 100644
index 000000000000..732a803657d8
--- /dev/null
+++ b/lib/geom/union/gunion.8
@@ -0,0 +1,320 @@
+.\"
+.\" Copyright (c) 2022 Marshall Kirk McKusick <mckusick@mckusick.com>
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd January 19, 2022
+.Dt GUNION 8
+.Os
+.Sh NAME
+.Nm gunion
+.Nd "control utility for UNION GEOM class"
+.Sh SYNOPSIS
+.Nm
+.Cm create
+.Op Fl v
+.Op Fl o Ar offset
+.Op Fl s Ar size
+.Op Fl S Ar secsize
+.Op Fl Z Ar gunionname
+.Ar upperdev lowerdev
+.Nm
+.Cm destroy
+.Op Fl fv
+.Ar prov ...
+.Nm
+.Cm reset
+.Op Fl v
+.Ar prov ...
+.Nm
+.Cm revert
+.Op Fl v
+.Ar prov ...
+.Nm
+.Cm commit
+.Op Fl frv
+.Ar prov ...
+.Nm
+.Cm list
+.Nm
+.Cm status
+.Nm
+.Cm load
+.Nm
+.Cm unload
+.Sh DESCRIPTION
+The
+.Nm
+utility is used to track changes to a read-only disk on a writable disk.
+Logically, a writable disk is placed over a read-only disk.
+Write requests are intercepted and stored on the writable disk.
+Read requests are first checked to see if they have been written
+on the top (writable disk) and if found are returned.
+If they have not been written on the top disk,
+then they are read from the lower disk.
+.Pp
+The
+.Nm
+utility can be especially useful if you have a large disk with a
+corrupted filesystem that you are unsure of how to repair.
+You can use
+.Nm
+to place another disk over the corrupted disk and then attempt
+to repair the filesystem.
+If the repair fails, you can revert all the changes in the upper disk
+and be back to the unchanged state of the lower disk thus allowing you
+to try another approach to repairing it.
+If the repair is successful you can request that all the writes recorded
+on the top disk be written to the lower disk.
+.Pp
+Another use of the
+.Nm
+utility is to try out upgrades to your system.
+Place the upper disk over the disk holding your filesystem that
+is to be upgraded and then run the upgrade on it.
+If it works, commit it;
+if it fails, revert the upgrade.
+An example is given below.
+.Pp
+The upper disk must be at least the size of the disk that it covers.
+The union metadata exists only for the
+period of time that the union is instantiated,
+so it is important to commit the updates before destroying the union.
+If the top disk is about 2.5 percent larger for 512 byte sector disks
+(or 0.5 percent larger for 4K sector disks) than the disk that it covers,
+it is possible (though not currently implemented) to save the union
+metadata between instantiations of the union device.
+.Pp
+If you do not have physical media available to use for the upper layer, the
+.Xr md 4
+disk can be used instead.
+When used in
+.Cm swap
+mode the changes are all held in buffer memory.
+Pages get pushed out to the swap when the system is under memory pressure,
+otherwise they stay in the operating memory.
+If long-term persistence is desired,
+.Cm vnode
+mode can be used in which a regular file is used as backing store.
+The disk space used by the file is based on the amount of data that
+is written to the top device.
+.Pp
+The first argument to
+.Nm
+indicates an action to be performed:
+.Bl -tag -width "destroy"
+.It Cm create
+Set up a union provider on the two given devices.
+The first device given is used as the top device and must be writable.
+The second device given is used as the bottom device and need only be readable.
+The second device may be mounted read-only but it is recommended
+that it be unmounted and accessed only through a mount of the union device.
+If the operation succeeds, the new provider should appear with name
+.Pa /dev/ Ns Ao Ar upperdev Ac Ns - Ns Ao Ar lowerdev Ac Ns Pa .union .
+An alternate name can be specified with the
+.Fl Z
+flag.
+The kernel module
+.Pa geom_union.ko
+will be loaded if it is not loaded already.
+.Pp
+Additional options include:
+.Bl -tag -width "-Z gunionname"
+.It Fl o Ar offset
+Where to begin on the original provider.
+The default is to start at the beginning of the disk (i.e., at offset 0).
+This option may be used to skip over partitioning information stored
+at the beginning of a disk.
+The offset must be a multiple of the sector size.
+.It Fl s Ar size
+Size of the transparent provider.
+The default is to be the same size as the lower disk.
+Any extra space at the end of the upper disk may be used to store
+union metadata.
+.It Fl S Ar secsize
+Sector size of the transparent provider.
+The default is to be the same sector size as the lower disk.
+.It Fl v
+Be more verbose.
+.It Fl Z Ar gunionname
+The name of the new provider.
+The suffix
+.Dq .union
+will be appended to the provider name.
+.El
+.It Cm destroy
+Turn off the given union providers.
+.Pp
+Additional options include:
+.Bl -tag -width "-f"
+.It Fl f
+Force the removal of the specified provider.
+.It Fl v
+Be more verbose.
+.El
+.It Cm revert
+Discard all the changes made in the top layer thus reverting to the
+original state of the lower device.
+The union device may not be mounted or otherwise in use when a
+.Cm revert
+operation is being done.
+.It Cm commit
+Write all the changes made in the top device to the lower device
+thus committing the lower device to have the same data as the union.
+.Pp
+Additional options include:
+.Bl -tag -width "-f"
+.It Fl f
+The
+.Cm commit
+command will not allow the lower device to be mounted
+or otherwise in use while the
+.Cm commit
+operation is being done.
+However, the
+.Fl f
+flag may be specified to allow the lower device to be mounted read-only.
+To prevent a filesystem panic on the mounted lower-device filesystem,
+immediately after the
+.Cm commit
+operation finishes the lower-device filesystem should be unmounted
+and then remounted to update its metadata state.
+If the lower-device filesystem is currently being used as the root
+filesystem then the
+.Fl r
+flag should be specified to reboot the system at the completion of the
+.Cm commit
+operation.
+.It Fl r
+Reboot the system at the completion of the
+.Cm commit
+operation.
+.It Fl v
+Be more verbose.
+.El
+.It Cm reset
+Reset statistics for the given union providers.
+.It Cm list
+See
+.Xr geom 8 .
+.It Cm status
+See
+.Xr geom 8 .
+.It Cm load
+See
+.Xr geom 8 .
+.It Cm unload
+See
+.Xr geom 8 .
+.El
+.Sh EXIT STATUS
+Exit status is 0 on success, and 1 if the command fails.
+.Sh EXAMPLES
+The following example shows how to create and destroy a
+union provider with disks
+.Pa /dev/da0p1
+as the read-only disk on the bottom and
+.Pa /dev/md0
+as the writable disk on the top.
+.Bd -literal -offset indent
+gunion create -v md0 da0p1
+mount /dev/md0-da0p1.union /mnt
+.Ed
+.Pp
+Proceed to make changes in /mnt filesystem.
+If they are successful and you want to keep them.
+.Bd -literal -offset indent
+umount /mnt
+gunion commit -v md0-da0p1.union
+.Ed
+.Pp
+If they are unsuccessful and you want to roll back.
+.Bd -literal -offset indent
+umount /mnt
+gunion revert -v md0-da0p1.union
+.Ed
+.Pp
+When done eliminate the union.
+.Bd -literal -offset indent
+umount /mnt
+gunion destroy -v md0-da0p1.union
+.Ed
+.Pp
+All uncommitted changes will be discarded when the union is destroyed.
+.Pp
+If you use the name of the full disk, for example
+.Pa da0
+and it is labelled,
+then a union name will appear for the disk as
+.Pa md0-da0.union
+as well as for each partition on the disk as
+.Pa md0-da0p1.union ,
+.Pa md0-da0p2.union ,
+etc.
+A commit operation can be done only on
+.Pa md0-da0.union
+and will commit changes to all the partitions.
+If partition level commits are desired,
+then a union must be created for each partition.
+.Pp
+The traffic statistics for the given
+union providers can be obtained with the
+.Cm list
+command.
+The example below shows the number of bytes written with
+.Xr newfs 8 :
+.Bd -literal -offset indent
+gunion create md0 da0p1
+newfs /dev/md0-da0p1.union
+gunion list
+.Ed
+.Sh SYSCTL VARIABLES
+The following
+.Xr sysctl 8
+variables can be used to control the behavior of the
+.Nm UNION
+GEOM class.
+The default value is shown next to each variable.
+.Bl -tag -width indent
+.It Va kern.geom.union.debug : No 0
+Debug level of the
+.Nm UNION
+GEOM class.
+This can be set to a number between 0 and 4 inclusive.
+If set to 0, no debug information is printed.
+If set to 1, all the verbose messages are logged.
+If set to 2, additional error-related information is logged.
+If set to 3, mapping operations are logged.
+If set to 4, the maximum amount of debug information is printed.
+.El
+.Sh SEE ALSO
+.Xr geom 4 ,
+.Xr geom 8
+.Sh HISTORY
+The
+.Nm
+utility appeared in
+.Fx 14.0 .
+.Sh AUTHORS
+.An Marshall Kirk McKusick Aq Mt mckusick@mckusick.com
diff --git a/sbin/geom/core/geom.8 b/sbin/geom/core/geom.8
index 298fc2b1d4fd..db0556fb9505 100644
--- a/sbin/geom/core/geom.8
+++ b/sbin/geom/core/geom.8
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd September 14, 2018
+.Dd January 19, 2022
.Dt GEOM 8
.Os
.Sh NAME
@@ -162,6 +162,8 @@ SHSEC
.It
STRIPE
.It
+UNION
+.It
VIRSTOR
.El
.Sh ENVIRONMENT
@@ -210,6 +212,7 @@ geom md unload
.Xr gsched 8 ,
.Xr gshsec 8 ,
.Xr gstripe 8 ,
+.Xr gunion 8 ,
.Xr gvirstor 8
.Sh HISTORY
The
diff --git a/sys/conf/files b/sys/conf/files
index 663441d3adfb..74f15f867213 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3689,6 +3689,7 @@ geom/raid3/g_raid3.c optional geom_raid3
geom/raid3/g_raid3_ctl.c optional geom_raid3
geom/shsec/g_shsec.c optional geom_shsec
geom/stripe/g_stripe.c optional geom_stripe
+geom/union/g_union.c optional geom_union
geom/uzip/g_uzip.c optional geom_uzip
geom/uzip/g_uzip_lzma.c optional geom_uzip
geom/uzip/g_uzip_wrkthr.c optional geom_uzip
diff --git a/sys/geom/union/g_union.c b/sys/geom/union/g_union.c
new file mode 100644
index 000000000000..a5702d175264
--- /dev/null
+++ b/sys/geom/union/g_union.c
@@ -0,0 +1,1389 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Marshall Kirk McKusick <mckusick@mckusick.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/ctype.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/reboot.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+
+#include <geom/geom.h>
+#include <geom/geom_dbg.h>
+#include <geom/union/g_union.h>
+
+/*
+ * Sysctl node kern.geom.union and its single knob kern.geom.union.debug.
+ * The knob is read/write and starts at 0; gunion(8) documents the levels
+ * 0 (silent) through 4 (maximum debug output).
+ */
+SYSCTL_DECL(_kern_geom);
+static SYSCTL_NODE(_kern_geom, OID_AUTO, union, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "GEOM_UNION stuff");
+static u_int g_union_debug = 0;
+SYSCTL_UINT(_kern_geom_union, OID_AUTO, debug, CTLFLAG_RW, &g_union_debug, 0,
+ "Debug level");
+
+/* Forward declarations of the GEOM class methods installed below. */
+static void g_union_config(struct gctl_req *req, struct g_class *mp,
+ const char *verb);
+static g_access_t g_union_access;
+static g_start_t g_union_start;
+static g_dumpconf_t g_union_dumpconf;
+static g_orphan_t g_union_orphan;
+static int g_union_destroy_geom(struct gctl_req *req, struct g_class *mp,
+ struct g_geom *gp);
+static g_provgone_t g_union_providergone;
+static g_resize_t g_union_resize;
+
+/*
+ * Class descriptor for the union GEOM class: binds the class name and
+ * GEOM ABI version to the method implementations in this file.
+ */
+struct g_class g_union_class = {
+ .name = G_UNION_CLASS_NAME,
+ .version = G_VERSION,
+ .ctlreq = g_union_config,
+ .access = g_union_access,
+ .start = g_union_start,
+ .dumpconf = g_union_dumpconf,
+ .orphan = g_union_orphan,
+ .destroy_geom = g_union_destroy_geom,
+ .providergone = g_union_providergone,
+ .resize = g_union_resize,
+};
+
+/*
+ * Prototypes for the file-local helpers.  The g_union_ctl_*() functions
+ * are the per-verb handlers dispatched from g_union_config(); their bool
+ * argument is the request's "verbose" flag.
+ */
+static void g_union_ctl_create(struct gctl_req *req, struct g_class *mp, bool);
+static intmax_t g_union_fetcharg(struct gctl_req *req, const char *name);
+static bool g_union_verify_nprefix(const char *name);
+static void g_union_ctl_destroy(struct gctl_req *req, struct g_class *mp, bool);
+static struct g_geom *g_union_find_geom(struct g_class *mp, const char *name);
+static void g_union_ctl_reset(struct gctl_req *req, struct g_class *mp, bool);
+static void g_union_ctl_revert(struct gctl_req *req, struct g_class *mp, bool);
+static void g_union_revert(struct g_union_softc *sc);
+static void g_union_doio(struct g_union_wip *wip);
+static void g_union_ctl_commit(struct gctl_req *req, struct g_class *mp, bool);
+static void g_union_setmap(struct bio *bp, struct g_union_softc *sc);
+static bool g_union_getmap(struct bio *bp, struct g_union_softc *sc,
+ off_t *len2read);
+static void g_union_done(struct bio *bp);
+static void g_union_kerneldump(struct bio *bp, struct g_union_softc *sc);
+static int g_union_dumper(void *, void *, vm_offset_t, off_t, size_t);
+static int g_union_destroy(struct gctl_req *req, struct g_geom *gp, bool force);
+
+/*
+ * Control-request entry point for the union class.
+ *
+ * The request must carry a "version" parameter equal to G_UNION_VERSION
+ * and a "verbose" parameter.  Once both are validated the verb is handed
+ * to the matching g_union_ctl_*() handler; an unrecognized verb is
+ * reported with gctl_error().  Must be called with the topology lock held.
+ */
+static void
+g_union_config(struct gctl_req *req, struct g_class *mp, const char *verb)
+{
+	uint32_t *versp, *verbosep;
+
+	g_topology_assert();
+
+	versp = gctl_get_paraml(req, "version", sizeof(*versp));
+	if (versp == NULL) {
+		gctl_error(req, "No '%s' argument.", "version");
+		return;
+	}
+	if (*versp != G_UNION_VERSION) {
+		gctl_error(req, "Userland and kernel parts are out of sync.");
+		return;
+	}
+	verbosep = gctl_get_paraml(req, "verbose", sizeof(*verbosep));
+	if (verbosep == NULL) {
+		gctl_error(req, "No '%s' argument.", "verbose");
+		return;
+	}
+
+	/* Exactly one branch runs; the final else catches unknown verbs. */
+	if (strcmp(verb, "create") == 0)
+		g_union_ctl_create(req, mp, *verbosep);
+	else if (strcmp(verb, "destroy") == 0)
+		g_union_ctl_destroy(req, mp, *verbosep);
+	else if (strcmp(verb, "reset") == 0)
+		g_union_ctl_reset(req, mp, *verbosep);
+	else if (strcmp(verb, "revert") == 0)
+		g_union_ctl_revert(req, mp, *verbosep);
+	else if (strcmp(verb, "commit") == 0)
+		g_union_ctl_commit(req, mp, *verbosep);
+	else
+		gctl_error(req, "Unknown verb.");
+}
+
+/*
+ * Create a union device.
+ *
+ * The request must name exactly two providers: arg0 is the upper
+ * (writable) provider and arg1 the lower (read-only) provider.  The
+ * optional "offset", "size" and "secsize" parameters default to 0, which
+ * selects the start of the lower provider, the remainder of the lower
+ * provider and the lower provider's sector size respectively.  The new
+ * provider is named "<gunionname>" or "<upper>-<lower>" followed by
+ * G_UNION_SUFFIX.
+ *
+ * All failures are reported through gctl_error(); on failure after the
+ * geom has been created, the partially built geom, provider and consumers
+ * are torn down again.  Must be called with the topology lock held.
+ */
+static void
+g_union_ctl_create(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	struct g_provider *upperpp, *lowerpp, *newpp;
+	struct g_consumer *uppercp, *lowercp;
+	struct g_union_softc *sc;
+	struct g_geom_alias *gap;
+	struct g_geom *gp;
+	intmax_t offset, secsize, size, needed;
+	const char *gunionname;
+	int *nargs, error, i, n;
+	char name[64];
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs < 2) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+	if (*nargs > 2) {
+		gctl_error(req, "Extra device(s).");
+		return;
+	}
+
+	offset = g_union_fetcharg(req, "offset");
+	size = g_union_fetcharg(req, "size");
+	secsize = g_union_fetcharg(req, "secsize");
+	gunionname = gctl_get_asciiparam(req, "gunionname");
+
+	upperpp = gctl_get_provider(req, "arg0");
+	lowerpp = gctl_get_provider(req, "arg1");
+	if (upperpp == NULL || lowerpp == NULL)
+		/* error message provided by gctl_get_provider() */
+		return;
+	/* Create the union */
+	if (secsize == 0)
+		secsize = lowerpp->sectorsize;
+	else if ((secsize % lowerpp->sectorsize) != 0) {
+		gctl_error(req, "Sector size %jd is not a multiple of lower "
+		    "provider %s's %jd sector size.", (intmax_t)secsize,
+		    lowerpp->name, (intmax_t)lowerpp->sectorsize);
+		return;
+	}
+	if (secsize > maxphys) {
+		gctl_error(req, "Too big secsize %jd for lower provider %s.",
+		    (intmax_t)secsize, lowerpp->name);
+		return;
+	}
+	if (secsize % upperpp->sectorsize != 0) {
+		gctl_error(req, "Sector size %jd is not a multiple of upper "
+		    "provider %s's %jd sector size.", (intmax_t)secsize,
+		    upperpp->name, (intmax_t)upperpp->sectorsize);
+		return;
+	}
+	/*
+	 * The offset is validated against the union's sector size, so
+	 * report that value rather than the lower provider's sector size.
+	 */
+	if ((offset % secsize) != 0) {
+		gctl_error(req, "Offset %jd is not a multiple of sector size "
+		    "%jd.", (intmax_t)offset, (intmax_t)secsize);
+		return;
+	}
+	/* From here on "size" is the length of the union past the offset. */
+	if (size == 0)
+		size = lowerpp->mediasize - offset;
+	else
+		size -= offset;
+	if ((size % secsize) != 0) {
+		gctl_error(req, "Size %jd is not a multiple of sector size "
+		    "%jd.", (intmax_t)size, (intmax_t)secsize);
+		return;
+	}
+	if (offset + size < lowerpp->mediasize) {
+		gctl_error(req, "Size %jd is too small for lower provider %s, "
+		    "needs %jd.", (intmax_t)(offset + size), lowerpp->name,
+		    (intmax_t)lowerpp->mediasize);
+		return;
+	}
+	if (size > upperpp->mediasize) {
+		gctl_error(req, "Upper provider %s size (%jd) is too small, "
+		    "needs %jd.", upperpp->name, (intmax_t)upperpp->mediasize,
+		    (intmax_t)size);
+		return;
+	}
+	if (gunionname != NULL && !g_union_verify_nprefix(gunionname)) {
+		gctl_error(req, "Gunion name %s must be alphanumeric.",
+		    gunionname);
+		return;
+	}
+	if (gunionname != NULL) {
+		n = snprintf(name, sizeof(name), "%s%s", gunionname,
+		    G_UNION_SUFFIX);
+	} else {
+		n = snprintf(name, sizeof(name), "%s-%s%s", upperpp->name,
+		    lowerpp->name, G_UNION_SUFFIX);
+	}
+	/* Reject an empty or truncated name. */
+	if (n <= 0 || n >= sizeof(name)) {
+		gctl_error(req, "Invalid provider name.");
+		return;
+	}
+	LIST_FOREACH(gp, &mp->geom, geom) {
+		if (strcmp(gp->name, name) == 0) {
+			gctl_error(req, "Provider %s already exists.", name);
+			return;
+		}
+	}
+	gp = g_new_geomf(mp, "%s", name);
+	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
+	rw_init(&sc->sc_rwlock, "gunion");
+	TAILQ_INIT(&sc->sc_wiplist);
+	sc->sc_offset = offset;
+	sc->sc_size = size;
+	sc->sc_sectorsize = secsize;
+	sc->sc_reads = 0;
+	sc->sc_writes = 0;
+	sc->sc_deletes = 0;
+	sc->sc_getattrs = 0;
+	sc->sc_flushes = 0;
+	sc->sc_speedups = 0;
+	sc->sc_cmd0s = 0;
+	sc->sc_cmd1s = 0;
+	sc->sc_cmd2s = 0;
+	sc->sc_readbytes = 0;
+	sc->sc_wrotebytes = 0;
+	sc->sc_writemap_memory = 0;
+	gp->softc = sc;
+
+	newpp = g_new_providerf(gp, "%s", gp->name);
+	newpp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
+	newpp->mediasize = size;
+	newpp->sectorsize = secsize;
+	/* Mirror the aliases of both underlying providers, with our suffix. */
+	LIST_FOREACH(gap, &upperpp->aliases, ga_next)
+		g_provider_add_alias(newpp, "%s%s", gap->ga_alias,
+		    G_UNION_SUFFIX);
+	LIST_FOREACH(gap, &lowerpp->aliases, ga_next)
+		g_provider_add_alias(newpp, "%s%s", gap->ga_alias,
+		    G_UNION_SUFFIX);
+	lowercp = g_new_consumer(gp);
+	lowercp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
+	if ((error = g_attach(lowercp, lowerpp)) != 0) {
+		gctl_error(req, "Error %d: cannot attach to provider %s.",
+		    error, lowerpp->name);
+		goto fail1;
+	}
+	/* request read and exclusive access for lower */
+	if ((error = g_access(lowercp, 1, 0, 1)) != 0) {
+		gctl_error(req, "Error %d: cannot obtain exclusive access to "
+		    "%s.\n\tMust be unmounted or mounted read-only.", error,
+		    lowerpp->name);
+		goto fail2;
+	}
+	uppercp = g_new_consumer(gp);
+	uppercp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
+	if ((error = g_attach(uppercp, upperpp)) != 0) {
+		gctl_error(req, "Error %d: cannot attach to provider %s.",
+		    error, upperpp->name);
+		goto fail3;
+	}
+	/* request read, write, and exclusive access for upper */
+	if ((error = g_access(uppercp, 1, 1, 1)) != 0) {
+		gctl_error(req, "Error %d: cannot obtain write access to %s.",
+		    error, upperpp->name);
+		goto fail4;
+	}
+	sc->sc_uppercp = uppercp;
+	sc->sc_lowercp = lowercp;
+
+	/* Accept unmapped I/O only if both underlying providers do. */
+	newpp->flags |= (upperpp->flags & G_PF_ACCEPT_UNMAPPED) &
+	    (lowerpp->flags & G_PF_ACCEPT_UNMAPPED);
+	g_error_provider(newpp, 0);
+	/*
+	 * Allocate the map that tracks the sectors that have been written
+	 * to the top layer. We use a 2-level hierarchy as that lets us
+	 * map up to 1 petabyte using allocations of less than 33 Mb
+	 * when using 4K byte sectors (or 268 Mb with 512 byte sectors).
+	 *
+	 * We totally populate the leaf nodes rather than allocating them
+	 * as they are first used because their usage occurs in the
+	 * g_union_start() routine that may be running in the g_down
+	 * thread which cannot sleep.
+	 */
+	sc->sc_map_size = roundup(size / secsize, BITS_PER_ENTRY);
+	needed = sc->sc_map_size / BITS_PER_ENTRY;
+	/* Smallest root size whose square covers the needed map entries. */
+	for (sc->sc_root_size = 1;
+	    sc->sc_root_size * sc->sc_root_size < needed;
+	    sc->sc_root_size++)
+		continue;
+	sc->sc_writemap_root = g_malloc(sc->sc_root_size * sizeof(uint64_t *),
+	    M_WAITOK | M_ZERO);
+	sc->sc_leaf_size = sc->sc_root_size;
+	sc->sc_bits_per_leaf = sc->sc_leaf_size * BITS_PER_ENTRY;
+	sc->sc_leafused = g_malloc(roundup(sc->sc_root_size, BITS_PER_ENTRY),
+	    M_WAITOK | M_ZERO);
+	for (i = 0; i < sc->sc_root_size; i++)
+		sc->sc_writemap_root[i] =
+		    g_malloc(sc->sc_leaf_size * sizeof(uint64_t),
+		    M_WAITOK | M_ZERO);
+	sc->sc_writemap_memory =
+	    (sc->sc_root_size + sc->sc_root_size * sc->sc_leaf_size) *
+	    sizeof(uint64_t) + roundup(sc->sc_root_size, BITS_PER_ENTRY);
+	/* %jd requires an intmax_t argument regardless of the field's type. */
+	if (verbose)
+		gctl_error(req, "Device %s created with memory map size %jd.",
+		    gp->name, (intmax_t)sc->sc_writemap_memory);
+	G_UNION_DEBUG(1, "Device %s created with memory map size %jd.",
+	    gp->name, (intmax_t)sc->sc_writemap_memory);
+	return;
+
+	/*
+	 * Unwind in reverse order of construction.
+	 * NOTE(review): sc is not freed here; presumably the class's
+	 * providergone method (g_union_providergone, registered in
+	 * g_union_class) releases gp->softc when g_destroy_provider()
+	 * runs -- confirm before adding an explicit free.
+	 */
+fail4:
+	g_detach(uppercp);
+fail3:
+	g_destroy_consumer(uppercp);
+	g_access(lowercp, -1, 0, -1);
+fail2:
+	g_detach(lowercp);
+fail1:
+	g_destroy_consumer(lowercp);
+	g_destroy_provider(newpp);
+	g_destroy_geom(gp);
+}
+
+/*
+ * Fetch an optional numeric request parameter.
+ *
+ * Returns the parameter's value when it is present and not negative.
+ * Returns 0 when the parameter is absent, and also when it is negative,
+ * in which case an error is recorded on the request.
+ */
+static intmax_t
+g_union_fetcharg(struct gctl_req *req, const char *name)
+{
+	intmax_t *argp;
+
+	argp = gctl_get_paraml_opt(req, name, sizeof(*argp));
+	if (argp == NULL)
+		return (0);
+	if (*argp < 0) {
+		gctl_error(req, "Invalid '%s': negative value, using default.",
+		    name);
+		return (0);
+	}
+	return (*argp);
+}
+
+/*
+ * Verify that a name is alphanumeric.
+ *
+ * Returns true when every character of name is a letter or a digit and
+ * false at the first character that is neither.  An empty string has no
+ * offending character and therefore yields true.
+ */
+static bool
+g_union_verify_nprefix(const char *name)
+{
+	const char *cp;
+
+	/*
+	 * Walk to the terminating NUL instead of re-evaluating strlen()
+	 * on every iteration and comparing it against a signed index.
+	 */
+	for (cp = name; *cp != '\0'; cp++) {
+		if (isalpha(*cp) == 0 && isdigit(*cp) == 0)
+			return (false);
+	}
+	return (true);
+}
+
+/*
+ * Destroy the union devices named by the request's arg0..argN parameters.
+ *
+ * Requires the "nargs" and "force" parameters.  A leading _PATH_DEV on a
+ * device name is skipped before the geom is looked up.  A problem with one
+ * device is recorded with gctl_msg() and processing moves on to the next;
+ * the collected messages are posted once at the end.  Must be called with
+ * the topology lock held.
+ */
+static void
+g_union_ctl_destroy(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	struct g_geom *gp;
+	const char *devname;
+	char argname[16];
+	int *countp, *forcep, argno, rc;
+
+	g_topology_assert();
+
+	countp = gctl_get_paraml(req, "nargs", sizeof(*countp));
+	if (countp == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*countp <= 0) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+	forcep = gctl_get_paraml(req, "force", sizeof(*forcep));
+	if (forcep == NULL) {
+		gctl_error(req, "No 'force' argument.");
+		return;
+	}
+
+	for (argno = 0; argno < *countp; argno++) {
+		snprintf(argname, sizeof(argname), "arg%d", argno);
+		devname = gctl_get_asciiparam(req, argname);
+		if (devname == NULL) {
+			gctl_msg(req, "No '%s' argument.", argname);
+			continue;
+		}
+		/* Accept both "/dev/foo.union" and "foo.union". */
+		if (strncmp(devname, _PATH_DEV, strlen(_PATH_DEV)) == 0)
+			devname += strlen(_PATH_DEV);
+		gp = g_union_find_geom(mp, devname);
+		if (gp == NULL) {
+			gctl_msg(req, "Device %s is invalid.", devname);
+			continue;
+		}
+		/* Only hand the request down when verbose output is wanted. */
+		rc = g_union_destroy(verbose ? req : NULL, gp, *forcep);
+		if (rc != 0)
+			gctl_msg(req, "Error %d: cannot destroy device %s.",
+			    rc, gp->name);
+	}
+	gctl_post_messages(req);
+}
+
+/*
+ * Look up a geom of class "mp" by exact name.
+ *
+ * Returns the first geom whose name compares equal to "name", or NULL
+ * when the class has no such geom.
+ */
+static struct g_geom *
+g_union_find_geom(struct g_class *mp, const char *name)
+{
+	struct g_geom *gp;
+
+	/* The iterator is left NULL when the list is exhausted. */
+	LIST_FOREACH(gp, &mp->geom, geom) {
+		if (strcmp(gp->name, name) == 0)
+			break;
+	}
+	return (gp);
+}
+
+/*
+ * Zero out all the statistics associated with a union device.
+ *
+ * The request must carry "nargs" (a positive count); each "argN" names
+ * a provider belonging to this class.  Every counter reported by
+ * g_union_dumpconf() is cleared, including the overlap counters that
+ * g_union_doio() maintains.  Problems with individual arguments are
+ * queued as messages and the remaining arguments are still processed.
+ */
+static void
+g_union_ctl_reset(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	struct g_union_softc *sc;
+	struct g_provider *pp;
+	struct g_geom *gp;
+	char param[16];
+	int i, *nargs;
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs <= 0) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+
+	for (i = 0; i < *nargs; i++) {
+		snprintf(param, sizeof(param), "arg%d", i);
+		pp = gctl_get_provider(req, param);
+		if (pp == NULL) {
+			gctl_msg(req, "No '%s' argument.", param);
+			continue;
+		}
+		gp = pp->geom;
+		if (gp->class != mp) {
+			gctl_msg(req, "Provider %s is invalid.",
+			    pp->name);
+			continue;
+		}
+		sc = gp->softc;
+		sc->sc_reads = 0;
+		sc->sc_writes = 0;
+		sc->sc_deletes = 0;
+		sc->sc_getattrs = 0;
+		sc->sc_flushes = 0;
+		sc->sc_speedups = 0;
+		sc->sc_cmd0s = 0;
+		sc->sc_cmd1s = 0;
+		sc->sc_cmd2s = 0;
+		sc->sc_readbytes = 0;
+		sc->sc_wrotebytes = 0;
+		/*
+		 * The overlap counters are also reported as statistics.
+		 * g_union_doio() updates them while holding the write
+		 * lock, so clear them under the same lock.
+		 */
+		G_WLOCK(sc);
+		sc->sc_readcurrentread = 0;
+		sc->sc_readblockwrite = 0;
+		sc->sc_writeblockread = 0;
+		sc->sc_writeblockwrite = 0;
+		G_WUNLOCK(sc);
+		if (verbose)
+			gctl_msg(req, "Device %s has been reset.", pp->name);
+		G_UNION_DEBUG(1, "Device %s has been reset.", pp->name);
+	}
+	gctl_post_messages(req);
+}
+
+/*
+ * Handle the "revert" control verb: discard every write recorded in
+ * the top layer of each named union.
+ *
+ * The request must carry "nargs" (a positive count); each "argN" names
+ * a provider of this class.  A provider is reverted only if its
+ * writelock can be taken and it has no read, write or exclusive
+ * opens.  Problems with individual arguments are queued as messages
+ * and the remaining arguments are still processed.
+ */
+static void
+g_union_ctl_revert(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	struct g_union_softc *sc;
+	struct g_provider *pp;
+	struct g_geom *gp;
+	char argname[16];
+	int *countp;
+	int argno;
+
+	g_topology_assert();
+
+	countp = gctl_get_paraml(req, "nargs", sizeof(*countp));
+	if (countp == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*countp <= 0) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+
+	for (argno = 0; argno < *countp; argno++) {
+		snprintf(argname, sizeof(argname), "arg%d", argno);
+		pp = gctl_get_provider(req, argname);
+		if (pp == NULL) {
+			gctl_msg(req, "No '%s' argument.", argname);
+			continue;
+		}
+		gp = pp->geom;
+		if (gp->class != mp) {
+			gctl_msg(req, "Provider %s is invalid.", pp->name);
+			continue;
+		}
+		sc = gp->softc;
+		if (g_union_get_writelock(sc) != 0) {
+			gctl_msg(req, "Revert already in progress for "
+			    "provider %s.", pp->name);
+			continue;
+		}
+		/* Only an entirely unopened union may be reverted. */
+		if (pp->acr <= 0 && pp->acw <= 0 && pp->ace <= 0) {
+			g_union_revert(sc);
+			g_union_rel_writelock(sc);
+			if (verbose)
+				gctl_msg(req, "Device %s has been reverted.",
+				    pp->name);
+			G_UNION_DEBUG(1, "Device %s has been reverted.",
+			    pp->name);
+			continue;
+		}
+		gctl_msg(req, "Unable to get exclusive access for "
+		    "reverting of %s;\n\t%s cannot be mounted or "
+		    "otherwise open during a revert.",
+		    pp->name, pp->name);
+		g_union_rel_writelock(sc);
+	}
+	gctl_post_messages(req);
+}
+
+/*
+ * Forget every recorded write by clearing the whole write map.
+ *
+ * With the map write-locked, each of the sc_root_size leaves and the
+ * sc_leafused bitmap are filled with zeros.
+ */
+static void
+g_union_revert(struct g_union_softc *sc)
+{
+	size_t leafbytes, usedbytes;
+	int leafno;
+
+	G_WLOCK(sc);
+	leafbytes = sc->sc_leaf_size * sizeof(uint64_t);
+	usedbytes = roundup(sc->sc_root_size, BITS_PER_ENTRY);
+	for (leafno = 0; leafno < sc->sc_root_size; leafno++)
+		memset(sc->sc_writemap_root[leafno], 0, leafbytes);
+	memset(sc->sc_leafused, 0, usedbytes);
+	G_WUNLOCK(sc);
+}
+
+/*
+ * Commit all the writes made in the top layer to the lower layer.
+ *
+ * The request must carry "nargs", "force" and "reboot"; each "argN"
+ * names a union provider.  For each provider the writelock is taken,
+ * the lower consumer is opened for writing, and every range that
+ * g_union_getmap() reports as written is copied from the upper
+ * consumer to the lower consumer in pieces of at most MAXBSIZE bytes.
+ * On success the write map is cleared with g_union_revert().  A read
+ * or write error aborts the commit of that provider and leaves its
+ * write map untouched.  Problems with individual arguments are queued
+ * as messages and the remaining arguments are still processed.  If
+ * "reboot" is non-zero, kern_reboot() is called after all arguments
+ * have been handled.
+ */
+static void
+g_union_ctl_commit(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+ struct g_union_softc *sc;
+ struct g_provider *pp, *lowerpp;
+ struct g_consumer *lowercp;
+ struct g_geom *gp;
+ struct bio *bp;
+ char param[16];
+ off_t len2rd, len2wt, savelen;
+ int i, error, error1, *nargs, *force, *reboot;
+
+ g_topology_assert();
+
+ nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+ if (nargs == NULL) {
+ gctl_error(req, "No '%s' argument.", "nargs");
+ return;
+ }
+ if (*nargs <= 0) {
+ gctl_error(req, "Missing device(s).");
+ return;
+ }
+ force = gctl_get_paraml(req, "force", sizeof(*force));
+ if (force == NULL) {
+ gctl_error(req, "No 'force' argument.");
+ return;
+ }
+ reboot = gctl_get_paraml(req, "reboot", sizeof(*reboot));
+ if (reboot == NULL) {
+ gctl_error(req, "No 'reboot' argument.");
+ return;
+ }
+
+ /*
+  * Get a bio buffer to do our I/O.  The same bio and its MAXBSIZE
+  * data buffer are reused for every read and write of every
+  * provider and are released once after the loop.
+  */
+ bp = g_alloc_bio();
+ bp->bio_data = g_malloc(MAXBSIZE, M_WAITOK);
+ bp->bio_done = biodone;
+ for (i = 0; i < *nargs; i++) {
+ snprintf(param, sizeof(param), "arg%d", i);
+ pp = gctl_get_provider(req, param);
+ if (pp == NULL) {
+ gctl_msg(req, "No '%s' argument.", param);
+ continue;
+ }
+ gp = pp->geom;
+ if (gp->class != mp) {
+ gctl_msg(req, "Provider %s is invalid.", pp->name);
+ continue;
+ }
+ sc = gp->softc;
+ if (g_union_get_writelock(sc) != 0) {
+ gctl_msg(req, "Commit already in progress for "
+ "provider %s.", pp->name);
+ continue;
+ }
+
+ /* upgrade to write access for lower */
+ lowercp = sc->sc_lowercp;
+ lowerpp = lowercp->provider;
+ /*
+ * No mount or other use of union is allowed, unless the
+ * -f flag is given which allows read-only mount or usage.
+ */
+ if ((*force == false && pp->acr > 0) || pp->acw > 0 ||
+ pp->ace > 0) {
+ gctl_msg(req, "Unable to get exclusive access for "
+ "writing of %s.\n\tNote that %s cannot be mounted "
+ "or otherwise\n\topen during a commit unless the "
+ "-f flag is used.", pp->name, pp->name);
+ g_union_rel_writelock(sc);
+ continue;
+ }
+ /*
+ * No mount or other use of lower media is allowed, unless the
+ * -f flag is given which allows read-only mount or usage.
+ * The lower provider's counts are compared against those of
+ * our own consumer so that only opens by others are detected.
+ */
+ if ((*force == false && lowerpp->acr > lowercp->acr) ||
+ lowerpp->acw > lowercp->acw ||
+ lowerpp->ace > lowercp->ace) {
+ gctl_msg(req, "provider %s is unable to get "
+ "exclusive access to %s\n\tfor writing. Note that "
+ "%s cannot be mounted or otherwise open\n\tduring "
+ "a commit unless the -f flag is used.", pp->name,
+ lowerpp->name, lowerpp->name);
+ g_union_rel_writelock(sc);
+ continue;
+ }
+ if ((error = g_access(lowercp, 0, 1, 0)) != 0) {
+ gctl_msg(req, "Error %d: provider %s is unable to "
+ "access %s for writing.", error, pp->name,
+ lowerpp->name);
+ g_union_rel_writelock(sc);
+ continue;
+ }
+ g_topology_unlock(); /* retaken at the cleanup label */
+ /*
+  * Loop over write map copying across written blocks.
+  * bio_offset/bio_length describe the part of the map that
+  * has not been examined yet.
+  */
+ bp->bio_offset = 0;
+ bp->bio_length = sc->sc_map_size * sc->sc_sectorsize;
+ G_RLOCK(sc);
+ error = 0;
+ while (bp->bio_length > 0) {
+ if (!g_union_getmap(bp, sc, &len2rd)) {
+ /* not written, so skip */
+ bp->bio_offset += len2rd;
+ bp->bio_length -= len2rd;
+ continue;
+ }
+ G_RUNLOCK(sc); /* map lock is not held across the I/O */
+ /* need to read then write len2rd bytes */
+ for ( ; len2rd > 0; len2rd -= len2wt) {
+ /* limit ourselves to MAXBSIZE size I/Os */
+ len2wt = len2rd;
+ if (len2wt > MAXBSIZE)
+ len2wt = MAXBSIZE;
+ savelen = bp->bio_length; /* remaining map span */
+ bp->bio_length = len2wt;
+ bp->bio_cmd = BIO_READ;
+ g_io_request(bp, sc->sc_uppercp);
+ if ((error = biowait(bp, "rdunion")) != 0) {
+ gctl_msg(req, "Commit read error %d "
+ "in provider %s, commit aborted.",
+ error, pp->name);
+ goto cleanup;
+ }
+ bp->bio_flags &= ~BIO_DONE; /* bp is reused for the write */
+ bp->bio_cmd = BIO_WRITE;
+ g_io_request(bp, lowercp);
+ if ((error = biowait(bp, "wtunion")) != 0) {
+ gctl_msg(req, "Commit write error %d "
+ "in provider %s, commit aborted.",
+ error, pp->name);
+ goto cleanup;
+ }
+ bp->bio_flags &= ~BIO_DONE; /* bp is reused for the next read */
+ bp->bio_offset += len2wt;
+ bp->bio_length = savelen - len2wt;
+ }
+ G_RLOCK(sc);
+ }
+ G_RUNLOCK(sc);
+ /* clear the write map */
+ g_union_revert(sc);
+cleanup:
+ g_topology_lock();
+ /* return lower to previous access */
+ if ((error1 = g_access(lowercp, 0, -1, 0)) != 0) {
+ G_UNION_DEBUG(2, "Error %d: device %s could not reset "
+ "access to %s (r=0 w=-1 e=0).", error1, pp->name,
+ lowerpp->name);
+ }
+ g_union_rel_writelock(sc);
+ if (error == 0 && verbose)
+ gctl_msg(req, "Device %s has been committed.",
+ pp->name);
+ G_UNION_DEBUG(1, "Device %s has been committed.", pp->name); /* NOTE(review): also logged when error != 0 -- confirm intended */
+ }
+ gctl_post_messages(req);
+ g_free(bp->bio_data);
+ g_destroy_bio(bp);
+ if (*reboot)
+ kern_reboot(RB_AUTOBOOT);
+}
+
+/*
+ * Access method for the union provider.
+ *
+ * r, w and e are the requested changes to the read, write and
+ * exclusive counts.  Without a softc only requests that raise no count
+ * succeed; anything else gets ENXIO.  Otherwise the request is granted
+ * whenever the writelock can be taken (it is released again at once).
+ * While the writelock is held elsewhere, only a request that drops an
+ * open provider to zero total opens is allowed; others get EBUSY.
+ */
+static int
+g_union_access(struct g_provider *pp, int r, int w, int e)
+{
+	struct g_union_softc *sc;
+	int curopens, newopens;
+
+	sc = pp->geom->softc;
+	if (sc == NULL)
+		return ((r > 0 || w > 0 || e > 0) ? ENXIO : 0);
+
+	curopens = pp->acr + pp->acw + pp->ace;
+	newopens = (r + pp->acr) + (w + pp->acw) + (e + pp->ace);
+	if (g_union_get_writelock(sc) == 0) {
+		g_union_rel_writelock(sc);
+		return (0);
+	}
+	/* Writelock is busy: permit only a final close. */
+	if (curopens > 0 && newopens == 0)
+		return (0);
+	return (EBUSY);
+}
+
+/*
+ * Initiate an I/O operation on the union device.
+ *
+ * Reads and writes get a work-in-progress record describing the byte
+ * range they cover (shifted by sc_offset) and are handed to
+ * g_union_doio().  A GETATTR for "GEOM::kerneldump" is answered by
+ * g_union_kerneldump().  Every other command is counted, cloned and
+ * sent to the upper consumer.  Allocation failures complete the
+ * request with ENOMEM.
+ */
+static void
+g_union_start(struct bio *bp)
+{
+	struct g_union_softc *sc;
+	struct g_union_wip *wip;
+	struct bio *cbp;
+
+	sc = bp->bio_to->geom->softc;
+	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
+		wip = g_malloc(sizeof(*wip), M_NOWAIT);
+		if (wip == NULL) {
+			g_io_deliver(bp, ENOMEM);
+			return;
+		}
+		TAILQ_INIT(&wip->wip_waiting);
+		wip->wip_bp = bp;
+		wip->wip_sc = sc;
+		wip->wip_start = bp->bio_offset + sc->sc_offset;
+		/* wip_end is inclusive: the last byte of the request */
+		wip->wip_end = wip->wip_start + bp->bio_length - 1;
+		wip->wip_numios = 1;
+		wip->wip_error = 0;
+		g_union_doio(wip);
+		return;
+	}
+
+	/*
+	 * All commands other than read and write are passed through to
+	 * the upper-level device since it is writable and thus able to
+	 * respond to delete, flush, and speedup requests.
+	 */
+	cbp = g_clone_bio(bp);
+	if (cbp == NULL) {
+		g_io_deliver(bp, ENOMEM);
+		return;
+	}
+	cbp->bio_offset = bp->bio_offset + sc->sc_offset;
+	cbp->bio_done = g_std_done;
+
+	switch (cbp->bio_cmd) {
+	case BIO_DELETE:
+		G_UNION_LOGREQ(cbp, "Delete request received.");
+		atomic_add_long(&sc->sc_deletes, 1);
+		break;
+	case BIO_GETATTR:
+		G_UNION_LOGREQ(cbp, "Getattr request received.");
+		atomic_add_long(&sc->sc_getattrs, 1);
+		if (strcmp(cbp->bio_attribute, "GEOM::kerneldump") != 0)
+			/* forward the GETATTR to the upper-level device */
+			break;
+		/*
+		 * Kernel dump queries are answered here by completing
+		 * the original bio, so the clone is never issued and
+		 * must be released to avoid leaking it.
+		 */
+		g_destroy_bio(cbp);
+		g_union_kerneldump(bp, sc);
+		return;
+	case BIO_FLUSH:
+		G_UNION_LOGREQ(cbp, "Flush request received.");
+		atomic_add_long(&sc->sc_flushes, 1);
+		break;
+	case BIO_SPEEDUP:
+		G_UNION_LOGREQ(cbp, "Speedup request received.");
+		atomic_add_long(&sc->sc_speedups, 1);
+		break;
+	case BIO_CMD0:
+		G_UNION_LOGREQ(cbp, "Cmd0 request received.");
+		atomic_add_long(&sc->sc_cmd0s, 1);
+		break;
+	case BIO_CMD1:
+		G_UNION_LOGREQ(cbp, "Cmd1 request received.");
+		atomic_add_long(&sc->sc_cmd1s, 1);
+		break;
+	case BIO_CMD2:
+		G_UNION_LOGREQ(cbp, "Cmd2 request received.");
+		atomic_add_long(&sc->sc_cmd2s, 1);
+		break;
+	default:
+		G_UNION_LOGREQ(cbp, "Unknown (%d) request received.",
+		    cbp->bio_cmd);
+		break;
+	}
+	g_io_request(cbp, sc->sc_uppercp);
+}
+
+/*
+ * Initiate a read or write operation on the union device.
+ *
+ * The request is queued behind any active request whose byte range it
+ * overlaps, unless both are reads.  Once cleared to run, a write is
+ * sent to the upper consumer as one bio; a read is sent as one or more
+ * bios, each directed at the upper or lower consumer according to
+ * g_union_getmap().  Completion is handled by g_union_done().
+ */
+static void
+g_union_doio(struct g_union_wip *wip)
+{
+ struct g_union_softc *sc;
+ struct g_consumer *cp, *firstcp;
+ struct g_union_wip *activewip;
+ struct bio *cbp, *firstbp;
+ off_t rdlen, len2rd, offset;
+ int iocnt, needstoblock;
+ char *level;
+
+ /*
+ * To maintain consistency, we cannot allow concurrent reads
+ * or writes to the same block.
+ *
+ * A work-in-progress (wip) structure is allocated for each
+ * read or write request. All active requests are kept on the
+ * softc sc_wiplist. As each request arrives, it is checked to
+ * see if it overlaps any of the active entries. If it does not
+ * overlap, then it is added to the active list and initiated.
+ * If it does overlap an active entry, it is added to the
+ * wip_waiting list for the active entry that it overlaps.
+ * When an active entry completes, it restarts all the requests
+ * on its wip_waiting list.
+ *
+ * The one exception is a read overlapping an active read: it is
+ * counted in sc_readcurrentread but is not made to wait.
+ */
+ sc = wip->wip_sc;
+ G_WLOCK(sc);
+ TAILQ_FOREACH(activewip, &sc->sc_wiplist, wip_next) {
+ if (wip->wip_end < activewip->wip_start ||
+ wip->wip_start > activewip->wip_end)
+ continue; /* ranges do not overlap */
+ needstoblock = 1;
+ if (wip->wip_bp->bio_cmd == BIO_WRITE)
+ if (activewip->wip_bp->bio_cmd == BIO_WRITE)
+ sc->sc_writeblockwrite += 1;
+ else /* new write, active read */
+ sc->sc_readblockwrite += 1;
+ else /* new request is a read */
+ if (activewip->wip_bp->bio_cmd == BIO_WRITE)
+ sc->sc_writeblockread += 1;
+ else {
+ sc->sc_readcurrentread += 1;
+ needstoblock = 0;
+ }
+ /* Put request on a waiting list if necessary */
+ if (needstoblock) {
+ TAILQ_INSERT_TAIL(&activewip->wip_waiting, wip,
+ wip_next);
+ G_WUNLOCK(sc);
+ return;
+ }
+ }
+ /* Put request on the active list */
+ TAILQ_INSERT_TAIL(&sc->sc_wiplist, wip, wip_next);
+
+ /*
+ * Process I/O requests that have been cleared to go.
+ * If the first clone cannot be allocated the request is taken
+ * back off the active list and failed with ENOMEM.
+ */
+ cbp = g_clone_bio(wip->wip_bp);
+ if (cbp == NULL) {
+ TAILQ_REMOVE(&sc->sc_wiplist, wip, wip_next);
+ G_WUNLOCK(sc);
+ KASSERT(TAILQ_FIRST(&wip->wip_waiting) == NULL,
+ ("g_union_doio: non-empty work-in-progress waiting queue"));
+ g_io_deliver(wip->wip_bp, ENOMEM);
+ g_free(wip);
+ return;
+ }
+ G_WUNLOCK(sc);
+ cbp->bio_caller1 = wip; /* lets g_union_done() find the wip */
+ cbp->bio_done = g_union_done;
+ cbp->bio_offset = wip->wip_start;
+
+ /*
+ * Writes are always done to the top level. The blocks that
+ * are written are recorded in the bitmap when the I/O completes.
+ */
+ if (cbp->bio_cmd == BIO_WRITE) {
+ G_UNION_LOGREQ(cbp, "Sending %jd byte write request to upper "
+ "level.", cbp->bio_length);
+ atomic_add_long(&sc->sc_writes, 1);
+ atomic_add_long(&sc->sc_wrotebytes, cbp->bio_length);
+ g_io_request(cbp, sc->sc_uppercp);
+ return;
+ }
+ /*
+ * The usual read case is that we either read the top layer
+ * if the block has been previously written or the bottom layer
+ * if it has not been written. However, it is possible that
+ * only part of the block has been written, For example we may
+ * have written a UFS/FFS file fragment comprising several
+ * sectors out of an 8-sector block. Here, if the entire
+ * 8-sector block is read for example by a snapshot needing
+ * to copy the full block, then we need to read the written
+ * sectors from the upper level and the unwritten sectors from
+ * the lower level. We do this by alternately reading from the
+ * top and bottom layers until we complete the read. We
+ * simplify for the common case to just do the I/O and return.
+ *
+ * In the loop below rdlen is the number of bytes not yet
+ * assigned to a bio and offset is the number of bytes already
+ * assigned.
+ */
+ atomic_add_long(&sc->sc_reads, 1);
+ atomic_add_long(&sc->sc_readbytes, cbp->bio_length);
+ rdlen = cbp->bio_length;
+ offset = 0;
+ for (iocnt = 0; ; iocnt++) {
+ if (g_union_getmap(cbp, sc, &len2rd)) {
+ /* read top */
+ cp = sc->sc_uppercp;
+ level = "upper";
+ } else {
+ /* read bottom */
+ cp = sc->sc_lowercp;
+ level = "lower";
+ }
+ /* Check if only a single read is required */
+ if (iocnt == 0 && rdlen == len2rd) {
+ G_UNION_LOGREQLVL((cp == sc->sc_uppercp) ?
+ 3 : 4, cbp, "Sending %jd byte read "
+ "request to %s level.", len2rd, level);
+ g_io_request(cbp, cp);
+ return;
+ }
+ cbp->bio_length = len2rd;
+ if ((cbp->bio_flags & BIO_UNMAPPED) != 0)
+ cbp->bio_ma_offset += offset;
+ else
+ cbp->bio_data += offset;
+ offset += len2rd;
+ rdlen -= len2rd;
+ G_UNION_LOGREQLVL(3, cbp, "Sending %jd byte read "
+ "request to %s level.", len2rd, level);
+ /*
+ * To avoid prematurely notifying our consumer
+ * that their I/O has completed, we have to delay
+ * issuing our first I/O request until we have
+ * issued all the additional I/O requests.
+ */
+ if (iocnt > 0) {
+ atomic_add_long(&wip->wip_numios, 1);
+ g_io_request(cbp, cp);
+ } else {
+ firstbp = cbp;
+ firstcp = cp;
+ }
+ if (rdlen == 0)
+ break;
+ /*
+ * set up for next read
+ *
+ * NOTE(review): when this clone fails, wip_numios is
+ * decremented although no bio was counted for the failed
+ * clone; confirm the count still matches the bios that
+ * will reach g_union_done().
+ *
+ * NOTE(review): the first clone's bio_offset is set to
+ * wip_start, which includes sc_offset, while later clones
+ * only add "offset" to the bio_offset they start with
+ * (presumably inherited from wip_bp); confirm these agree
+ * when sc_offset is non-zero.
+ */
+ cbp = g_clone_bio(wip->wip_bp);
+ if (cbp == NULL) {
+ wip->wip_error = ENOMEM;
+ atomic_add_long(&wip->wip_numios, -1);
+ break;
+ }
+ cbp->bio_caller1 = wip;
+ cbp->bio_done = g_union_done;
+ cbp->bio_offset += offset;
+ cbp->bio_length = rdlen;
+ atomic_add_long(&sc->sc_reads, 1);
+ }
+ /* We have issued all our I/O, so start the first one */
+ g_io_request(firstbp, firstcp);
+ return;
+}
+
+/*
+ * Completion handler for the bios issued by g_union_doio().
+ *
+ * Any error saved in the work-in-progress record is folded into this
+ * bio unless the bio already has one.  The bio that brings wip_numios
+ * down to zero retires the record: a write is entered in the write
+ * map, the record leaves the active list, every request queued behind
+ * it is restarted and the record is freed.  The bio is then passed to
+ * g_std_done() in all cases.
+ *
+ * NOTE(review): a write is recorded in the map without looking at
+ * bio_error -- confirm that is intended for failed writes.
+ */
+static void
+g_union_done(struct bio *bp)
+{
+	struct g_union_wip *wip, *blocked;
+	struct g_union_softc *sc;
+
+	wip = bp->bio_caller1;
+	if (bp->bio_error == 0 && wip->wip_error != 0)
+		bp->bio_error = wip->wip_error;
+	wip->wip_error = 0;
+	if (atomic_fetchadd_long(&wip->wip_numios, -1) != 1) {
+		/* Other pieces of this request are still outstanding. */
+		g_std_done(bp);
+		return;
+	}
+
+	sc = wip->wip_sc;
+	G_WLOCK(sc);
+	if (bp->bio_cmd == BIO_WRITE)
+		g_union_setmap(bp, sc);
+	TAILQ_REMOVE(&sc->sc_wiplist, wip, wip_next);
+	G_WUNLOCK(sc);
+
+	/* Restart everything that was waiting on this request. */
+	for (;;) {
+		blocked = TAILQ_FIRST(&wip->wip_waiting);
+		if (blocked == NULL)
+			break;
+		TAILQ_REMOVE(&wip->wip_waiting, blocked, wip_next);
+		g_union_doio(blocked);
+	}
+	g_free(wip);
+	g_std_done(bp);
+}
+
+/*
+ * Mark every sector covered by a bio as written.
+ *
+ * The caller must hold the map write lock.  For each sector the bit
+ * in its leaf is set, as is the sc_leafused bit of that leaf.
+ */
+static void
+g_union_setmap(struct bio *bp, struct g_union_softc *sc)
+{
+	uint64_t *leaf;
+	size_t root_idx;
+	off_t sector, remaining;
+
+	G_WLOCKOWNED(sc);
+	KASSERT(bp->bio_offset % sc->sc_sectorsize == 0,
+	    ("g_union_setmap: offset not on sector boundry"));
+	KASSERT(bp->bio_length % sc->sc_sectorsize == 0,
+	    ("g_union_setmap: length not a multiple of sectors"));
+	sector = bp->bio_offset / sc->sc_sectorsize;
+	remaining = bp->bio_length / sc->sc_sectorsize;
+	KASSERT(sector + remaining <= sc->sc_map_size,
+	    ("g_union_setmap: block %jd is out of range",
+	    sector + remaining));
+	while (remaining > 0) {
+		root_idx = sector / sc->sc_bits_per_leaf;
+		leaf = sc->sc_writemap_root[root_idx];
+		leaf[(sector % sc->sc_bits_per_leaf) / BITS_PER_ENTRY] |=
+		    1ULL << (sector % BITS_PER_ENTRY);
+		sc->sc_leafused[root_idx / BITS_PER_ENTRY] |=
+		    1ULL << (root_idx % BITS_PER_ENTRY);
+		remaining--;
+		sector++;
+	}
+}
+
+/*
+ * Check map to determine whether blocks have been written.
+ *
+ * Return true if they have been written so should be read from the top
+ * layer. Return false if they have not been written so should be read
+ * from the bottom layer. Return in len2read the bytes to be read. See
+ * the comment above the read handling in g_union_doio() for an
+ * explanation of why len2read may be shorter than the buffer length.
+ *
+ * The answer describes the longest run of sectors, starting at
+ * bp->bio_offset and limited to bp->bio_length, that all share the
+ * state of the first sector. Only bio_offset and bio_length of bp
+ * are examined.
+ */
+static bool
+g_union_getmap(struct bio *bp, struct g_union_softc *sc, off_t *len2read)
+{
+ off_t start, numsec, leafresid, bitloc;
+ bool first, maptype, retval;
+ uint64_t *leaf, word;
+ size_t root_idx;
+
+ KASSERT(bp->bio_offset % sc->sc_sectorsize == 0,
+ ("g_union_getmap: offset not on sector boundry"));
+ KASSERT(bp->bio_length % sc->sc_sectorsize == 0,
+ ("g_union_getmap: length not a multiple of sectors"));
+ start = bp->bio_offset / sc->sc_sectorsize;
+ numsec = bp->bio_length / sc->sc_sectorsize;
+ G_UNION_DEBUG(4, "g_union_getmap: check %jd sectors starting at %jd\n",
+ numsec, start);
+ KASSERT(start + numsec <= sc->sc_map_size,
+ ("g_union_getmap: block %jd is out of range", start + numsec));
+ root_idx = start / sc->sc_bits_per_leaf; /* recomputed on every loop pass */
+ first = true; /* state of the first sector not yet known */
+ maptype = false;
+ while (numsec > 0) {
+ /*
+  * Check first if the leaf records any written sectors.
+  * If not, the rest of the leaf is unwritten and can be
+  * skipped in one step.
+  */
+ root_idx = start / sc->sc_bits_per_leaf;
+ leafresid = sc->sc_bits_per_leaf -
+ (start % sc->sc_bits_per_leaf);
+ if (((sc->sc_leafused[root_idx / BITS_PER_ENTRY]) &
+ (1ULL << (root_idx % BITS_PER_ENTRY))) == 0) {
+ if (first) {
+ maptype = false;
+ first = false;
+ }
+ if (maptype)
+ break; /* run of written sectors ends here */
+ numsec -= leafresid;
+ start += leafresid;
+ continue;
+ }
+ /*
+  * Check up to a word boundary, then check word by word.
+  * A word-aligned position whose word is all zeros or all
+  * ones is handled a whole word at a time.
+  */
+ leaf = sc->sc_writemap_root[root_idx];
+ word = leaf[(start % sc->sc_bits_per_leaf) / BITS_PER_ENTRY];
+ bitloc = start % BITS_PER_ENTRY;
+ if (bitloc == 0 && (word == 0 || word == ~0)) {
+ if (first) {
+ if (word == 0)
+ maptype = false;
+ else
+ maptype = true;
+ first = false;
+ }
+ if ((word == 0 && maptype) ||
+ (word == ~0 && !maptype))
+ break; /* this word starts a run of the other state */
+ numsec -= BITS_PER_ENTRY;
+ start += BITS_PER_ENTRY;
+ continue;
+ }
+ for ( ; bitloc < BITS_PER_ENTRY; bitloc ++) {
+ retval = (word & (1ULL << bitloc)) != 0;
+ if (first) {
+ maptype = retval;
+ first = false;
+ }
+ if (maptype == retval) {
+ numsec--;
+ start++;
+ continue;
+ }
+ goto out; /* first sector in the other state */
+ }
+ }
+out:
+ if (numsec < 0) { /* a whole-leaf or whole-word step overshot the request */
+ start += numsec;
+ numsec = 0;
+ }
+ *len2read = bp->bio_length - (numsec * sc->sc_sectorsize);
+ G_UNION_DEBUG(maptype ? 3 : 4,
+ "g_union_getmap: return maptype %swritten for %jd "
+ "sectors ending at %jd\n", maptype ? "" : "NOT ",
+ *len2read / sc->sc_sectorsize, start - 1);
+ return (maptype);
+}
+
+/*
+ * Answer a BIO_GETATTR request for "GEOM::kerneldump".
+ *
+ * The g_kerneldump structure carried in bio_data is filled in with
+ * g_union_dumper() as the dump routine and with the media offset
+ * shifted by sc_offset.  A request starting beyond sc_size is failed
+ * with ENODEV; one running past sc_size is shortened to fit.  The bio
+ * is completed here in every case.
+ */
+static void
+g_union_kerneldump(struct bio *bp, struct g_union_softc *sc)
+{
+	struct g_provider *pp;
+	struct g_kerneldump *gkd;
+	struct g_geom *gp;
+
+	gp = bp->bio_to->geom;
+	gkd = (struct g_kerneldump *)bp->bio_data;
+	g_trace(G_T_TOPOLOGY, "%s(%s, %jd, %jd)", __func__, gp->name,
+	    (intmax_t)gkd->offset, (intmax_t)gkd->length);
+
+	/* The dumper description is filled in before the range check. */
+	pp = LIST_FIRST(&gp->provider);
+	gkd->di.mediaoffset = sc->sc_offset + gkd->offset;
+	gkd->di.maxiosize = DFLTPHYS;
+	gkd->di.blocksize = pp->sectorsize;
+	gkd->di.priv = sc;
+	gkd->di.dumper = g_union_dumper;
+
+	if (gkd->offset > sc->sc_size) {
+		g_io_deliver(bp, ENODEV);
+		return;
+	}
+	/* Trim the request so that it ends no later than the union. */
+	if (gkd->offset + gkd->length > sc->sc_size)
+		gkd->length = sc->sc_size - gkd->offset;
+	gkd->di.mediasize = gkd->length;
+	g_io_deliver(bp, 0);
+}
+
+/*
+ * Dump routine installed by g_union_kerneldump().
+ *
+ * Ignores all of its arguments, writes nothing and always reports
+ * success.
+ */
+static int
+g_union_dumper(void *priv, void *virtual, vm_offset_t physical, off_t offset,
+    size_t length)
+{
+
+	/* Nothing is written; report success unconditionally. */
+	return (0);
+}
+
+/*
+ * List union statistics.
+ *
+ * Output is produced only for the geom itself (pp and cp both NULL)
+ * and only while it still has a softc.  Each counter is emitted as
+ * one XML element prefixed by "indent".  The counters are of type
+ * long, so they are cast to match the %ju conversions.
+ */
+static void
+g_union_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
+    struct g_consumer *cp, struct g_provider *pp)
+{
+	struct g_union_softc *sc;
+
+	if (pp != NULL || cp != NULL || gp->softc == NULL)
+		return;
+	sc = gp->softc;
+	sbuf_printf(sb, "%s<Reads>%ju</Reads>\n", indent,
+	    (uintmax_t)sc->sc_reads);
+	sbuf_printf(sb, "%s<Writes>%ju</Writes>\n", indent,
+	    (uintmax_t)sc->sc_writes);
+	sbuf_printf(sb, "%s<Deletes>%ju</Deletes>\n", indent,
+	    (uintmax_t)sc->sc_deletes);
+	sbuf_printf(sb, "%s<Getattrs>%ju</Getattrs>\n", indent,
+	    (uintmax_t)sc->sc_getattrs);
+	sbuf_printf(sb, "%s<Flushes>%ju</Flushes>\n", indent,
+	    (uintmax_t)sc->sc_flushes);
+	sbuf_printf(sb, "%s<Speedups>%ju</Speedups>\n", indent,
+	    (uintmax_t)sc->sc_speedups);
+	sbuf_printf(sb, "%s<Cmd0s>%ju</Cmd0s>\n", indent,
+	    (uintmax_t)sc->sc_cmd0s);
+	sbuf_printf(sb, "%s<Cmd1s>%ju</Cmd1s>\n", indent,
+	    (uintmax_t)sc->sc_cmd1s);
+	sbuf_printf(sb, "%s<Cmd2s>%ju</Cmd2s>\n", indent,
+	    (uintmax_t)sc->sc_cmd2s);
+	sbuf_printf(sb, "%s<ReadCurrentRead>%ju</ReadCurrentRead>\n", indent,
+	    (uintmax_t)sc->sc_readcurrentread);
+	sbuf_printf(sb, "%s<ReadBlockWrite>%ju</ReadBlockWrite>\n", indent,
+	    (uintmax_t)sc->sc_readblockwrite);
+	sbuf_printf(sb, "%s<WriteBlockRead>%ju</WriteBlockRead>\n", indent,
+	    (uintmax_t)sc->sc_writeblockread);
+	sbuf_printf(sb, "%s<WriteBlockWrite>%ju</WriteBlockWrite>\n", indent,
+	    (uintmax_t)sc->sc_writeblockwrite);
+	sbuf_printf(sb, "%s<ReadBytes>%ju</ReadBytes>\n", indent,
+	    (uintmax_t)sc->sc_readbytes);
+	sbuf_printf(sb, "%s<WroteBytes>%ju</WroteBytes>\n", indent,
+	    (uintmax_t)sc->sc_wrotebytes);
+	sbuf_printf(sb, "%s<Offset>%jd</Offset>\n", indent,
+	    (intmax_t)sc->sc_offset);
+}
+
+/*
+ * Orphan method: forcibly destroy the geom that owns the orphaned
+ * consumer.  The result of the destroy is not examined.
+ */
+static void
+g_union_orphan(struct g_consumer *cp)
+{
+	struct g_geom *gp;
+
+	g_topology_assert();
+	gp = cp->geom;
+	g_union_destroy(NULL, gp, true);
+}
+
+/*
+ * Destroy-geom method: attempt a non-forced destroy of "gp".
+ *
+ * "req" and "mp" are not used; no messages are posted to the request.
+ * Returns whatever g_union_destroy() returns.
+ */
+static int
+g_union_destroy_geom(struct gctl_req *req, struct g_class *mp,
+    struct g_geom *gp)
+{
+	int error;
+
+	error = g_union_destroy(NULL, gp, false);
+	return (error);
+}
+
+/*
+ * Clean up a union device.
+ *
+ * Returns ENXIO if the geom has no softc.  If the writelock flag is
+ * set or the provider is open, the geom is destroyed only when
+ * "force" is set; otherwise EBUSY is returned.  Progress is reported
+ * through "req" when it is not NULL and always through the debug log.
+ * On destruction the access held on both consumers is dropped and the
+ * geom is withered; 0 is returned.
+ *
+ * The first provider may be missing, so messages fall back to the
+ * geom name and open counts of zero instead of dereferencing a NULL
+ * provider.
+ */
+static int
+g_union_destroy(struct gctl_req *req, struct g_geom *gp, bool force)
+{
+	struct g_union_softc *sc;
+	struct g_provider *pp;
+	const char *name;
+	int error, acr, acw, ace;
+
+	g_topology_assert();
+	sc = gp->softc;
+	if (sc == NULL)
+		return (ENXIO);
+	pp = LIST_FIRST(&gp->provider);
+	if (pp != NULL) {
+		name = pp->name;
+		acr = pp->acr;
+		acw = pp->acw;
+		ace = pp->ace;
+	} else {
+		name = gp->name;
+		acr = acw = ace = 0;
+	}
+	if ((sc->sc_flags & DOING_COMMIT) != 0 ||
+	    acr != 0 || acw != 0 || ace != 0) {
+		if (force) {
+			if (req != NULL)
+				gctl_msg(req, "Device %s is still in use, "
+				    "so is being forcibly removed.", name);
+			G_UNION_DEBUG(1, "Device %s is still in use, so "
+			    "is being forcibly removed.", name);
+		} else {
+			if (req != NULL)
+				gctl_msg(req, "Device %s is still open "
+				    "(r=%d w=%d e=%d).", name, acr,
+				    acw, ace);
+			G_UNION_DEBUG(1, "Device %s is still open "
+			    "(r=%d w=%d e=%d).", name, acr,
+			    acw, ace);
+			return (EBUSY);
+		}
+	} else {
+		if (req != NULL)
+			gctl_msg(req, "Device %s removed.", name);
+		G_UNION_DEBUG(1, "Device %s removed.", name);
+	}
+	/* Close consumers */
+	if ((error = g_access(sc->sc_lowercp, -1, 0, -1)) != 0)
+		G_UNION_DEBUG(2, "Error %d: device %s could not reset access "
+		    "to %s.", error, name, sc->sc_lowercp->provider->name);
+	if ((error = g_access(sc->sc_uppercp, -1, -1, -1)) != 0)
+		G_UNION_DEBUG(2, "Error %d: device %s could not reset access "
+		    "to %s.", error, name, sc->sc_uppercp->provider->name);
+
+	g_wither_geom(gp, ENXIO);
+
+	return (0);
+}
+
+/*
+ * Provider-gone method: release the softc and everything it owns.
+ *
+ * The geom's softc pointer is cleared first; then every write-map
+ * leaf, the root array, the leaf-used bitmap, the map lock and the
+ * softc itself are freed.
+ */
+static void
+g_union_providergone(struct g_provider *pp)
+{
+	struct g_union_softc *sc;
+	struct g_geom *gp;
+	size_t leafno;
+
+	gp = pp->geom;
+	sc = gp->softc;
+	gp->softc = NULL;
+
+	leafno = 0;
+	while (leafno < sc->sc_root_size) {
+		g_free(sc->sc_writemap_root[leafno]);
+		leafno++;
+	}
+	g_free(sc->sc_writemap_root);
+	g_free(sc->sc_leafused);
+	rw_destroy(&sc->sc_rwlock);
+	g_free(sc);
+}
+
+/*
+ * Resize method for the underlying providers.
+ *
+ * A provider that still offers more than sc_size bytes beyond
+ * sc_offset is left alone and the union keeps using the space it
+ * already had.  In every other case the union is forcibly destroyed.
+ *
+ * NOTE(review): a provider offering exactly sc_size bytes beyond
+ * sc_offset also leads to destruction -- confirm that is intended.
+ */
+static void
+g_union_resize(struct g_consumer *cp)
+{
+	struct g_union_softc *sc;
+	struct g_geom *gp;
+
+	g_topology_assert();
+
+	gp = cp->geom;
+	sc = gp->softc;
+
+	if (sc->sc_size >= cp->provider->mediasize - sc->sc_offset)
+		g_union_destroy(NULL, gp, true);
+}
+
+DECLARE_GEOM_CLASS(g_union_class, g_union); /* presumably registers g_union_class with GEOM -- see DECLARE_GEOM_CLASS() */
+MODULE_VERSION(geom_union, 0); /* module geom_union, version 0 */
diff --git a/sys/geom/union/g_union.h b/sys/geom/union/g_union.h
new file mode 100644
index 000000000000..56d954ea5d22
--- /dev/null
+++ b/sys/geom/union/g_union.h
@@ -0,0 +1,144 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Marshall Kirk McKusick <mckusick@mckusick.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _G_UNION_H_
+#define _G_UNION_H_
+
+#define G_UNION_CLASS_NAME "UNION" /* name of this GEOM class */
+#define G_UNION_VERSION 1
+#define G_UNION_SUFFIX ".union" /* presumably appended to union device names -- confirm in g_union.c */
+/*
+ * Special flag to instruct gunion to passthrough the underlying provider's
+ * physical path
+ *
+ * NOTE(review): no use of this flag is visible in this part of the
+ * patch -- confirm that it is still needed.
+ */
+#define G_UNION_PHYSPATH_PASSTHROUGH "\255"
+
+#ifdef _KERNEL
+#define G_UNION_DEBUG(lvl, ...) \
+ _GEOM_DEBUG("GEOM_UNION", g_union_debug, (lvl), NULL, __VA_ARGS__) /* message at level lvl, no bio */
+#define G_UNION_LOGREQLVL(lvl, bp, ...) \
+ _GEOM_DEBUG("GEOM_UNION", g_union_debug, (lvl), (bp), __VA_ARGS__) /* message at level lvl about bio bp */
+#define G_UNION_LOGREQ(bp, ...) G_UNION_LOGREQLVL(3, (bp), __VA_ARGS__) /* request log at level 3 */
+
+TAILQ_HEAD(wiplist, g_union_wip); /* list head type for work-in-progress records */
+
+/*
+ * State maintained by each instance of a UNION GEOM.
+ *
+ * The write map is a two-level bitmap with one bit per sector:
+ * sc_writemap_root[] points at sc_root_size leaves and each leaf
+ * covers sc_bits_per_leaf sectors.  A set bit means the sector has
+ * been written to the upper level.
+ */
+struct g_union_softc {
+ struct rwlock sc_rwlock; /* writemap lock */
+ uint64_t **sc_writemap_root; /* root of write map */
+ uint64_t *sc_leafused; /* 1 => leaf has written sectors */
+ uint64_t sc_map_size; /* sectors covered by write map */
+ long sc_root_size; /* entries in root node */
+ long sc_leaf_size; /* 64-bit words in leaf node */
+ long sc_bits_per_leaf; /* bits (sectors) per leaf node */
+ long sc_writemap_memory; /* memory used by writemap */
+ off_t sc_offset; /* starting offset in lower */
+ off_t sc_size; /* size of union geom */
+ off_t sc_sectorsize; /* sector size of geom */
+ struct g_consumer *sc_uppercp; /* upper-level consumer */
+ struct g_consumer *sc_lowercp; /* lower-level consumer */
+ struct wiplist sc_wiplist; /* I/O work-in-progress list */
+ long sc_flags; /* see flags below */
+ long sc_reads; /* number of reads done */
+ long sc_wrotebytes; /* number of bytes written */
+ long sc_writes; /* number of writes done */
+ long sc_readbytes; /* number of bytes read */
+ long sc_deletes; /* number of deletes done */
+ long sc_getattrs; /* number of getattrs done */
+ long sc_flushes; /* number of flushes done */
+ long sc_cmd0s; /* number of cmd0's done */
+ long sc_cmd1s; /* number of cmd1's done */
+ long sc_cmd2s; /* number of cmd2's done */
+ long sc_speedups; /* number of speedups done */
+ long sc_readcurrentread; /* reads concurrent with read */
+ long sc_readblockwrite; /* writes blocked by read */
+ long sc_writeblockread; /* reads blocked by write */
+ long sc_writeblockwrite; /* writes blocked by write */
+};
+
+/*
+ * Structure to track work-in-progress I/O operations.
+ *
+ * Used to prevent overlapping I/O operations from running concurrently.
+ * Created for each I/O operation.
+ *
+ * In usual case of no overlap it is linked to sc_wiplist and started.
+ * If found to overlap an I/O on sc_wiplist, it is not started and is
+ * linked to wip_waiting list of the I/O that it overlaps. When an I/O
+ * completes, it restarts all the I/O operations on its wip_waiting list.
+ *
+ * wip_start and wip_end are byte offsets that include sc_offset, and
+ * wip_end is the offset of the last byte of the request.
+ */
+struct g_union_wip {
+ struct wiplist wip_waiting; /* list of I/Os waiting on me */
+ TAILQ_ENTRY(g_union_wip) wip_next; /* pending or active I/O list */
+ struct bio *wip_bp; /* bio for this I/O */
+ struct g_union_softc *wip_sc; /* g_union's softc */
+ off_t wip_start; /* starting offset of I/O */
+ off_t wip_end; /* ending offset of I/O (inclusive) */
+ long wip_numios; /* bios in progress; starts at 1 */
+ long wip_error; /* merged I/O errors */
+};
+
+/*
+ * UNION flags
+ *
+ * DOING_COMMIT is the mask and DOING_COMMIT_BITNUM the bit number of
+ * the same flag; the bit number is what the writelock helpers pass to
+ * the atomic test-and-set/clear operations.
+ */
+#define DOING_COMMIT 0x00000001 /* a commit command is in progress */
+
+#define DOING_COMMIT_BITNUM 0 /* bit number of DOING_COMMIT */
+
+#define BITS_PER_ENTRY (sizeof(uint64_t) * NBBY) /* bits in one write map word */
+#define G_RLOCK(sc) rw_rlock(&(sc)->sc_rwlock) /* read-lock the write map */
+#define G_RUNLOCK(sc) rw_runlock(&(sc)->sc_rwlock)
+#define G_WLOCK(sc) rw_wlock(&(sc)->sc_rwlock) /* write-lock the write map */
+#define G_WUNLOCK(sc) rw_wunlock(&(sc)->sc_rwlock)
+#define G_WLOCKOWNED(sc) rw_assert(&(sc)->sc_rwlock, RA_WLOCKED) /* assert write lock is held */
+
+/*
+ * Try to take the union's writelock, which is the DOING_COMMIT bit of
+ * sc_flags.  The attempt never blocks.
+ *
+ * Returns 0 when the bit was clear and has now been set by this call,
+ * and non-zero when it was already set.
+ */
+static inline int
+g_union_get_writelock(struct g_union_softc *sc)
+{
+	int wasset;
+
+	wasset = atomic_testandset_long(&sc->sc_flags, DOING_COMMIT_BITNUM);
+	return (wasset);
+}
+
+/*
+ * Release the union's writelock by clearing the DOING_COMMIT bit of
+ * sc_flags.  Asserts that the bit was set beforehand.
+ */
+static inline void
+g_union_rel_writelock(struct g_union_softc *sc)
+{
+	long washeld __diagused;
+
+	washeld = atomic_testandclear_long(&sc->sc_flags,
+	    DOING_COMMIT_BITNUM);
+	KASSERT(washeld != 0, ("UNION GEOM releasing unheld lock"));
+}
+
+#endif /* _KERNEL */
+
+#endif /* _G_UNION_H_ */
diff --git a/sys/modules/geom/Makefile b/sys/modules/geom/Makefile
index 51f3d2438eb1..f2d5c931f168 100644
--- a/sys/modules/geom/Makefile
+++ b/sys/modules/geom/Makefile
@@ -21,6 +21,7 @@ SUBDIR= geom_bde \
geom_raid3 \
geom_shsec \
geom_stripe \
+ geom_union \
geom_uzip \
geom_vinum \
geom_virstor \
diff --git a/sys/modules/geom/geom_union/Makefile b/sys/modules/geom/geom_union/Makefile
new file mode 100644
index 000000000000..9b179c41a6d5
--- /dev/null
+++ b/sys/modules/geom/geom_union/Makefile
@@ -0,0 +1,8 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/geom/union
+
+KMOD= geom_union
+SRCS= g_union.c
+
+.include <bsd.kmod.mk>