aboutsummaryrefslogtreecommitdiff
path: root/usr.sbin
diff options
context:
space:
mode:
authorJohn Baldwin <jhb@FreeBSD.org>2020-05-05 00:02:04 +0000
committerJohn Baldwin <jhb@FreeBSD.org>2020-05-05 00:02:04 +0000
commit483d953a86a2507355f8287c5107dc827a0ff516 (patch)
treeb74b1559959b80cd48bffb9763b67959fd4c182b /usr.sbin
parent51a5392297a7a014b1cf367922359cd54fb7a393 (diff)
downloadsrc-483d953a86a2507355f8287c5107dc827a0ff516.tar.gz
src-483d953a86a2507355f8287c5107dc827a0ff516.zip
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot to be taken of a guest's state that can later be resumed. In the current implementation, bhyve(8) creates a UNIX domain socket that is used by bhyvectl(8) to send a request to save a snapshot (and optionally exit after the snapshot has been taken). A snapshot currently consists of two files: the first holds a copy of guest RAM, and the second file holds other guest state such as vCPU register values and device model state. To resume a guest, bhyve(8) must be started with a matching pair of command line arguments to instantiate the same set of device models as well as a pointer to the saved snapshot. While the current implementation is useful for several uses cases, it has a few limitations. The file format for saving the guest state is tied to the ABI of internal bhyve structures and is not self-describing (in that it does not communicate the set of device models present in the system). In addition, the state saved for some device models closely matches the internal data structures which might prove a challenge for compatibility of snapshot files across a range of bhyve versions. The file format also does not currently support versioning of individual chunks of state. As a result, the current file format is not a fixed binary format and future revisions to save and restore will break binary compatiblity of snapshot files. The goal is to move to a more flexible format that adds versioning, etc. and at that point to commit to providing a reasonable level of compatibility. As a result, the current implementation is not enabled by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option for userland builds, and the kernel option BHYVE_SHAPSHOT. Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz Relnotes: yes Sponsored by: University Politehnica of Bucharest Sponsored by: Matthew Grooms (student scholarships) Sponsored by: iXsystems Differential Revision: https://reviews.freebsd.org/D19495
Notes
Notes: svn path=/head/; revision=360648
Diffstat (limited to 'usr.sbin')
-rw-r--r--usr.sbin/bhyve/Makefile15
-rw-r--r--usr.sbin/bhyve/Makefile.depend2
-rw-r--r--usr.sbin/bhyve/atkbdc.c46
-rw-r--r--usr.sbin/bhyve/atkbdc.h5
-rw-r--r--usr.sbin/bhyve/bhyve.818
-rw-r--r--usr.sbin/bhyve/bhyverun.c196
-rw-r--r--usr.sbin/bhyve/bhyverun.h5
-rw-r--r--usr.sbin/bhyve/block_if.c143
-rw-r--r--usr.sbin/bhyve/block_if.h11
-rw-r--r--usr.sbin/bhyve/mevent.c2
-rw-r--r--usr.sbin/bhyve/pci_ahci.c302
-rw-r--r--usr.sbin/bhyve/pci_e82545.c161
-rw-r--r--usr.sbin/bhyve/pci_emul.c208
-rw-r--r--usr.sbin/bhyve/pci_emul.h11
-rw-r--r--usr.sbin/bhyve/pci_fbuf.c19
-rw-r--r--usr.sbin/bhyve/pci_lpc.c26
-rw-r--r--usr.sbin/bhyve/pci_virtio_block.c51
-rw-r--r--usr.sbin/bhyve/pci_virtio_net.c83
-rw-r--r--usr.sbin/bhyve/pci_xhci.c264
-rw-r--r--usr.sbin/bhyve/ps2kbd.c17
-rw-r--r--usr.sbin/bhyve/ps2kbd.h5
-rw-r--r--usr.sbin/bhyve/ps2mouse.c24
-rw-r--r--usr.sbin/bhyve/ps2mouse.h5
-rw-r--r--usr.sbin/bhyve/snapshot.c1742
-rw-r--r--usr.sbin/bhyve/snapshot.h105
-rw-r--r--usr.sbin/bhyve/uart_emul.c34
-rw-r--r--usr.sbin/bhyve/uart_emul.h5
-rw-r--r--usr.sbin/bhyve/usb_emul.h4
-rw-r--r--usr.sbin/bhyve/usb_mouse.c30
-rw-r--r--usr.sbin/bhyve/virtio.c148
-rw-r--r--usr.sbin/bhyve/virtio.h10
-rw-r--r--usr.sbin/bhyvectl/Makefile6
-rw-r--r--usr.sbin/bhyvectl/bhyvectl.819
-rw-r--r--usr.sbin/bhyvectl/bhyvectl.c119
34 files changed, 3799 insertions, 42 deletions
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
index 12bd477825bf..9a4460a1b90f 100644
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -72,10 +72,17 @@ SRCS= \
spinup_ap.c \
iov.c
+.if ${MK_BHYVE_SNAPSHOT} != "no"
+SRCS+= snapshot.c
+.endif
+
.PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm
SRCS+= vmm_instruction_emul.c
LIBADD= vmmapi md pthread z util sbuf cam
+.if ${MK_BHYVE_SNAPSHOT} != "no"
+LIBADD+= ucl xo
+.endif
.if ${MK_INET_SUPPORT} != "no"
CFLAGS+=-DINET
@@ -92,6 +99,14 @@ LIBADD+= crypto
CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000
CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii
CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller
+.if ${MK_BHYVE_SNAPSHOT} != "no"
+CFLAGS+= -I${SRCTOP}/contrib/libucl/include
+
+# Temporary disable capsicum, until we integrate checkpoint code with it.
+CFLAGS+= -DWITHOUT_CAPSICUM
+
+CFLAGS+= -DBHYVE_SNAPSHOT
+.endif
.ifdef GDB_LOG
CFLAGS+=-DGDB_LOG
diff --git a/usr.sbin/bhyve/Makefile.depend b/usr.sbin/bhyve/Makefile.depend
index 8d3ff079d277..8222ceb6ad25 100644
--- a/usr.sbin/bhyve/Makefile.depend
+++ b/usr.sbin/bhyve/Makefile.depend
@@ -13,8 +13,10 @@ DIRDEPS = \
lib/libcompiler_rt \
lib/libsbuf \
lib/libthr \
+ lib/libucl \
lib/libutil \
lib/libvmmapi \
+ lib/libxo \
lib/libz \
diff --git a/usr.sbin/bhyve/atkbdc.c b/usr.sbin/bhyve/atkbdc.c
index 1c1838c2e80c..a08f58f84b22 100644
--- a/usr.sbin/bhyve/atkbdc.c
+++ b/usr.sbin/bhyve/atkbdc.c
@@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include <vmmapi.h>
@@ -137,6 +138,10 @@ struct atkbdc_softc {
struct aux_dev aux;
};
+#ifdef BHYVE_SNAPSHOT
+static struct atkbdc_softc *atkbdc_sc = NULL;
+#endif
+
static void
atkbdc_assert_kbd_intr(struct atkbdc_softc *sc)
{
@@ -548,7 +553,48 @@ atkbdc_init(struct vmctx *ctx)
sc->ps2kbd_sc = ps2kbd_init(sc);
sc->ps2mouse_sc = ps2mouse_init(sc);
+
+#ifdef BHYVE_SNAPSHOT
+ assert(atkbdc_sc == NULL);
+ atkbdc_sc = sc;
+#endif
+}
+
+#ifdef BHYVE_SNAPSHOT
+int
+atkbdc_snapshot(struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->status, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->outport, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->ram,
+ sizeof(atkbdc_sc->ram), meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->curcmd, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->ctrlbyte, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq_active, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->kbd.buffer,
+ sizeof(atkbdc_sc->kbd.buffer), meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.brd, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bwr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bcnt, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq_active, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq, meta, ret, done);
+
+ ret = ps2kbd_snapshot(atkbdc_sc->ps2kbd_sc, meta);
+ if (ret != 0)
+ goto done;
+
+ ret = ps2mouse_snapshot(atkbdc_sc->ps2mouse_sc, meta);
+
+done:
+ return (ret);
}
+#endif
static void
atkbdc_dsdt(void)
diff --git a/usr.sbin/bhyve/atkbdc.h b/usr.sbin/bhyve/atkbdc.h
index 85c8a7141eb2..14c00ed9ae88 100644
--- a/usr.sbin/bhyve/atkbdc.h
+++ b/usr.sbin/bhyve/atkbdc.h
@@ -30,9 +30,14 @@
#define _ATKBDC_H_
struct atkbdc_softc;
+struct vm_snapshot_meta;
struct vmctx;
void atkbdc_init(struct vmctx *ctx);
void atkbdc_event(struct atkbdc_softc *sc, int iskbd);
+#ifdef BHYVE_SNAPSHOT
+int atkbdc_snapshot(struct vm_snapshot_meta *meta);
+#endif
+
#endif /* _ATKBDC_H_ */
diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
index 5d329a54491e..85e5f0256fbf 100644
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd April 22, 2020
+.Dd May 04, 2020
.Dt BHYVE 8
.Os
.Sh NAME
@@ -61,6 +61,7 @@
.Sm on
.Oc
.Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu
+.Op Fl r Ar file
.Oo Fl s
.Sm off
.Cm help | Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf
@@ -191,6 +192,21 @@ to
.Em hostcpu .
.It Fl P
Force the guest virtual CPU to exit when a PAUSE instruction is detected.
+.It Fl r Ar file
+Resume a guest from a snapshot.
+The guest memory contents are restored from
+.Ar file ,
+and the guest device and vCPU state are restored from the file
+.Dq Ar file Ns .kern .
+.Pp
+Note that the current snapshot file format requires that the configuration of
+devices in the new VM match the VM from which the snapshot was taken by specifying the
+same
+.Op Fl s
+and
+.Op Fl l
+options.
+The count of vCPUs and memory configuration are read from the snapshot.
.It Fl s Op Ar help|slot,emulation Ns Op , Ns Ar conf
Configure a virtual PCI slot and function.
.Pp
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index 324e40d2cda2..8d73bd38cae4 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -36,7 +36,14 @@ __FBSDID("$FreeBSD$");
#include <sys/capsicum.h>
#endif
#include <sys/mman.h>
+#ifdef BHYVE_SNAPSHOT
+#include <sys/socket.h>
+#include <sys/stat.h>
+#endif
#include <sys/time.h>
+#ifdef BHYVE_SNAPSHOT
+#include <sys/un.h>
+#endif
#include <amd64/vmm/intel/vmcs.h>
@@ -51,6 +58,9 @@ __FBSDID("$FreeBSD$");
#include <string.h>
#include <err.h>
#include <errno.h>
+#ifdef BHYVE_SNAPSHOT
+#include <fcntl.h>
+#endif
#include <libgen.h>
#include <unistd.h>
#include <assert.h>
@@ -59,6 +69,12 @@ __FBSDID("$FreeBSD$");
#include <sysexits.h>
#include <stdbool.h>
#include <stdint.h>
+#ifdef BHYVE_SNAPSHOT
+#include <ucl.h>
+#include <unistd.h>
+
+#include <libxo/xo.h>
+#endif
#include <machine/vmm.h>
#ifndef WITHOUT_CAPSICUM
@@ -83,6 +99,9 @@ __FBSDID("$FreeBSD$");
#include "pci_irq.h"
#include "pci_lpc.h"
#include "smbiostbl.h"
+#ifdef BHYVE_SNAPSHOT
+#include "snapshot.h"
+#endif
#include "xmsr.h"
#include "spinup_ap.h"
#include "rtc.h"
@@ -163,7 +182,7 @@ static const char * const vmx_exit_reason_desc[] = {
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
-char *vmname;
+const char *vmname;
int guest_ncpus;
uint16_t cores, maxcpus, sockets, threads;
@@ -229,6 +248,9 @@ usage(int code)
" -H: vmexit from the guest on hlt\n"
" -l: LPC device configuration\n"
" -m: memory size in MB\n"
+#ifdef BHYVE_SNAPSHOT
+ " -r: path to checkpoint file\n"
+#endif
" -p: pin 'vcpu' to 'hostcpu'\n"
" -P: vmexit from the guest on pause\n"
" -s: <slot,driver,configinfo> PCI slot config\n"
@@ -388,6 +410,14 @@ paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
return (vm_map_gpa(ctx, gaddr, len));
}
+#ifdef BHYVE_SNAPSHOT
+uintptr_t
+paddr_host2guest(struct vmctx *ctx, void *addr)
+{
+ return (vm_rev_map_gpa(ctx, addr));
+}
+#endif
+
int
fbsdrun_vmexit_on_pause(void)
{
@@ -422,6 +452,9 @@ fbsdrun_start_thread(void *param)
snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
pthread_set_name_np(mtp->mt_thr, tname);
+#ifdef BHYVE_SNAPSHOT
+ checkpoint_cpu_add(vcpu);
+#endif
if (gdb_port != 0)
gdb_cpu_add(vcpu);
@@ -697,11 +730,15 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
stats.vmexit_mtrap++;
- if (gdb_port == 0) {
- fprintf(stderr, "vm_loop: unexpected VMEXIT_MTRAP\n");
- exit(4);
- }
- gdb_cpu_mtrap(*pvcpu);
+#ifdef BHYVE_SNAPSHOT
+ checkpoint_cpu_suspend(*pvcpu);
+#endif
+ if (gdb_port != 0)
+ gdb_cpu_mtrap(*pvcpu);
+#ifdef BHYVE_SNAPSHOT
+ checkpoint_cpu_resume(*pvcpu);
+#endif
+
return (VMEXIT_CONTINUE);
}
@@ -778,11 +815,14 @@ static int
vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
- if (gdb_port == 0) {
- fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n");
- exit(4);
- }
- gdb_cpu_suspend(*pvcpu);
+#ifdef BHYVE_SNAPSHOT
+ checkpoint_cpu_suspend(*pvcpu);
+#endif
+ if (gdb_port != 0)
+ gdb_cpu_suspend(*pvcpu);
+#ifdef BHYVE_SNAPSHOT
+ checkpoint_cpu_resume(*pvcpu);
+#endif
return (VMEXIT_CONTINUE);
}
@@ -997,6 +1037,22 @@ do_open(const char *vmname)
return (ctx);
}
+void
+spinup_vcpu(struct vmctx *ctx, int vcpu)
+{
+ int error;
+ uint64_t rip;
+
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+ assert(error == 0);
+
+ fbsdrun_set_capabilities(ctx, vcpu);
+ error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
+ assert(error == 0);
+
+ fbsdrun_addcpu(ctx, BSP, vcpu, rip);
+}
+
int
main(int argc, char *argv[])
{
@@ -1008,6 +1064,13 @@ main(int argc, char *argv[])
uint64_t rip;
size_t memsize;
char *optstr;
+#ifdef BHYVE_SNAPSHOT
+ char *restore_file;
+ struct restore_state rstate;
+ int vcpu;
+
+ restore_file = NULL;
+#endif
bvmcons = 0;
progname = basename(argv[0]);
@@ -1021,7 +1084,11 @@ main(int argc, char *argv[])
rtc_localtime = 1;
memflags = 0;
+#ifdef BHYVE_SNAPSHOT
+ optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:r:";
+#else
optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:";
+#endif
while ((c = getopt(argc, argv, optstr)) != -1) {
switch (c) {
case 'a':
@@ -1067,6 +1134,11 @@ main(int argc, char *argv[])
"configuration '%s'", optarg);
}
break;
+#ifdef BHYVE_SNAPSHOT
+ case 'r':
+ restore_file = optarg;
+ break;
+#endif
case 's':
if (strncmp(optarg, "help", strlen(optarg)) == 0) {
pci_print_supported_devices();
@@ -1128,12 +1200,50 @@ main(int argc, char *argv[])
argc -= optind;
argv += optind;
+#ifdef BHYVE_SNAPSHOT
+ if (argc > 1 || (argc == 0 && restore_file == NULL))
+ usage(1);
+
+ if (restore_file != NULL) {
+ error = load_restore_file(restore_file, &rstate);
+ if (error) {
+ fprintf(stderr, "Failed to read checkpoint info from "
+ "file: '%s'.\n", restore_file);
+ exit(1);
+ }
+ }
+
+ if (argc == 1) {
+ vmname = argv[0];
+ } else {
+ vmname = lookup_vmname(&rstate);
+ if (vmname == NULL) {
+ fprintf(stderr, "Cannot find VM name in restore file. "
+ "Please specify one.\n");
+ exit(1);
+ }
+ }
+#else
if (argc != 1)
usage(1);
vmname = argv[0];
+#endif
ctx = do_open(vmname);
+#ifdef BHYVE_SNAPSHOT
+ if (restore_file != NULL) {
+ guest_ncpus = lookup_guest_ncpus(&rstate);
+ memflags = lookup_memflags(&rstate);
+ memsize = lookup_memsize(&rstate);
+ }
+
+ if (guest_ncpus < 1) {
+ fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
+ exit(1);
+ }
+#endif
+
max_vcpus = num_vcpus_allowed(ctx);
if (guest_ncpus > max_vcpus) {
fprintf(stderr, "%d vCPUs requested but only %d available\n",
@@ -1200,6 +1310,40 @@ main(int argc, char *argv[])
assert(error == 0);
}
+#ifdef BHYVE_SNAPSHOT
+ if (restore_file != NULL) {
+ fprintf(stdout, "Pausing pci devs...\r\n");
+ if (vm_pause_user_devs(ctx) != 0) {
+ fprintf(stderr, "Failed to pause PCI device state.\n");
+ exit(1);
+ }
+
+ fprintf(stdout, "Restoring vm mem...\r\n");
+ if (restore_vm_mem(ctx, &rstate) != 0) {
+ fprintf(stderr, "Failed to restore VM memory.\n");
+ exit(1);
+ }
+
+ fprintf(stdout, "Restoring pci devs...\r\n");
+ if (vm_restore_user_devs(ctx, &rstate) != 0) {
+ fprintf(stderr, "Failed to restore PCI device state.\n");
+ exit(1);
+ }
+
+ fprintf(stdout, "Restoring kernel structs...\r\n");
+ if (vm_restore_kern_structs(ctx, &rstate) != 0) {
+ fprintf(stderr, "Failed to restore kernel structs.\n");
+ exit(1);
+ }
+
+ fprintf(stdout, "Resuming pci devs...\r\n");
+ if (vm_resume_user_devs(ctx) != 0) {
+ fprintf(stderr, "Failed to resume PCI device state.\n");
+ exit(1);
+ }
+ }
+#endif
+
error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
@@ -1240,11 +1384,41 @@ main(int argc, char *argv[])
errx(EX_OSERR, "cap_enter() failed");
#endif
+#ifdef BHYVE_SNAPSHOT
+ if (restore_file != NULL)
+ destroy_restore_state(&rstate);
+
+ /*
+ * checkpointing thread for communication with bhyvectl
+ */
+ if (init_checkpoint_thread(ctx) < 0)
+ printf("Failed to start checkpoint thread!\r\n");
+
+ if (restore_file != NULL)
+ vm_restore_time(ctx);
+#endif
+
/*
* Add CPU 0
*/
fbsdrun_addcpu(ctx, BSP, BSP, rip);
+#ifdef BHYVE_SNAPSHOT
+ /*
+ * If we restore a VM, start all vCPUs now (including APs), otherwise,
+ * let the guest OS to spin them up later via vmexits.
+ */
+ if (restore_file != NULL) {
+ for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
+ if (vcpu == BSP)
+ continue;
+
+ fprintf(stdout, "spinning up vcpu no %d...\r\n", vcpu);
+ spinup_vcpu(ctx, vcpu);
+ }
+ }
+#endif
+
/*
* Head off to the main event dispatch loop
*/
diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h
index 0b23a6a5c3ae..0177baca14e9 100644
--- a/usr.sbin/bhyve/bhyverun.h
+++ b/usr.sbin/bhyve/bhyverun.h
@@ -38,9 +38,12 @@ struct vmctx;
extern int guest_ncpus;
extern uint16_t cores, sockets, threads;
extern char *guest_uuid_str;
-extern char *vmname;
+extern const char *vmname;
void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len);
+#ifdef BHYVE_SNAPSHOT
+uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr);
+#endif
void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu);
void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip);
diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c
index 50e1eed12f90..4c91038ca765 100644
--- a/usr.sbin/bhyve/block_if.c
+++ b/usr.sbin/bhyve/block_if.c
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include <unistd.h>
#include <machine/atomic.h>
+#include <machine/vmm_snapshot.h>
#include "bhyverun.h"
#include "debug.h"
@@ -105,9 +106,13 @@ struct blockif_ctxt {
int bc_psectsz;
int bc_psectoff;
int bc_closing;
+ int bc_paused;
+ int bc_work_count;
pthread_t bc_btid[BLOCKIF_NUMTHR];
pthread_mutex_t bc_mtx;
pthread_cond_t bc_cond;
+ pthread_cond_t bc_paused_cond;
+ pthread_cond_t bc_work_done_cond;
/* Request elements and free/pending/busy queues */
TAILQ_HEAD(, blockif_elem) bc_freeq;
@@ -210,6 +215,18 @@ blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}
+static int
+blockif_flush_bc(struct blockif_ctxt *bc)
+{
+ if (bc->bc_ischr) {
+ if (ioctl(bc->bc_fd, DIOCGFLUSH))
+ return (errno);
+ } else if (fsync(bc->bc_fd))
+ return (errno);
+
+ return (0);
+}
+
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
@@ -300,11 +317,7 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
}
break;
case BOP_FLUSH:
- if (bc->bc_ischr) {
- if (ioctl(bc->bc_fd, DIOCGFLUSH))
- err = errno;
- } else if (fsync(bc->bc_fd))
- err = errno;
+ err = blockif_flush_bc(bc);
break;
case BOP_DELETE:
if (!bc->bc_candelete)
@@ -348,15 +361,30 @@ blockif_thr(void *arg)
pthread_mutex_lock(&bc->bc_mtx);
for (;;) {
- while (blockif_dequeue(bc, t, &be)) {
+ bc->bc_work_count++;
+
+ /* We cannot process work if the interface is paused */
+ while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) {
pthread_mutex_unlock(&bc->bc_mtx);
blockif_proc(bc, be, buf);
pthread_mutex_lock(&bc->bc_mtx);
blockif_complete(bc, be);
}
+
+ bc->bc_work_count--;
+
+ /* If none of the workers are busy, notify the main thread */
+ if (bc->bc_work_count == 0)
+ pthread_cond_broadcast(&bc->bc_work_done_cond);
+
/* Check ctxt status here to see if exit requested */
if (bc->bc_closing)
break;
+
+ /* Make all worker threads wait here if the device is paused */
+ while (bc->bc_paused)
+ pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx);
+
pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
}
pthread_mutex_unlock(&bc->bc_mtx);
@@ -565,6 +593,10 @@ blockif_open(const char *optstr, const char *ident)
bc->bc_psectoff = psectoff;
pthread_mutex_init(&bc->bc_mtx, NULL);
pthread_cond_init(&bc->bc_cond, NULL);
+ bc->bc_paused = 0;
+ bc->bc_work_count = 0;
+ pthread_cond_init(&bc->bc_paused_cond, NULL);
+ pthread_cond_init(&bc->bc_work_done_cond, NULL);
TAILQ_INIT(&bc->bc_freeq);
TAILQ_INIT(&bc->bc_pendq);
TAILQ_INIT(&bc->bc_busyq);
@@ -657,6 +689,8 @@ blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
assert(bc->bc_magic == BLOCKIF_SIG);
pthread_mutex_lock(&bc->bc_mtx);
+ /* XXX: not waiting while paused */
+
/*
* Check pending requests.
*/
@@ -855,3 +889,100 @@ blockif_candelete(struct blockif_ctxt *bc)
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_candelete);
}
+
+#ifdef BHYVE_SNAPSHOT
+void
+blockif_pause(struct blockif_ctxt *bc)
+{
+ assert(bc != NULL);
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ bc->bc_paused = 1;
+
+ /* The interface is paused. Wait for workers to finish their work */
+ while (bc->bc_work_count)
+ pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ if (blockif_flush_bc(bc))
+ fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
+ __func__);
+}
+
+void
+blockif_resume(struct blockif_ctxt *bc)
+{
+ assert(bc != NULL);
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ bc->bc_paused = 0;
+ /* resume the threads waiting for paused */
+ pthread_cond_broadcast(&bc->bc_paused_cond);
+ /* kick the threads after restore */
+ pthread_cond_broadcast(&bc->bc_cond);
+ pthread_mutex_unlock(&bc->bc_mtx);
+}
+
+int
+blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta)
+{
+ int i;
+ struct iovec *iov;
+ int ret;
+
+ SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done);
+
+ /*
+ * XXX: The callback and parameter must be filled by the virtualized
+ * device that uses the interface, during its init; we're not touching
+ * them here.
+ */
+
+ /* Snapshot the iovecs. */
+ for (i = 0; i < br->br_iovcnt; i++) {
+ iov = &br->br_iov[i];
+
+ SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done);
+
+ /* We assume the iov is a guest-mapped address. */
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len,
+ false, meta, ret, done);
+ }
+
+done:
+ return (ret);
+}
+
+int
+blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ if (bc->bc_paused == 0) {
+ fprintf(stderr, "%s: Snapshot failed: "
+ "interface not paused.\r\n", __func__);
+ return (ENXIO);
+ }
+
+ pthread_mutex_lock(&bc->bc_mtx);
+
+ SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done);
+
+done:
+ pthread_mutex_unlock(&bc->bc_mtx);
+ return (ret);
+}
+#endif
diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h
index 75c016447ac2..f3b5b6938ef1 100644
--- a/usr.sbin/bhyve/block_if.h
+++ b/usr.sbin/bhyve/block_if.h
@@ -41,6 +41,9 @@
#include <sys/uio.h>
#include <sys/unistd.h>
+struct vm_snapshot_meta;
+
+
/*
* BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in
* a single request. BLOCKIF_RING_MAX is the maxmimum number of
@@ -74,5 +77,13 @@ int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_close(struct blockif_ctxt *bc);
+#ifdef BHYVE_SNAPSHOT
+void blockif_pause(struct blockif_ctxt *bc);
+void blockif_resume(struct blockif_ctxt *bc);
+int blockif_snapshot_req(struct blockif_req *br,
+ struct vm_snapshot_meta *meta);
+int blockif_snapshot(struct blockif_ctxt *bc,
+ struct vm_snapshot_meta *meta);
+#endif
#endif /* _BLOCK_IF_H_ */
diff --git a/usr.sbin/bhyve/mevent.c b/usr.sbin/bhyve/mevent.c
index c0c69d37f311..649a6b09cb34 100644
--- a/usr.sbin/bhyve/mevent.c
+++ b/usr.sbin/bhyve/mevent.c
@@ -63,7 +63,7 @@ __FBSDID("$FreeBSD$");
#define MEVENT_MAX 64
-extern char *vmname;
+extern const char *vmname;
static pthread_t mevent_tid;
static int mevent_timid = 43;
diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c
index 23aee0fdac6b..49e6452a355d 100644
--- a/usr.sbin/bhyve/pci_ahci.c
+++ b/usr.sbin/bhyve/pci_ahci.c
@@ -41,6 +41,8 @@ __FBSDID("$FreeBSD$");
#include <sys/ata.h>
#include <sys/endian.h>
+#include <machine/vmm_snapshot.h>
+
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
@@ -131,6 +133,7 @@ struct ahci_ioreq {
uint32_t done;
int slot;
int more;
+ int readop;
};
struct ahci_port {
@@ -724,6 +727,7 @@ ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
aior->slot = slot;
aior->len = len;
aior->done = done;
+ aior->readop = readop;
breq = &aior->io_req;
breq->br_offset = lba + done;
ahci_build_iov(p, aior, prdt, hdr->prdtl);
@@ -1420,6 +1424,7 @@ atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
aior->slot = slot;
aior->len = len;
aior->done = done;
+ aior->readop = 1;
breq = &aior->io_req;
breq->br_offset = lba + done;
ahci_build_iov(p, aior, prdt, hdr->prdtl);
@@ -2446,6 +2451,282 @@ pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
return (pci_ahci_init(ctx, pi, opts, 1));
}
+#ifdef BHYVE_SNAPSHOT
+static int
+pci_ahci_snapshot_save_queues(struct ahci_port *port,
+ struct vm_snapshot_meta *meta)
+{
+ int ret;
+ int idx;
+ struct ahci_ioreq *ioreq;
+
+ STAILQ_FOREACH(ioreq, &port->iofhd, io_flist) {
+ idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq);
+ SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+ }
+
+ idx = -1;
+ SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+
+ TAILQ_FOREACH(ioreq, &port->iobhd, io_blist) {
+ idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq);
+ SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+
+ /*
+ * Snapshot only the busy requests; other requests are
+ * not valid.
+ */
+ ret = blockif_snapshot_req(&ioreq->io_req, meta);
+ if (ret != 0) {
+ fprintf(stderr, "%s: failed to snapshot req\r\n",
+ __func__);
+ goto done;
+ }
+ }
+
+ idx = -1;
+ SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+
+done:
+ return (ret);
+}
+
+static int
+pci_ahci_snapshot_restore_queues(struct ahci_port *port,
+ struct vm_snapshot_meta *meta)
+{
+ int ret;
+ int idx;
+ struct ahci_ioreq *ioreq;
+
+ /* Empty the free queue before restoring. */
+ while (!STAILQ_EMPTY(&port->iofhd))
+ STAILQ_REMOVE_HEAD(&port->iofhd, io_flist);
+
+ /* Restore the free queue. */
+ while (1) {
+ SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+ if (idx == -1)
+ break;
+
+ STAILQ_INSERT_TAIL(&port->iofhd, &port->ioreq[idx], io_flist);
+ }
+
+ /* Restore the busy queue. */
+ while (1) {
+ SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+ if (idx == -1)
+ break;
+
+ ioreq = &port->ioreq[idx];
+ TAILQ_INSERT_TAIL(&port->iobhd, ioreq, io_blist);
+
+ /*
+ * Restore only the busy requests; other requests are
+ * not valid.
+ */
+ ret = blockif_snapshot_req(&ioreq->io_req, meta);
+ if (ret != 0) {
+ fprintf(stderr, "%s: failed to restore request\r\n",
+ __func__);
+ goto done;
+ }
+
+ /* Re-enqueue the requests in the block interface. */
+ if (ioreq->readop)
+ ret = blockif_read(port->bctx, &ioreq->io_req);
+ else
+ ret = blockif_write(port->bctx, &ioreq->io_req);
+
+ if (ret != 0) {
+ fprintf(stderr,
+ "%s: failed to re-enqueue request\r\n",
+ __func__);
+ goto done;
+ }
+ }
+
+done:
+ return (ret);
+}
+
+static int
+pci_ahci_snapshot(struct vm_snapshot_meta *meta)
+{
+ int i, j, ret;
+ void *bctx;
+ struct pci_devinst *pi;
+ struct pci_ahci_softc *sc;
+ struct ahci_port *port;
+ struct ahci_cmd_hdr *hdr;
+ struct ahci_ioreq *ioreq;
+
+ pi = meta->dev_data;
+ sc = pi->pi_arg;
+
+ /* TODO: add mtx lock/unlock */
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done);
+
+ for (i = 0; i < MAX_PORTS; i++) {
+ port = &sc->port[i];
+
+ if (meta->op == VM_SNAPSHOT_SAVE)
+ bctx = port->bctx;
+
+ SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done);
+
+ /* Mostly for restore; save is ensured by the lines above. */
+ if (((bctx == NULL) && (port->bctx != NULL)) ||
+ ((bctx != NULL) && (port->bctx == NULL))) {
+ fprintf(stderr, "%s: ports not matching\r\n", __func__);
+ ret = EINVAL;
+ goto done;
+ }
+
+ if (port->bctx == NULL)
+ continue;
+
+ if (port->port != i) {
+ fprintf(stderr, "%s: ports not matching: "
+ "actual: %d expected: %d\r\n",
+ __func__, port->port, i);
+ ret = EINVAL;
+ goto done;
+ }
+
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->cmd_lst,
+ AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done);
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->rfis, 256, false, meta,
+ ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(port->ident, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done);
+
+ for (j = 0; j < port->ioqsz; j++) {
+ ioreq = &port->ioreq[j];
+
+ /* blockif_req snapshot done only for busy requests. */
+ hdr = (struct ahci_cmd_hdr *)(port->cmd_lst +
+ ioreq->slot * AHCI_CL_SIZE);
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ioreq->cfis,
+ 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry),
+ false, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(ioreq->len, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(ioreq->done, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(ioreq->slot, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(ioreq->more, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(ioreq->readop, meta, ret, done);
+ }
+
+ /* Perform save / restore specific operations. */
+ if (meta->op == VM_SNAPSHOT_SAVE) {
+ ret = pci_ahci_snapshot_save_queues(port, meta);
+ if (ret != 0)
+ goto done;
+ } else if (meta->op == VM_SNAPSHOT_RESTORE) {
+ ret = pci_ahci_snapshot_restore_queues(port, meta);
+ if (ret != 0)
+ goto done;
+ } else {
+ ret = EINVAL;
+ goto done;
+ }
+
+ ret = blockif_snapshot(port->bctx, meta);
+ if (ret != 0) {
+ fprintf(stderr, "%s: failed to restore blockif\r\n",
+ __func__);
+ goto done;
+ }
+ }
+
+done:
+ return (ret);
+}
+
+static int
+pci_ahci_pause(struct vmctx *ctx, struct pci_devinst *pi)
+{
+ struct pci_ahci_softc *sc;
+ struct blockif_ctxt *bctxt;
+ int i;
+
+ sc = pi->pi_arg;
+
+ for (i = 0; i < MAX_PORTS; i++) {
+ bctxt = sc->port[i].bctx;
+ if (bctxt == NULL)
+ continue;
+
+ blockif_pause(bctxt);
+ }
+
+ return (0);
+}
+
+static int
+pci_ahci_resume(struct vmctx *ctx, struct pci_devinst *pi)
+{
+ struct pci_ahci_softc *sc;
+ struct blockif_ctxt *bctxt;
+ int i;
+
+ sc = pi->pi_arg;
+
+ for (i = 0; i < MAX_PORTS; i++) {
+ bctxt = sc->port[i].bctx;
+ if (bctxt == NULL)
+ continue;
+
+ blockif_resume(bctxt);
+ }
+
+ return (0);
+}
+#endif
+
/*
* Use separate emulation names to distinguish drive and atapi devices
*/
@@ -2453,7 +2734,12 @@ struct pci_devemu pci_de_ahci = {
.pe_emu = "ahci",
.pe_init = pci_ahci_hd_init,
.pe_barwrite = pci_ahci_write,
- .pe_barread = pci_ahci_read
+ .pe_barread = pci_ahci_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_ahci_snapshot,
+ .pe_pause = pci_ahci_pause,
+ .pe_resume = pci_ahci_resume,
+#endif
};
PCI_EMUL_SET(pci_de_ahci);
@@ -2461,7 +2747,12 @@ struct pci_devemu pci_de_ahci_hd = {
.pe_emu = "ahci-hd",
.pe_init = pci_ahci_hd_init,
.pe_barwrite = pci_ahci_write,
- .pe_barread = pci_ahci_read
+ .pe_barread = pci_ahci_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_ahci_snapshot,
+ .pe_pause = pci_ahci_pause,
+ .pe_resume = pci_ahci_resume,
+#endif
};
PCI_EMUL_SET(pci_de_ahci_hd);
@@ -2469,6 +2760,11 @@ struct pci_devemu pci_de_ahci_cd = {
.pe_emu = "ahci-cd",
.pe_init = pci_ahci_atapi_init,
.pe_barwrite = pci_ahci_write,
- .pe_barread = pci_ahci_read
+ .pe_barread = pci_ahci_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_ahci_snapshot,
+ .pe_pause = pci_ahci_pause,
+ .pe_resume = pci_ahci_resume,
+#endif
};
PCI_EMUL_SET(pci_de_ahci_cd);
diff --git a/usr.sbin/bhyve/pci_e82545.c b/usr.sbin/bhyve/pci_e82545.c
index dca981be85fa..c1443b6aa613 100644
--- a/usr.sbin/bhyve/pci_e82545.c
+++ b/usr.sbin/bhyve/pci_e82545.c
@@ -46,6 +46,8 @@ __FBSDID("$FreeBSD$");
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
+#include <machine/vmm_snapshot.h>
+
#include <err.h>
#include <errno.h>
#include <fcntl.h>
@@ -2378,11 +2380,168 @@ e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
return (0);
}
+#ifdef BHYVE_SNAPSHOT
+static int
+e82545_snapshot(struct vm_snapshot_meta *meta)
+{
+ int i;
+ int ret;
+ struct e82545_softc *sc;
+ struct pci_devinst *pi;
+ uint64_t bitmap_value;
+
+ pi = meta->dev_data;
+ sc = pi->pi_arg;
+
+ /* esc_mevp and esc_mevpitr should be reinitiated at init. */
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_mac, meta, ret, done);
+
+ /* General */
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_CTRL, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAL, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAH, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCT, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_VET, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCTTV, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_LEDCTL, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_PBA, meta, ret, done);
+
+ /* Interrupt control */
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_irq_asserted, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICR, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_ITR, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICS, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMS, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMC, meta, ret, done);
+
+ /*
+ * Transmit
+ *
+ * The fields in the unions are in superposition to access certain
+ * bytes in the larger uint variables.
+ * e.g., ip_config = [ipcss|ipcso|ipcse0|ipcse1]
+ */
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.lower_setup.ip_config, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.upper_setup.tcp_config, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.cmd_and_length, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.tcp_seg_setup.data, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_enabled, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_active, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXCW, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_TCTL, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIPG, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_AIT, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_tdba, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAL, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAH, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDLEN, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDH, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDHr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDT, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIDV, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXDCTL, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_TADV, meta, ret, done);
+
+ /* Has dependency on esc_TDLEN; reoreder of fields from struct. */
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_txdesc, sc->esc_TDLEN,
+ true, meta, ret, done);
+
+ /* L2 frame acceptance */
+ for (i = 0; i < nitems(sc->esc_uni); i++) {
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_valid, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_addrsel, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_eth, meta, ret, done);
+ }
+
+ SNAPSHOT_BUF_OR_LEAVE(sc->esc_fmcast, sizeof(sc->esc_fmcast),
+ meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(sc->esc_fvlan, sizeof(sc->esc_fvlan),
+ meta, ret, done);
+
+ /* Receive */
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_enabled, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_active, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_loopback, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_RCTL, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTL, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTH, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_rdba, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAL, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAH, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDLEN, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDH, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDT, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDTR, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXDCTL, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_RADV, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_RSRPD, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXCSUM, meta, ret, done);
+
+ /* Has dependency on esc_RDLEN; reoreder of fields from struct. */
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_rxdesc, sc->esc_TDLEN,
+ true, meta, ret, done);
+
+ /* IO Port register access */
+ SNAPSHOT_VAR_OR_LEAVE(sc->io_addr, meta, ret, done);
+
+ /* Shadow copy of MDIC */
+ SNAPSHOT_VAR_OR_LEAVE(sc->mdi_control, meta, ret, done);
+
+ /* Shadow copy of EECD */
+ SNAPSHOT_VAR_OR_LEAVE(sc->eeprom_control, meta, ret, done);
+
+ /* Latest NVM in/out */
+ SNAPSHOT_VAR_OR_LEAVE(sc->nvm_data, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->nvm_opaddr, meta, ret, done);
+
+ /* Stats */
+ SNAPSHOT_VAR_OR_LEAVE(sc->missed_pkt_count, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(sc->pkt_rx_by_size, sizeof(sc->pkt_rx_by_size),
+ meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(sc->pkt_tx_by_size, sizeof(sc->pkt_tx_by_size),
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_rx_count, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_rx_count, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_rx_count, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_tx_count, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_tx_count, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_tx_count, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->oversize_rx_count, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->tso_tx_count, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_rx, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_tx, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->missed_octets, meta, ret, done);
+
+ if (meta->op == VM_SNAPSHOT_SAVE)
+ bitmap_value = sc->nvm_bits;
+ SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done);
+ if (meta->op == VM_SNAPSHOT_RESTORE)
+ sc->nvm_bits = bitmap_value;
+
+ if (meta->op == VM_SNAPSHOT_SAVE)
+ bitmap_value = sc->nvm_bits;
+ SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done);
+ if (meta->op == VM_SNAPSHOT_RESTORE)
+ sc->nvm_bits = bitmap_value;
+
+ /* EEPROM data */
+ SNAPSHOT_BUF_OR_LEAVE(sc->eeprom_data, sizeof(sc->eeprom_data),
+ meta, ret, done);
+
+done:
+ return (ret);
+}
+#endif
+
struct pci_devemu pci_de_e82545 = {
.pe_emu = "e1000",
.pe_init = e82545_init,
.pe_barwrite = e82545_write,
- .pe_barread = e82545_read
+ .pe_barread = e82545_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = e82545_snapshot,
+#endif
};
PCI_EMUL_SET(pci_de_e82545);
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index 145b33b5ffd2..e4b83896241e 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <stdbool.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include <vmmapi.h>
#include "acpi.h"
@@ -1962,6 +1963,191 @@ INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Saves/restores PCI device emulated state. Returns 0 on success.
+ */
+static int
+pci_snapshot_pci_dev(struct vm_snapshot_meta *meta)
+{
+ struct pci_devinst *pi;
+ int i;
+ int ret;
+
+ pi = meta->dev_data;
+
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_page_offset, meta, ret, done);
+
+ SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata),
+ meta, ret, done);
+
+ for (i = 0; i < nitems(pi->pi_bar); i++) {
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done);
+ }
+
+ /* Restore MSI-X table. */
+ for (i = 0; i < pi->pi_msix.table_count; i++) {
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr,
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data,
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control,
+ meta, ret, done);
+ }
+
+done:
+ return (ret);
+}
+
+static int
+pci_find_slotted_dev(const char *dev_name, struct pci_devemu **pde,
+ struct pci_devinst **pdi)
+{
+ struct businfo *bi;
+ struct slotinfo *si;
+ struct funcinfo *fi;
+ int bus, slot, func;
+
+ assert(dev_name != NULL);
+ assert(pde != NULL);
+ assert(pdi != NULL);
+
+ for (bus = 0; bus < MAXBUSES; bus++) {
+ if ((bi = pci_businfo[bus]) == NULL)
+ continue;
+
+ for (slot = 0; slot < MAXSLOTS; slot++) {
+ si = &bi->slotinfo[slot];
+ for (func = 0; func < MAXFUNCS; func++) {
+ fi = &si->si_funcs[func];
+ if (fi->fi_name == NULL)
+ continue;
+ if (strcmp(dev_name, fi->fi_name))
+ continue;
+
+ *pde = pci_emul_finddev(fi->fi_name);
+ assert(*pde != NULL);
+
+ *pdi = fi->fi_devi;
+ return (0);
+ }
+ }
+ }
+
+ return (EINVAL);
+}
+
+int
+pci_snapshot(struct vm_snapshot_meta *meta)
+{
+ struct pci_devemu *pde;
+ struct pci_devinst *pdi;
+ int ret;
+
+ assert(meta->dev_name != NULL);
+
+ ret = pci_find_slotted_dev(meta->dev_name, &pde, &pdi);
+ if (ret != 0) {
+ fprintf(stderr, "%s: no such name: %s\r\n",
+ __func__, meta->dev_name);
+ memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+ return (0);
+ }
+
+ meta->dev_data = pdi;
+
+ if (pde->pe_snapshot == NULL) {
+ fprintf(stderr, "%s: not implemented yet for: %s\r\n",
+ __func__, meta->dev_name);
+ return (-1);
+ }
+
+ ret = pci_snapshot_pci_dev(meta);
+ if (ret != 0) {
+ fprintf(stderr, "%s: failed to snapshot pci dev\r\n",
+ __func__);
+ return (-1);
+ }
+
+ ret = (*pde->pe_snapshot)(meta);
+
+ return (ret);
+}
+
+int
+pci_pause(struct vmctx *ctx, const char *dev_name)
+{
+ struct pci_devemu *pde;
+ struct pci_devinst *pdi;
+ int ret;
+
+ assert(dev_name != NULL);
+
+ ret = pci_find_slotted_dev(dev_name, &pde, &pdi);
+ if (ret != 0) {
+ /*
+ * It is possible to call this function without
+ * checking that the device is inserted first.
+ */
+ fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name);
+ return (0);
+ }
+
+ if (pde->pe_pause == NULL) {
+ /* The pause/resume functionality is optional. */
+ fprintf(stderr, "%s: not implemented for: %s\n",
+ __func__, dev_name);
+ return (0);
+ }
+
+ return (*pde->pe_pause)(ctx, pdi);
+}
+
+int
+pci_resume(struct vmctx *ctx, const char *dev_name)
+{
+ struct pci_devemu *pde;
+ struct pci_devinst *pdi;
+ int ret;
+
+ assert(dev_name != NULL);
+
+ ret = pci_find_slotted_dev(dev_name, &pde, &pdi);
+ if (ret != 0) {
+ /*
+ * It is possible to call this function without
+ * checking that the device is inserted first.
+ */
+ fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name);
+ return (0);
+ }
+
+ if (pde->pe_resume == NULL) {
+ /* The pause/resume functionality is optional. */
+ fprintf(stderr, "%s: not implemented for: %s\n",
+ __func__, dev_name);
+ return (0);
+ }
+
+ return (*pde->pe_resume)(ctx, pdi);
+}
+#endif
+
#define PCI_EMUL_TEST
#ifdef PCI_EMUL_TEST
/*
@@ -1970,7 +2156,7 @@ INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
#define DIOSZ 8
#define DMEMSZ 4096
struct pci_emul_dsoftc {
- uint8_t ioregs[DIOSZ];
+ uint8_t ioregs[DIOSZ];
uint8_t memregs[2][DMEMSZ];
};
@@ -2062,7 +2248,7 @@ pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
} else {
printf("diow: memw unknown size %d\n", size);
}
-
+
/*
* magic interrupt ??
*/
@@ -2087,7 +2273,7 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
offset, size);
return (0);
}
-
+
value = 0;
if (size == 1) {
value = sc->ioregs[offset];
@@ -2106,7 +2292,7 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
offset, size);
return (0);
}
-
+
i = baridx - 1; /* 'memregs' index */
if (size == 1) {
@@ -2131,11 +2317,23 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
return (value);
}
+#ifdef BHYVE_SNAPSHOT
+int
+pci_emul_snapshot(struct vm_snapshot_meta *meta)
+{
+
+ return (0);
+}
+#endif
+
struct pci_devemu pci_dummy = {
.pe_emu = "dummy",
.pe_init = pci_emul_dinit,
.pe_barwrite = pci_emul_diow,
- .pe_barread = pci_emul_dior
+ .pe_barread = pci_emul_dior,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_emul_snapshot,
+#endif
};
PCI_EMUL_SET(pci_dummy);
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
index fba2e8845af8..1cefa5ed042d 100644
--- a/usr.sbin/bhyve/pci_emul.h
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -45,6 +45,7 @@
struct vmctx;
struct pci_devinst;
struct memory_region;
+struct vm_snapshot_meta;
struct pci_devemu {
char *pe_emu; /* Name of device emulation */
@@ -71,6 +72,11 @@ struct pci_devemu {
uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int baridx,
uint64_t offset, int size);
+
+ /* Save/restore device state */
+ int (*pe_snapshot)(struct vm_snapshot_meta *meta);
+ int (*pe_pause)(struct vmctx *ctx, struct pci_devinst *pi);
+ int (*pe_resume)(struct vmctx *ctx, struct pci_devinst *pi);
};
#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x);
@@ -246,6 +252,11 @@ void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);
void pci_write_dsdt(void);
uint64_t pci_ecfg_base(void);
int pci_bus_configured(int bus);
+#ifdef BHYVE_SNAPSHOT
+int pci_snapshot(struct vm_snapshot_meta *meta);
+int pci_pause(struct vmctx *ctx, const char *dev_name);
+int pci_resume(struct vmctx *ctx, const char *dev_name);
+#endif
static __inline void
pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
diff --git a/usr.sbin/bhyve/pci_fbuf.c b/usr.sbin/bhyve/pci_fbuf.c
index 8961875356da..0bd740a0908c 100644
--- a/usr.sbin/bhyve/pci_fbuf.c
+++ b/usr.sbin/bhyve/pci_fbuf.c
@@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mman.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include <vmmapi.h>
#include <stdio.h>
@@ -440,10 +441,26 @@ done:
return (error);
}
+#ifdef BHYVE_SNAPSHOT
+static int
+pci_fbuf_snapshot(struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ SNAPSHOT_BUF_OR_LEAVE(fbuf_sc->fb_base, FB_SIZE, meta, ret, err);
+
+err:
+ return (ret);
+}
+#endif
+
struct pci_devemu pci_fbuf = {
.pe_emu = "fbuf",
.pe_init = pci_fbuf_init,
.pe_barwrite = pci_fbuf_write,
- .pe_barread = pci_fbuf_read
+ .pe_barread = pci_fbuf_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_fbuf_snapshot,
+#endif
};
PCI_EMUL_SET(pci_fbuf);
diff --git a/usr.sbin/bhyve/pci_lpc.c b/usr.sbin/bhyve/pci_lpc.c
index 1e4b513ec494..4ebdd7039cbc 100644
--- a/usr.sbin/bhyve/pci_lpc.c
+++ b/usr.sbin/bhyve/pci_lpc.c
@@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include <stdio.h>
#include <stdlib.h>
@@ -452,12 +453,35 @@ lpc_pirq_routed(void)
pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5));
}
+#ifdef BHYVE_SNAPSHOT
+static int
+pci_lpc_snapshot(struct vm_snapshot_meta *meta)
+{
+ int unit, ret;
+ struct uart_softc *sc;
+
+ for (unit = 0; unit < LPC_UART_NUM; unit++) {
+ sc = lpc_uart_softc[unit].uart_softc;
+
+ ret = uart_snapshot(sc, meta);
+ if (ret != 0)
+ goto done;
+ }
+
+done:
+ return (ret);
+}
+#endif
+
struct pci_devemu pci_de_lpc = {
.pe_emu = "lpc",
.pe_init = pci_lpc_init,
.pe_write_dsdt = pci_lpc_write_dsdt,
.pe_cfgwrite = pci_lpc_cfgwrite,
.pe_barwrite = pci_lpc_write,
- .pe_barread = pci_lpc_read
+ .pe_barread = pci_lpc_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_lpc_snapshot,
+#endif
};
PCI_EMUL_SET(pci_de_lpc);
diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c
index 04ff7718c333..4fd8943efffa 100644
--- a/usr.sbin/bhyve/pci_virtio_block.c
+++ b/usr.sbin/bhyve/pci_virtio_block.c
@@ -39,6 +39,8 @@ __FBSDID("$FreeBSD$");
#include <sys/ioctl.h>
#include <sys/disk.h>
+#include <machine/vmm_snapshot.h>
+
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
@@ -198,6 +200,11 @@ static void pci_vtblk_reset(void *);
static void pci_vtblk_notify(void *, struct vqueue_info *);
static int pci_vtblk_cfgread(void *, int, int, uint32_t *);
static int pci_vtblk_cfgwrite(void *, int, int, uint32_t);
+#ifdef BHYVE_SNAPSHOT
+static void pci_vtblk_pause(void *);
+static void pci_vtblk_resume(void *);
+static int pci_vtblk_snapshot(void *, struct vm_snapshot_meta *);
+#endif
static struct virtio_consts vtblk_vi_consts = {
"vtblk", /* our name */
@@ -209,6 +216,11 @@ static struct virtio_consts vtblk_vi_consts = {
pci_vtblk_cfgwrite, /* write PCI config */
NULL, /* apply negotiated features */
VTBLK_S_HOSTCAPS, /* our capabilities */
+#ifdef BHYVE_SNAPSHOT
+ pci_vtblk_pause, /* pause blockif threads */
+ pci_vtblk_resume, /* resume blockif threads */
+ pci_vtblk_snapshot, /* save / restore device state */
+#endif
};
static void
@@ -241,6 +253,40 @@ pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err)
vq_endchains(&sc->vbsc_vq, 0);
}
+#ifdef BHYVE_SNAPSHOT
+static void
+pci_vtblk_pause(void *vsc)
+{
+ struct pci_vtblk_softc *sc = vsc;
+
+ DPRINTF(("vtblk: device pause requested !\n"));
+ blockif_pause(sc->bc);
+}
+
+static void
+pci_vtblk_resume(void *vsc)
+{
+ struct pci_vtblk_softc *sc = vsc;
+
+ DPRINTF(("vtblk: device resume requested !\n"));
+ blockif_resume(sc->bc);
+}
+
+static int
+pci_vtblk_snapshot(void *vsc, struct vm_snapshot_meta *meta)
+{
+ int ret;
+ struct pci_vtblk_softc *sc = vsc;
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->vbsc_cfg, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(sc->vbsc_ident, sizeof(sc->vbsc_ident),
+ meta, ret, done);
+
+done:
+ return (ret);
+}
+#endif
+
static void
pci_vtblk_done(struct blockif_req *br, int err)
{
@@ -523,6 +569,9 @@ struct pci_devemu pci_de_vblk = {
.pe_emu = "virtio-blk",
.pe_init = pci_vtblk_init,
.pe_barwrite = vi_pci_write,
- .pe_barread = vi_pci_read
+ .pe_barread = vi_pci_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = vi_pci_snapshot,
+#endif
};
PCI_EMUL_SET(pci_de_vblk);
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
index adc273128585..a0fcd9055e65 100644
--- a/usr.sbin/bhyve/pci_virtio_net.c
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
+#include <machine/vmm_snapshot.h>
#include <net/ethernet.h>
#include <net/if.h> /* IFNAMSIZ */
@@ -134,6 +135,11 @@ static void pci_vtnet_reset(void *);
static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
static void pci_vtnet_neg_features(void *, uint64_t);
+#ifdef BHYVE_SNAPSHOT
+static void pci_vtnet_pause(void *);
+static void pci_vtnet_resume(void *);
+static int pci_vtnet_snapshot(void *, struct vm_snapshot_meta *);
+#endif
static struct virtio_consts vtnet_vi_consts = {
"vtnet", /* our name */
@@ -145,6 +151,11 @@ static struct virtio_consts vtnet_vi_consts = {
pci_vtnet_cfgwrite, /* write PCI config */
pci_vtnet_neg_features, /* apply negotiated features */
VTNET_S_HOSTCAPS, /* our capabilities */
+#ifdef BHYVE_SNAPSHOT
+ pci_vtnet_pause, /* pause rx/tx threads */
+ pci_vtnet_resume, /* resume rx/tx threads */
+ pci_vtnet_snapshot, /* save / restore device state */
+#endif
};
static void
@@ -740,10 +751,80 @@ pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
assert(sc->be_vhdrlen == 0 || sc->be_vhdrlen == sc->vhdrlen);
}
+#ifdef BHYVE_SNAPSHOT
+static void
+pci_vtnet_pause(void *vsc)
+{
+ struct pci_vtnet_softc *sc = vsc;
+
+ DPRINTF(("vtnet: device pause requested !\n"));
+
+ /* Acquire the RX lock to block RX processing. */
+ pthread_mutex_lock(&sc->rx_mtx);
+
+ /* Wait for the transmit thread to finish its processing. */
+ pthread_mutex_lock(&sc->tx_mtx);
+ while (sc->tx_in_progress) {
+ pthread_mutex_unlock(&sc->tx_mtx);
+ usleep(10000);
+ pthread_mutex_lock(&sc->tx_mtx);
+ }
+}
+
+static void
+pci_vtnet_resume(void *vsc)
+{
+ struct pci_vtnet_softc *sc = vsc;
+
+ DPRINTF(("vtnet: device resume requested !\n"));
+
+ pthread_mutex_unlock(&sc->tx_mtx);
+ /* The RX lock should have been acquired in vtnet_pause. */
+ pthread_mutex_unlock(&sc->rx_mtx);
+}
+
+static int
+pci_vtnet_snapshot(void *vsc, struct vm_snapshot_meta *meta)
+{
+ int ret;
+ struct pci_vtnet_softc *sc = vsc;
+
+ DPRINTF(("vtnet: device snapshot requested !\n"));
+
+ /*
+ * Queues and consts should have been saved by the more generic
+ * vi_pci_snapshot function. We need to save only our features and
+ * config.
+ */
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->vsc_features, meta, ret, done);
+
+ /* Force reapply negociated features at restore time */
+ if (meta->op == VM_SNAPSHOT_RESTORE) {
+ pci_vtnet_neg_features(sc, sc->vsc_features);
+ netbe_rx_enable(sc->vsc_be);
+ }
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->vsc_config, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rx_merge, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->vhdrlen, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->be_vhdrlen, meta, ret, done);
+
+done:
+ return (ret);
+}
+#endif
+
static struct pci_devemu pci_de_vnet = {
.pe_emu = "virtio-net",
.pe_init = pci_vtnet_init,
.pe_barwrite = vi_pci_write,
- .pe_barread = vi_pci_read
+ .pe_barread = vi_pci_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = vi_pci_snapshot,
+ .pe_pause = vi_pci_pause,
+ .pe_resume = vi_pci_resume,
+#endif
};
PCI_EMUL_SET(pci_de_vnet);
diff --git a/usr.sbin/bhyve/pci_xhci.c b/usr.sbin/bhyve/pci_xhci.c
index 672f35c91ef8..0847d5bb38b5 100644
--- a/usr.sbin/bhyve/pci_xhci.c
+++ b/usr.sbin/bhyve/pci_xhci.c
@@ -48,6 +48,8 @@ __FBSDID("$FreeBSD$");
#include <pthread.h>
#include <unistd.h>
+#include <machine/vmm_snapshot.h>
+
#include <dev/usb/usbdi.h>
#include <dev/usb/usb.h>
#include <dev/usb/usb_freebsd.h>
@@ -151,6 +153,8 @@ static int xhci_debug = 0;
#define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \
(((b) & ((m) << (s)))))
+#define SNAP_DEV_NAME_LEN 128
+
struct pci_xhci_trb_ring {
uint64_t ringaddr; /* current dequeue guest address */
uint32_t ccs; /* consumer cycle state */
@@ -286,9 +290,10 @@ struct pci_xhci_softc {
#define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH)
+#define XHCI_GADDR_SIZE(a) (XHCI_PADDR_SZ - \
+ (((uint64_t) (a)) & (XHCI_PADDR_SZ - 1)))
#define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \
- (a), \
- XHCI_PADDR_SZ - ((a) & (XHCI_PADDR_SZ-1)))
+ (a), XHCI_GADDR_SIZE(a))
static int xhci_in_use;
@@ -2855,12 +2860,265 @@ done:
return (error);
}
+#ifdef BHYVE_SNAPSHOT
+static void
+pci_xhci_map_devs_slots(struct pci_xhci_softc *sc, int maps[])
+{
+ int i, j;
+ struct pci_xhci_dev_emu *dev, *slot;
+
+ memset(maps, 0, sizeof(maps[0]) * XHCI_MAX_SLOTS);
+
+ for (i = 1; i <= XHCI_MAX_SLOTS; i++) {
+ for (j = 1; j <= XHCI_MAX_DEVS; j++) {
+ slot = XHCI_SLOTDEV_PTR(sc, i);
+ dev = XHCI_DEVINST_PTR(sc, j);
+
+ if (slot == dev)
+ maps[i] = j;
+ }
+ }
+}
+static int
+pci_xhci_snapshot_ep(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev,
+ int idx, struct vm_snapshot_meta *meta)
+{
+ int k;
+ int ret;
+ struct usb_data_xfer *xfer;
+ struct usb_data_xfer_block *xfer_block;
+
+ /* some sanity checks */
+ if (meta->op == VM_SNAPSHOT_SAVE)
+ xfer = dev->eps[idx].ep_xfer;
+
+ SNAPSHOT_VAR_OR_LEAVE(xfer, meta, ret, done);
+ if (xfer == NULL) {
+ ret = 0;
+ goto done;
+ }
+
+ if (meta->op == VM_SNAPSHOT_RESTORE) {
+ pci_xhci_init_ep(dev, idx);
+ xfer = dev->eps[idx].ep_xfer;
+ }
+
+ /* save / restore proper */
+ for (k = 0; k < USB_MAX_XFER_BLOCKS; k++) {
+ xfer_block = &xfer->data[k];
+
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(xfer_block->buf,
+ XHCI_GADDR_SIZE(xfer_block->buf), true, meta, ret,
+ done);
+ SNAPSHOT_VAR_OR_LEAVE(xfer_block->blen, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(xfer_block->bdone, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(xfer_block->processed, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(xfer_block->hci_data, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(xfer_block->ccs, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(xfer_block->streamid, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(xfer_block->trbnext, meta, ret, done);
+ }
+
+ SNAPSHOT_VAR_OR_LEAVE(xfer->ureq, meta, ret, done);
+ if (xfer->ureq) {
+ /* xfer->ureq is not allocated at restore time */
+ if (meta->op == VM_SNAPSHOT_RESTORE)
+ xfer->ureq = malloc(sizeof(struct usb_device_request));
+
+ SNAPSHOT_BUF_OR_LEAVE(xfer->ureq,
+ sizeof(struct usb_device_request),
+ meta, ret, done);
+ }
+
+ SNAPSHOT_VAR_OR_LEAVE(xfer->ndata, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(xfer->head, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(xfer->tail, meta, ret, done);
+
+done:
+ return (ret);
+}
+
+static int
+pci_xhci_snapshot(struct vm_snapshot_meta *meta)
+{
+ int i, j;
+ int ret;
+ int restore_idx;
+ struct pci_devinst *pi;
+ struct pci_xhci_softc *sc;
+ struct pci_xhci_portregs *port;
+ struct pci_xhci_dev_emu *dev;
+ char dname[SNAP_DEV_NAME_LEN];
+ int maps[XHCI_MAX_SLOTS + 1];
+
+ pi = meta->dev_data;
+ sc = pi->pi_arg;
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->caplength, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams1, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams2, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams3, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->hccparams1, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->dboff, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsoff, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->hccparams2, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->regsend, meta, ret, done);
+
+ /* opregs */
+ SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbcmd, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbsts, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->opregs.pgsz, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dnctrl, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->opregs.crcr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dcbaap, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->opregs.config, meta, ret, done);
+
+ /* opregs.cr_p */
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.cr_p,
+ XHCI_GADDR_SIZE(sc->opregs.cr_p), false, meta, ret, done);
+
+ /* opregs.dcbaa_p */
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.dcbaa_p,
+ XHCI_GADDR_SIZE(sc->opregs.dcbaa_p), false, meta, ret, done);
+
+ /* rtsregs */
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.mfindex, meta, ret, done);
+
+ /* rtsregs.intrreg */
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.iman, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.imod, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstsz, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.rsvd, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstba, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erdp, meta, ret, done);
+
+ /* rtsregs.erstba_p */
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erstba_p,
+ XHCI_GADDR_SIZE(sc->rtsregs.erstba_p), false, meta, ret, done);
+
+ /* rtsregs.erst_p */
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erst_p,
+ XHCI_GADDR_SIZE(sc->rtsregs.erst_p), false, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_deq_seg, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_idx, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_seg, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_events_cnt, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.event_pcs, meta, ret, done);
+
+ /* sanity checking */
+ for (i = 1; i <= XHCI_MAX_DEVS; i++) {
+ dev = XHCI_DEVINST_PTR(sc, i);
+ if (dev == NULL)
+ continue;
+
+ if (meta->op == VM_SNAPSHOT_SAVE)
+ restore_idx = i;
+ SNAPSHOT_VAR_OR_LEAVE(restore_idx, meta, ret, done);
+
+ /* check if the restored device (when restoring) is sane */
+ if (restore_idx != i) {
+ fprintf(stderr, "%s: idx not matching: actual: %d, "
+ "expected: %d\r\n", __func__, restore_idx, i);
+ ret = EINVAL;
+ goto done;
+ }
+
+ if (meta->op == VM_SNAPSHOT_SAVE) {
+ memset(dname, 0, sizeof(dname));
+ strncpy(dname, dev->dev_ue->ue_emu, sizeof(dname) - 1);
+ }
+
+ SNAPSHOT_BUF_OR_LEAVE(dname, sizeof(dname), meta, ret, done);
+
+ if (meta->op == VM_SNAPSHOT_RESTORE) {
+ dname[sizeof(dname) - 1] = '\0';
+ if (strcmp(dev->dev_ue->ue_emu, dname)) {
+ fprintf(stderr, "%s: device names mismatch: "
+ "actual: %s, expected: %s\r\n",
+ __func__, dname, dev->dev_ue->ue_emu);
+
+ ret = EINVAL;
+ goto done;
+ }
+ }
+ }
+
+ /* portregs */
+ for (i = 1; i <= XHCI_MAX_DEVS; i++) {
+ port = XHCI_PORTREG_PTR(sc, i);
+ dev = XHCI_DEVINST_PTR(sc, i);
+
+ if (dev == NULL)
+ continue;
+
+ SNAPSHOT_VAR_OR_LEAVE(port->portsc, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->portpmsc, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->portli, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->porthlpmc, meta, ret, done);
+ }
+
+ /* slots */
+ if (meta->op == VM_SNAPSHOT_SAVE)
+ pci_xhci_map_devs_slots(sc, maps);
+
+ for (i = 1; i <= XHCI_MAX_SLOTS; i++) {
+ SNAPSHOT_VAR_OR_LEAVE(maps[i], meta, ret, done);
+
+ if (meta->op == VM_SNAPSHOT_SAVE) {
+ dev = XHCI_SLOTDEV_PTR(sc, i);
+ } else if (meta->op == VM_SNAPSHOT_RESTORE) {
+ if (maps[i] != 0)
+ dev = XHCI_DEVINST_PTR(sc, maps[i]);
+ else
+ dev = NULL;
+
+ XHCI_SLOTDEV_PTR(sc, i) = dev;
+ } else {
+ /* error */
+ ret = EINVAL;
+ goto done;
+ }
+
+ if (dev == NULL)
+ continue;
+
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(dev->dev_ctx,
+ XHCI_GADDR_SIZE(dev->dev_ctx), false, meta, ret, done);
+
+ for (j = 1; j < XHCI_MAX_ENDPOINTS; j++) {
+ ret = pci_xhci_snapshot_ep(sc, dev, j, meta);
+ if (ret != 0)
+ goto done;
+ }
+
+ SNAPSHOT_VAR_OR_LEAVE(dev->dev_slotstate, meta, ret, done);
+
+ /* devices[i]->dev_sc */
+ dev->dev_ue->ue_snapshot(dev->dev_sc, meta);
+
+ /* devices[i]->hci */
+ SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_address, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_port, meta, ret, done);
+ }
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->ndevices, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->usb2_port_start, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->usb3_port_start, meta, ret, done);
+
+done:
+ return (ret);
+}
+#endif
struct pci_devemu pci_de_xhci = {
.pe_emu = "xhci",
.pe_init = pci_xhci_init,
.pe_barwrite = pci_xhci_write,
- .pe_barread = pci_xhci_read
+ .pe_barread = pci_xhci_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_xhci_snapshot,
+#endif
};
PCI_EMUL_SET(pci_de_xhci);
diff --git a/usr.sbin/bhyve/ps2kbd.c b/usr.sbin/bhyve/ps2kbd.c
index 3e6a1b67ca38..ef20fa47e0a9 100644
--- a/usr.sbin/bhyve/ps2kbd.c
+++ b/usr.sbin/bhyve/ps2kbd.c
@@ -32,10 +32,13 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
+#include <machine/vmm_snapshot.h>
+
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include <strings.h>
#include <pthread.h>
#include <pthread_np.h>
@@ -382,3 +385,17 @@ ps2kbd_init(struct atkbdc_softc *atkbdc_sc)
return (sc);
}
+#ifdef BHYVE_SNAPSHOT
+int
+ps2kbd_snapshot(struct ps2kbd_softc *sc, struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->enabled, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done);
+
+done:
+ return (ret);
+}
+#endif
+
diff --git a/usr.sbin/bhyve/ps2kbd.h b/usr.sbin/bhyve/ps2kbd.h
index 17be6d046673..3cf87be1b7f3 100644
--- a/usr.sbin/bhyve/ps2kbd.h
+++ b/usr.sbin/bhyve/ps2kbd.h
@@ -32,10 +32,15 @@
#define _PS2KBD_H_
struct atkbdc_softc;
+struct vm_snapshot_meta;
struct ps2kbd_softc *ps2kbd_init(struct atkbdc_softc *sc);
int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val);
void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val);
+#ifdef BHYVE_SNAPSHOT
+int ps2kbd_snapshot(struct ps2kbd_softc *sc, struct vm_snapshot_meta *meta);
+#endif
+
#endif /* _PS2KBD_H_ */
diff --git a/usr.sbin/bhyve/ps2mouse.c b/usr.sbin/bhyve/ps2mouse.c
index f42d2e726023..afe817710f30 100644
--- a/usr.sbin/bhyve/ps2mouse.c
+++ b/usr.sbin/bhyve/ps2mouse.c
@@ -32,10 +32,13 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
+#include <machine/vmm_snapshot.h>
+
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include <strings.h>
#include <pthread.h>
#include <pthread_np.h>
@@ -416,4 +419,23 @@ ps2mouse_init(struct atkbdc_softc *atkbdc_sc)
return (sc);
}
-
+#ifdef BHYVE_SNAPSHOT
+int
+ps2mouse_snapshot(struct ps2mouse_softc *sc, struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->status, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->resolution, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->sampling_rate, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->ctrlenable, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->cur_x, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->cur_y, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->delta_x, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->delta_y, meta, ret, done);
+
+done:
+ return (ret);
+}
+#endif
diff --git a/usr.sbin/bhyve/ps2mouse.h b/usr.sbin/bhyve/ps2mouse.h
index 59430b01e2b1..4ae755ef4411 100644
--- a/usr.sbin/bhyve/ps2mouse.h
+++ b/usr.sbin/bhyve/ps2mouse.h
@@ -32,6 +32,7 @@
#define _PS2MOUSE_H_
struct atkbdc_softc;
+struct vm_snapshot_meta;
struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc);
@@ -40,4 +41,8 @@ void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert);
void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable);
int ps2mouse_fifocnt(struct ps2mouse_softc *sc);
+#ifdef BHYVE_SNAPSHOT
+int ps2mouse_snapshot(struct ps2mouse_softc *sc, struct vm_snapshot_meta *meta);
+#endif
+
#endif /* _PS2MOUSE_H_ */
diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c
new file mode 100644
index 000000000000..22bfd8d28a61
--- /dev/null
+++ b/usr.sbin/bhyve/snapshot.c
@@ -0,0 +1,1742 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Flavius Anton
+ * Copyright (c) 2016 Mihai Tiganus
+ * Copyright (c) 2016-2019 Mihai Carabas
+ * Copyright (c) 2017-2019 Darius Mihai
+ * Copyright (c) 2017-2019 Elena Mihailescu
+ * Copyright (c) 2018-2019 Sergiu Weisz
+ * All rights reserved.
+ * The bhyve-snapshot feature was developed under sponsorships
+ * from Matthew Grooms.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/un.h>
+
+#include <machine/atomic.h>
+#include <machine/segments.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <signal.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <sysexits.h>
+#include <stdbool.h>
+#include <sys/ioctl.h>
+
+#include <machine/vmm.h>
+#ifndef WITHOUT_CAPSICUM
+#include <machine/vmm_dev.h>
+#endif
+#include <machine/vmm_snapshot.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "acpi.h"
+#include "atkbdc.h"
+#include "inout.h"
+#include "dbgport.h"
+#include "fwctl.h"
+#include "ioapic.h"
+#include "mem.h"
+#include "mevent.h"
+#include "mptbl.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+#include "smbiostbl.h"
+#include "snapshot.h"
+#include "xmsr.h"
+#include "spinup_ap.h"
+#include "rtc.h"
+
+#include <libxo/xo.h>
+#include <ucl.h>
+
+struct spinner_info {
+ const size_t *crtval;
+ const size_t maxval;
+ const size_t total;
+};
+
+extern int guest_ncpus;
+
+static struct winsize winsize;
+static sig_t old_winch_handler;
+
+#define KB (1024UL)
+#define MB (1024UL * KB)
+#define GB (1024UL * MB)
+
+#define SNAPSHOT_CHUNK (4 * MB)
+#define PROG_BUF_SZ (8192)
+
+#define BHYVE_RUN_DIR "/var/run/bhyve"
+#define CHECKPOINT_RUN_DIR BHYVE_RUN_DIR "/checkpoint"
+#define MAX_VMNAME 100
+
+#define MAX_MSG_SIZE 1024
+
+#define SNAPSHOT_BUFFER_SIZE (20 * MB)
+
+#define JSON_STRUCT_ARR_KEY "structs"
+#define JSON_DEV_ARR_KEY "devices"
+#define JSON_BASIC_METADATA_KEY "basic metadata"
+#define JSON_SNAPSHOT_REQ_KEY "snapshot_req"
+#define JSON_SIZE_KEY "size"
+#define JSON_FILE_OFFSET_KEY "file_offset"
+
+#define JSON_NCPUS_KEY "ncpus"
+#define JSON_VMNAME_KEY "vmname"
+#define JSON_MEMSIZE_KEY "memsize"
+#define JSON_MEMFLAGS_KEY "memflags"
+
+#define min(a,b) \
+({ \
+ __typeof__ (a) _a = (a); \
+ __typeof__ (b) _b = (b); \
+ _a < _b ? _a : _b; \
+ })
+
+const struct vm_snapshot_dev_info snapshot_devs[] = {
+ { "atkbdc", atkbdc_snapshot, NULL, NULL },
+ { "virtio-net", pci_snapshot, pci_pause, pci_resume },
+ { "virtio-blk", pci_snapshot, pci_pause, pci_resume },
+ { "lpc", pci_snapshot, NULL, NULL },
+ { "fbuf", pci_snapshot, NULL, NULL },
+ { "xhci", pci_snapshot, NULL, NULL },
+ { "e1000", pci_snapshot, NULL, NULL },
+ { "ahci", pci_snapshot, pci_pause, pci_resume },
+ { "ahci-hd", pci_snapshot, pci_pause, pci_resume },
+ { "ahci-cd", pci_snapshot, NULL, NULL },
+};
+
+const struct vm_snapshot_kern_info snapshot_kern_structs[] = {
+ { "vhpet", STRUCT_VHPET },
+ { "vm", STRUCT_VM },
+ { "vmx", STRUCT_VMX },
+ { "vioapic", STRUCT_VIOAPIC },
+ { "vlapic", STRUCT_VLAPIC },
+ { "vmcx", STRUCT_VMCX },
+ { "vatpit", STRUCT_VATPIT },
+ { "vatpic", STRUCT_VATPIC },
+ { "vpmtmr", STRUCT_VPMTMR },
+ { "vrtc", STRUCT_VRTC },
+};
+
+static cpuset_t vcpus_active, vcpus_suspended;
+static pthread_mutex_t vcpu_lock;
+static pthread_cond_t vcpus_idle, vcpus_can_run;
+static bool checkpoint_active;
+
+/*
+ * TODO: Harden this function and all of its callers since 'base_str' is a user
+ * provided string.
+ */
+static char *
+strcat_extension(const char *base_str, const char *ext)
+{
+ char *res;
+ size_t base_len, ext_len;
+
+ base_len = strnlen(base_str, MAX_VMNAME);
+ ext_len = strnlen(ext, MAX_VMNAME);
+
+ if (base_len + ext_len > MAX_VMNAME) {
+ fprintf(stderr, "Filename exceeds maximum length.\n");
+ return (NULL);
+ }
+
+ res = malloc(base_len + ext_len + 1);
+ if (res == NULL) {
+ perror("Failed to allocate memory.");
+ return (NULL);
+ }
+
+ memcpy(res, base_str, base_len);
+ memcpy(res + base_len, ext, ext_len);
+ res[base_len + ext_len] = 0;
+
+ return (res);
+}
+
+void
+destroy_restore_state(struct restore_state *rstate)
+{
+ if (rstate == NULL) {
+ fprintf(stderr, "Attempting to destroy NULL restore struct.\n");
+ return;
+ }
+
+ if (rstate->kdata_map != MAP_FAILED)
+ munmap(rstate->kdata_map, rstate->kdata_len);
+
+ if (rstate->kdata_fd > 0)
+ close(rstate->kdata_fd);
+ if (rstate->vmmem_fd > 0)
+ close(rstate->vmmem_fd);
+
+ if (rstate->meta_root_obj != NULL)
+ ucl_object_unref(rstate->meta_root_obj);
+ if (rstate->meta_parser != NULL)
+ ucl_parser_free(rstate->meta_parser);
+}
+
+static int
+load_vmmem_file(const char *filename, struct restore_state *rstate)
+{
+ struct stat sb;
+ int err;
+
+ rstate->vmmem_fd = open(filename, O_RDONLY);
+ if (rstate->vmmem_fd < 0) {
+ perror("Failed to open restore file");
+ return (-1);
+ }
+
+ err = fstat(rstate->vmmem_fd, &sb);
+ if (err < 0) {
+ perror("Failed to stat restore file");
+ goto err_load_vmmem;
+ }
+
+ if (sb.st_size == 0) {
+ fprintf(stderr, "Restore file is empty.\n");
+ goto err_load_vmmem;
+ }
+
+ rstate->vmmem_len = sb.st_size;
+
+ return (0);
+
+err_load_vmmem:
+ if (rstate->vmmem_fd > 0)
+ close(rstate->vmmem_fd);
+ return (-1);
+}
+
+static int
+load_kdata_file(const char *filename, struct restore_state *rstate)
+{
+ struct stat sb;
+ int err;
+
+ rstate->kdata_fd = open(filename, O_RDONLY);
+ if (rstate->kdata_fd < 0) {
+ perror("Failed to open kernel data file");
+ return (-1);
+ }
+
+ err = fstat(rstate->kdata_fd, &sb);
+ if (err < 0) {
+ perror("Failed to stat kernel data file");
+ goto err_load_kdata;
+ }
+
+ if (sb.st_size == 0) {
+ fprintf(stderr, "Kernel data file is empty.\n");
+ goto err_load_kdata;
+ }
+
+ rstate->kdata_len = sb.st_size;
+ rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ,
+ MAP_SHARED, rstate->kdata_fd, 0);
+ if (rstate->kdata_map == MAP_FAILED) {
+ perror("Failed to map restore file");
+ goto err_load_kdata;
+ }
+
+ return (0);
+
+err_load_kdata:
+ if (rstate->kdata_fd > 0)
+ close(rstate->kdata_fd);
+ return (-1);
+}
+
+static int
+load_metadata_file(const char *filename, struct restore_state *rstate)
+{
+ const ucl_object_t *obj;
+ struct ucl_parser *parser;
+ int err;
+
+ parser = ucl_parser_new(UCL_PARSER_DEFAULT);
+ if (parser == NULL) {
+ fprintf(stderr, "Failed to initialize UCL parser.\n");
+ goto err_load_metadata;
+ }
+
+ err = ucl_parser_add_file(parser, filename);
+ if (err == 0) {
+ fprintf(stderr, "Failed to parse metadata file: '%s'\n",
+ filename);
+ err = -1;
+ goto err_load_metadata;
+ }
+
+ obj = ucl_parser_get_object(parser);
+ if (obj == NULL) {
+ fprintf(stderr, "Failed to parse object.\n");
+ err = -1;
+ goto err_load_metadata;
+ }
+
+ rstate->meta_parser = parser;
+ rstate->meta_root_obj = (ucl_object_t *)obj;
+
+ return (0);
+
+err_load_metadata:
+ if (parser != NULL)
+ ucl_parser_free(parser);
+ return (err);
+}
+
+int
+load_restore_file(const char *filename, struct restore_state *rstate)
+{
+ int err = 0;
+ char *kdata_filename = NULL, *meta_filename = NULL;
+
+ assert(filename != NULL);
+ assert(rstate != NULL);
+
+ memset(rstate, 0, sizeof(*rstate));
+ rstate->kdata_map = MAP_FAILED;
+
+ err = load_vmmem_file(filename, rstate);
+ if (err != 0) {
+ fprintf(stderr, "Failed to load guest RAM file.\n");
+ goto err_restore;
+ }
+
+ kdata_filename = strcat_extension(filename, ".kern");
+ if (kdata_filename == NULL) {
+ fprintf(stderr, "Failed to construct kernel data filename.\n");
+ goto err_restore;
+ }
+
+ err = load_kdata_file(kdata_filename, rstate);
+ if (err != 0) {
+ fprintf(stderr, "Failed to load guest kernel data file.\n");
+ goto err_restore;
+ }
+
+ meta_filename = strcat_extension(filename, ".meta");
+ if (meta_filename == NULL) {
+ fprintf(stderr, "Failed to construct kernel metadata filename.\n");
+ goto err_restore;
+ }
+
+ err = load_metadata_file(meta_filename, rstate);
+ if (err != 0) {
+ fprintf(stderr, "Failed to load guest metadata file.\n");
+ goto err_restore;
+ }
+
+ return (0);
+
+err_restore:
+ destroy_restore_state(rstate);
+ if (kdata_filename != NULL)
+ free(kdata_filename);
+ if (meta_filename != NULL)
+ free(meta_filename);
+ return (-1);
+}
+
+#define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret) \
+do { \
+ const ucl_object_t *obj__; \
+ obj__ = ucl_object_lookup(obj, key); \
+ if (obj__ == NULL) { \
+ fprintf(stderr, "Missing key: '%s'", key); \
+ return (ret); \
+ } \
+ if (!ucl_object_toint_safe(obj__, result_ptr)) { \
+ fprintf(stderr, "Cannot convert '%s' value to int.", key); \
+ return (ret); \
+ } \
+} while(0)
+
+#define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret) \
+do { \
+ const ucl_object_t *obj__; \
+ obj__ = ucl_object_lookup(obj, key); \
+ if (obj__ == NULL) { \
+ fprintf(stderr, "Missing key: '%s'", key); \
+ return (ret); \
+ } \
+ if (!ucl_object_tostring_safe(obj__, result_ptr)) { \
+ fprintf(stderr, "Cannot convert '%s' value to string.", key); \
+ return (ret); \
+ } \
+} while(0)
+
+static void *
+lookup_struct(enum snapshot_req struct_id, struct restore_state *rstate,
+ size_t *struct_size)
+{
+ const ucl_object_t *structs = NULL, *obj = NULL;
+ ucl_object_iter_t it = NULL;
+ int64_t snapshot_req, size, file_offset;
+
+ structs = ucl_object_lookup(rstate->meta_root_obj, JSON_STRUCT_ARR_KEY);
+ if (structs == NULL) {
+ fprintf(stderr, "Failed to find '%s' object.\n",
+ JSON_STRUCT_ARR_KEY);
+ return (NULL);
+ }
+
+ if (ucl_object_type((ucl_object_t *)structs) != UCL_ARRAY) {
+ fprintf(stderr, "Object '%s' is not an array.\n",
+ JSON_STRUCT_ARR_KEY);
+ return (NULL);
+ }
+
+ while ((obj = ucl_object_iterate(structs, &it, true)) != NULL) {
+ snapshot_req = -1;
+ JSON_GET_INT_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj,
+ &snapshot_req, NULL);
+ assert(snapshot_req >= 0);
+ if ((enum snapshot_req) snapshot_req == struct_id) {
+ JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj,
+ &size, NULL);
+ assert(size >= 0);
+
+ JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj,
+ &file_offset, NULL);
+ assert(file_offset >= 0);
+ assert(file_offset + size <= rstate->kdata_len);
+
+ *struct_size = (size_t)size;
+ return (rstate->kdata_map + file_offset);
+ }
+ }
+
+ return (NULL);
+}
+
+static void *
+lookup_check_dev(const char *dev_name, struct restore_state *rstate,
+ const ucl_object_t *obj, size_t *data_size)
+{
+ const char *snapshot_req;
+ int64_t size, file_offset;
+
+ snapshot_req = NULL;
+ JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj,
+ &snapshot_req, NULL);
+ assert(snapshot_req != NULL);
+ if (!strcmp(snapshot_req, dev_name)) {
+ JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj,
+ &size, NULL);
+ assert(size >= 0);
+
+ JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj,
+ &file_offset, NULL);
+ assert(file_offset >= 0);
+ assert(file_offset + size <= rstate->kdata_len);
+
+ *data_size = (size_t)size;
+ return (rstate->kdata_map + file_offset);
+ }
+
+ return (NULL);
+}
+
+static void*
+lookup_dev(const char *dev_name, struct restore_state *rstate,
+ size_t *data_size)
+{
+ const ucl_object_t *devs = NULL, *obj = NULL;
+ ucl_object_iter_t it = NULL;
+ void *ret;
+
+ devs = ucl_object_lookup(rstate->meta_root_obj, JSON_DEV_ARR_KEY);
+ if (devs == NULL) {
+ fprintf(stderr, "Failed to find '%s' object.\n",
+ JSON_DEV_ARR_KEY);
+ return (NULL);
+ }
+
+ if (ucl_object_type((ucl_object_t *)devs) != UCL_ARRAY) {
+ fprintf(stderr, "Object '%s' is not an array.\n",
+ JSON_DEV_ARR_KEY);
+ return (NULL);
+ }
+
+ while ((obj = ucl_object_iterate(devs, &it, true)) != NULL) {
+ ret = lookup_check_dev(dev_name, rstate, obj, data_size);
+ if (ret != NULL)
+ return (ret);
+ }
+
+ return (NULL);
+}
+
+static const ucl_object_t *
+lookup_basic_metadata_object(struct restore_state *rstate)
+{
+ const ucl_object_t *basic_meta_obj = NULL;
+
+ basic_meta_obj = ucl_object_lookup(rstate->meta_root_obj,
+ JSON_BASIC_METADATA_KEY);
+ if (basic_meta_obj == NULL) {
+ fprintf(stderr, "Failed to find '%s' object.\n",
+ JSON_BASIC_METADATA_KEY);
+ return (NULL);
+ }
+
+ if (ucl_object_type((ucl_object_t *)basic_meta_obj) != UCL_OBJECT) {
+ fprintf(stderr, "Object '%s' is not a JSON object.\n",
+ JSON_BASIC_METADATA_KEY);
+ return (NULL);
+ }
+
+ return (basic_meta_obj);
+}
+
+const char *
+lookup_vmname(struct restore_state *rstate)
+{
+ const char *vmname;
+ const ucl_object_t *obj;
+
+ obj = lookup_basic_metadata_object(rstate);
+ if (obj == NULL)
+ return (NULL);
+
+ JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, obj, &vmname, NULL);
+ return (vmname);
+}
+
+int
+lookup_memflags(struct restore_state *rstate)
+{
+ int64_t memflags;
+ const ucl_object_t *obj;
+
+ obj = lookup_basic_metadata_object(rstate);
+ if (obj == NULL)
+ return (0);
+
+ JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, obj, &memflags, 0);
+
+ return ((int)memflags);
+}
+
+size_t
+lookup_memsize(struct restore_state *rstate)
+{
+ int64_t memsize;
+ const ucl_object_t *obj;
+
+ obj = lookup_basic_metadata_object(rstate);
+ if (obj == NULL)
+ return (0);
+
+ JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, obj, &memsize, 0);
+ if (memsize < 0)
+ memsize = 0;
+
+ return ((size_t)memsize);
+}
+
+
+int
+lookup_guest_ncpus(struct restore_state *rstate)
+{
+ int64_t ncpus;
+ const ucl_object_t *obj;
+
+ obj = lookup_basic_metadata_object(rstate);
+ if (obj == NULL)
+ return (0);
+
+ JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, obj, &ncpus, 0);
+ return ((int)ncpus);
+}
+
+static void
+winch_handler(int signal)
+{
+#ifdef TIOCGWINSZ
+ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize);
+#endif /* TIOCGWINSZ */
+}
+
+static int
+print_progress(size_t crtval, const size_t maxval)
+{
+ size_t rc;
+ double crtval_gb, maxval_gb;
+ size_t i, win_width, prog_start, prog_done, prog_end;
+ int mval_len;
+
+ static char prog_buf[PROG_BUF_SZ];
+ static const size_t len = sizeof(prog_buf);
+
+ static size_t div;
+ static char *div_str;
+
+ static char wip_bar[] = { '/', '-', '\\', '|' };
+ static int wip_idx = 0;
+
+ if (maxval == 0) {
+ printf("[0B / 0B]\r\n");
+ return (0);
+ }
+
+ if (crtval > maxval)
+ crtval = maxval;
+
+ if (maxval > 10 * GB) {
+ div = GB;
+ div_str = "GiB";
+ } else if (maxval > 10 * MB) {
+ div = MB;
+ div_str = "MiB";
+ } else {
+ div = KB;
+ div_str = "KiB";
+ }
+
+ crtval_gb = (double) crtval / div;
+ maxval_gb = (double) maxval / div;
+
+ rc = snprintf(prog_buf, len, "%.03lf", maxval_gb);
+ if (rc == len) {
+ fprintf(stderr, "Maxval too big\n");
+ return (-1);
+ }
+ mval_len = rc;
+
+ rc = snprintf(prog_buf, len, "\r[%*.03lf%s / %.03lf%s] |",
+ mval_len, crtval_gb, div_str, maxval_gb, div_str);
+
+ if (rc == len) {
+ fprintf(stderr, "Buffer too small to print progress\n");
+ return (-1);
+ }
+
+ win_width = min(winsize.ws_col, len);
+ prog_start = rc;
+
+ if (prog_start < (win_width - 2)) {
+ prog_end = win_width - prog_start - 2;
+ prog_done = prog_end * (crtval_gb / maxval_gb);
+
+ for (i = prog_start; i < prog_start + prog_done; i++)
+ prog_buf[i] = '#';
+
+ if (crtval != maxval) {
+ prog_buf[i] = wip_bar[wip_idx];
+ wip_idx = (wip_idx + 1) % sizeof(wip_bar);
+ i++;
+ } else {
+ prog_buf[i++] = '#';
+ }
+
+ for (; i < win_width - 2; i++)
+ prog_buf[i] = '_';
+
+ prog_buf[win_width - 2] = '|';
+ }
+
+ prog_buf[win_width - 1] = '\0';
+ write(STDOUT_FILENO, prog_buf, win_width);
+
+ return (0);
+}
+
+static void *
+snapshot_spinner_cb(void *arg)
+{
+ int rc;
+ size_t crtval, maxval, total;
+ struct spinner_info *si;
+ struct timespec ts;
+
+ si = arg;
+ if (si == NULL)
+ pthread_exit(NULL);
+
+ ts.tv_sec = 0;
+ ts.tv_nsec = 50 * 1000 * 1000; /* 50 ms sleep time */
+
+ do {
+ crtval = *si->crtval;
+ maxval = si->maxval;
+ total = si->total;
+
+ rc = print_progress(crtval, total);
+ if (rc < 0) {
+ fprintf(stderr, "Failed to parse progress\n");
+ break;
+ }
+
+ nanosleep(&ts, NULL);
+ } while (crtval < maxval);
+
+ pthread_exit(NULL);
+ return NULL;
+}
+
+static int
+vm_snapshot_mem_part(const int snapfd, const size_t foff, void *src,
+ const size_t len, const size_t totalmem, const bool op_wr)
+{
+ int rc;
+ size_t part_done, todo, rem;
+ ssize_t done;
+ bool show_progress;
+ pthread_t spinner_th;
+ struct spinner_info *si;
+
+ if (lseek(snapfd, foff, SEEK_SET) < 0) {
+ perror("Failed to change file offset");
+ return (-1);
+ }
+
+ show_progress = false;
+ if (isatty(STDIN_FILENO) && (winsize.ws_col != 0))
+ show_progress = true;
+
+ part_done = foff;
+ rem = len;
+
+ if (show_progress) {
+ si = &(struct spinner_info) {
+ .crtval = &part_done,
+ .maxval = foff + len,
+ .total = totalmem
+ };
+
+ rc = pthread_create(&spinner_th, 0, snapshot_spinner_cb, si);
+ if (rc) {
+ perror("Unable to create spinner thread");
+ show_progress = false;
+ }
+ }
+
+ while (rem > 0) {
+ if (show_progress)
+ todo = min(SNAPSHOT_CHUNK, rem);
+ else
+ todo = rem;
+
+ if (op_wr)
+ done = write(snapfd, src, todo);
+ else
+ done = read(snapfd, src, todo);
+ if (done < 0) {
+ perror("Failed to write in file");
+ return (-1);
+ }
+
+ src += done;
+ part_done += done;
+ rem -= done;
+ }
+
+ if (show_progress) {
+ rc = pthread_join(spinner_th, NULL);
+ if (rc)
+ perror("Unable to end spinner thread");
+ }
+
+ return (0);
+}
+
+static size_t
+vm_snapshot_mem(struct vmctx *ctx, int snapfd, size_t memsz, const bool op_wr)
+{
+ int ret;
+ size_t lowmem, highmem, totalmem;
+ char *baseaddr;
+
+ ret = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem);
+ if (ret) {
+ fprintf(stderr, "%s: unable to retrieve guest memory size\r\n",
+ __func__);
+ return (0);
+ }
+ totalmem = lowmem + highmem;
+
+ if ((op_wr == false) && (totalmem != memsz)) {
+ fprintf(stderr, "%s: mem size mismatch: %ld vs %ld\r\n",
+ __func__, totalmem, memsz);
+ return (0);
+ }
+
+ winsize.ws_col = 80;
+#ifdef TIOCGWINSZ
+ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize);
+#endif /* TIOCGWINSZ */
+ old_winch_handler = signal(SIGWINCH, winch_handler);
+
+ ret = vm_snapshot_mem_part(snapfd, 0, baseaddr, lowmem,
+ totalmem, op_wr);
+ if (ret) {
+ fprintf(stderr, "%s: Could not %s lowmem\r\n",
+ __func__, op_wr ? "write" : "read");
+ totalmem = 0;
+ goto done;
+ }
+
+ if (highmem == 0)
+ goto done;
+
+ ret = vm_snapshot_mem_part(snapfd, lowmem, baseaddr + 4*GB,
+ highmem, totalmem, op_wr);
+ if (ret) {
+ fprintf(stderr, "%s: Could not %s highmem\r\n",
+ __func__, op_wr ? "write" : "read");
+ totalmem = 0;
+ goto done;
+ }
+
+done:
+ printf("\r\n");
+ signal(SIGWINCH, old_winch_handler);
+
+ return (totalmem);
+}
+
+int
+restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate)
+{
+ size_t restored;
+
+ restored = vm_snapshot_mem(ctx, rstate->vmmem_fd, rstate->vmmem_len,
+ false);
+
+ if (restored != rstate->vmmem_len)
+ return (-1);
+
+ return (0);
+}
+
+static int
+vm_restore_kern_struct(struct vmctx *ctx, struct restore_state *rstate,
+ const struct vm_snapshot_kern_info *info)
+{
+ void *struct_ptr;
+ size_t struct_size;
+ int ret;
+ struct vm_snapshot_meta *meta;
+
+ struct_ptr = lookup_struct(info->req, rstate, &struct_size);
+ if (struct_ptr == NULL) {
+ fprintf(stderr, "%s: Failed to lookup struct %s\r\n",
+ __func__, info->struct_name);
+ ret = -1;
+ goto done;
+ }
+
+ if (struct_size == 0) {
+ fprintf(stderr, "%s: Kernel struct size was 0 for: %s\r\n",
+ __func__, info->struct_name);
+ ret = -1;
+ goto done;
+ }
+
+ meta = &(struct vm_snapshot_meta) {
+ .ctx = ctx,
+ .dev_name = info->struct_name,
+ .dev_req = info->req,
+
+ .buffer.buf_start = struct_ptr,
+ .buffer.buf_size = struct_size,
+
+ .buffer.buf = struct_ptr,
+ .buffer.buf_rem = struct_size,
+
+ .op = VM_SNAPSHOT_RESTORE,
+ };
+
+ ret = vm_snapshot_req(meta);
+ if (ret != 0) {
+ fprintf(stderr, "%s: Failed to restore struct: %s\r\n",
+ __func__, info->struct_name);
+ goto done;
+ }
+
+done:
+ return (ret);
+}
+
+int
+vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate)
+{
+ int ret;
+ int i;
+
+ for (i = 0; i < nitems(snapshot_kern_structs); i++) {
+ ret = vm_restore_kern_struct(ctx, rstate,
+ &snapshot_kern_structs[i]);
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+int
+vm_restore_user_dev(struct vmctx *ctx, struct restore_state *rstate,
+ const struct vm_snapshot_dev_info *info)
+{
+ void *dev_ptr;
+ size_t dev_size;
+ int ret;
+ struct vm_snapshot_meta *meta;
+
+ dev_ptr = lookup_dev(info->dev_name, rstate, &dev_size);
+ if (dev_ptr == NULL) {
+ fprintf(stderr, "Failed to lookup dev: %s\r\n", info->dev_name);
+ fprintf(stderr, "Continuing the restore/migration process\r\n");
+ return (0);
+ }
+
+ if (dev_size == 0) {
+ fprintf(stderr, "%s: Device size is 0. "
+ "Assuming %s is not used\r\n",
+ __func__, info->dev_name);
+ return (0);
+ }
+
+ meta = &(struct vm_snapshot_meta) {
+ .ctx = ctx,
+ .dev_name = info->dev_name,
+
+ .buffer.buf_start = dev_ptr,
+ .buffer.buf_size = dev_size,
+
+ .buffer.buf = dev_ptr,
+ .buffer.buf_rem = dev_size,
+
+ .op = VM_SNAPSHOT_RESTORE,
+ };
+
+ ret = (*info->snapshot_cb)(meta);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to restore dev: %s\r\n",
+ info->dev_name);
+ return (-1);
+ }
+
+ return (0);
+}
+
+
+int
+vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate)
+{
+ int ret;
+ int i;
+
+ for (i = 0; i < nitems(snapshot_devs); i++) {
+ ret = vm_restore_user_dev(ctx, rstate, &snapshot_devs[i]);
+ if (ret != 0)
+ return (ret);
+ }
+
+ return 0;
+}
+
+int
+vm_pause_user_devs(struct vmctx *ctx)
+{
+ const struct vm_snapshot_dev_info *info;
+ int ret;
+ int i;
+
+ for (i = 0; i < nitems(snapshot_devs); i++) {
+ info = &snapshot_devs[i];
+ if (info->pause_cb == NULL)
+ continue;
+
+ ret = info->pause_cb(ctx, info->dev_name);
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+int
+vm_resume_user_devs(struct vmctx *ctx)
+{
+ const struct vm_snapshot_dev_info *info;
+ int ret;
+ int i;
+
+ for (i = 0; i < nitems(snapshot_devs); i++) {
+ info = &snapshot_devs[i];
+ if (info->resume_cb == NULL)
+ continue;
+
+ ret = info->resume_cb(ctx, info->dev_name);
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+static int
+vm_snapshot_kern_struct(int data_fd, xo_handle_t *xop, const char *array_key,
+ struct vm_snapshot_meta *meta, off_t *offset)
+{
+ int ret;
+ size_t data_size;
+ ssize_t write_cnt;
+
+ ret = vm_snapshot_req(meta);
+ if (ret != 0) {
+ fprintf(stderr, "%s: Failed to snapshot struct %s\r\n",
+ __func__, meta->dev_name);
+ ret = -1;
+ goto done;
+ }
+
+ data_size = vm_get_snapshot_size(meta);
+
+ write_cnt = write(data_fd, meta->buffer.buf_start, data_size);
+ if (write_cnt != data_size) {
+ perror("Failed to write all snapshotted data.");
+ ret = -1;
+ goto done;
+ }
+
+ /* Write metadata. */
+ xo_open_instance_h(xop, array_key);
+ xo_emit_h(xop, "{:debug_name/%s}\n", meta->dev_name);
+ xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%d}\n",
+ meta->dev_req);
+ xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size);
+ xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset);
+ xo_close_instance_h(xop, JSON_STRUCT_ARR_KEY);
+
+ *offset += data_size;
+
+done:
+ return (ret);
+}
+
+static int
+vm_snapshot_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop)
+{
+ int ret, i, error;
+ size_t offset, buf_size;
+ char *buffer;
+ struct vm_snapshot_meta *meta;
+
+ error = 0;
+ offset = 0;
+ buf_size = SNAPSHOT_BUFFER_SIZE;
+
+ buffer = malloc(SNAPSHOT_BUFFER_SIZE * sizeof(char));
+ if (buffer == NULL) {
+ error = ENOMEM;
+ perror("Failed to allocate memory for snapshot buffer");
+ goto err_vm_snapshot_kern_data;
+ }
+
+ meta = &(struct vm_snapshot_meta) {
+ .ctx = ctx,
+
+ .buffer.buf_start = buffer,
+ .buffer.buf_size = buf_size,
+
+ .op = VM_SNAPSHOT_SAVE,
+ };
+
+ xo_open_list_h(xop, JSON_STRUCT_ARR_KEY);
+ for (i = 0; i < nitems(snapshot_kern_structs); i++) {
+ meta->dev_name = snapshot_kern_structs[i].struct_name;
+ meta->dev_req = snapshot_kern_structs[i].req;
+
+ memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+ meta->buffer.buf = meta->buffer.buf_start;
+ meta->buffer.buf_rem = meta->buffer.buf_size;
+
+ ret = vm_snapshot_kern_struct(data_fd, xop, JSON_DEV_ARR_KEY,
+ meta, &offset);
+ if (ret != 0) {
+ error = -1;
+ goto err_vm_snapshot_kern_data;
+ }
+ }
+ xo_close_list_h(xop, JSON_STRUCT_ARR_KEY);
+
+err_vm_snapshot_kern_data:
+ if (buffer != NULL)
+ free(buffer);
+ return (error);
+}
+
+static int
+vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop, size_t memsz)
+{
+ int error;
+ int memflags;
+ char vmname_buf[MAX_VMNAME];
+
+ memset(vmname_buf, 0, MAX_VMNAME);
+ error = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1);
+ if (error != 0) {
+ perror("Failed to get VM name");
+ goto err;
+ }
+
+ memflags = vm_get_memflags(ctx);
+
+ xo_open_container_h(xop, JSON_BASIC_METADATA_KEY);
+ xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%ld}\n", guest_ncpus);
+ xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", vmname_buf);
+ xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n", memsz);
+ xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", memflags);
+ xo_close_container_h(xop, JSON_BASIC_METADATA_KEY);
+
+err:
+ return (error);
+}
+
+static int
+vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop, const char *array_key,
+ struct vm_snapshot_meta *meta, off_t *offset)
+{
+ int ret;
+ size_t data_size;
+
+ data_size = vm_get_snapshot_size(meta);
+
+ ret = write(data_fd, meta->buffer.buf_start, data_size);
+ if (ret != data_size) {
+ perror("Failed to write all snapshotted data.");
+ return (-1);
+ }
+
+ /* Write metadata. */
+ xo_open_instance_h(xop, array_key);
+ xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name);
+ xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size);
+ xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset);
+ xo_close_instance_h(xop, array_key);
+
+ *offset += data_size;
+
+ return (0);
+}
+
+static int
+vm_snapshot_user_dev(const struct vm_snapshot_dev_info *info,
+ int data_fd, xo_handle_t *xop,
+ struct vm_snapshot_meta *meta, off_t *offset)
+{
+ int ret;
+
+ ret = (*info->snapshot_cb)(meta);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to snapshot %s; ret=%d\r\n",
+ meta->dev_name, ret);
+ return (ret);
+ }
+
+ ret = vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY, meta,
+ offset);
+ if (ret != 0)
+ return (ret);
+
+ return (0);
+}
+
+static int
+vm_snapshot_user_devs(struct vmctx *ctx, int data_fd, xo_handle_t *xop)
+{
+ int ret, i;
+ off_t offset;
+ void *buffer;
+ size_t buf_size;
+ struct vm_snapshot_meta *meta;
+
+ buf_size = SNAPSHOT_BUFFER_SIZE;
+
+ offset = lseek(data_fd, 0, SEEK_CUR);
+ if (offset < 0) {
+ perror("Failed to get data file current offset.");
+ return (-1);
+ }
+
+ buffer = malloc(buf_size);
+ if (buffer == NULL) {
+ perror("Failed to allocate memory for snapshot buffer");
+ ret = ENOSPC;
+ goto snapshot_err;
+ }
+
+ meta = &(struct vm_snapshot_meta) {
+ .ctx = ctx,
+
+ .buffer.buf_start = buffer,
+ .buffer.buf_size = buf_size,
+
+ .op = VM_SNAPSHOT_SAVE,
+ };
+
+ xo_open_list_h(xop, JSON_DEV_ARR_KEY);
+
+ /* Restore other devices that support this feature */
+ for (i = 0; i < nitems(snapshot_devs); i++) {
+ meta->dev_name = snapshot_devs[i].dev_name;
+
+ memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+ meta->buffer.buf = meta->buffer.buf_start;
+ meta->buffer.buf_rem = meta->buffer.buf_size;
+
+ ret = vm_snapshot_user_dev(&snapshot_devs[i], data_fd, xop,
+ meta, &offset);
+ if (ret != 0)
+ goto snapshot_err;
+ }
+
+ xo_close_list_h(xop, JSON_DEV_ARR_KEY);
+
+snapshot_err:
+ if (buffer != NULL)
+ free(buffer);
+ return (ret);
+}
+
+void
+checkpoint_cpu_add(int vcpu)
+{
+
+ pthread_mutex_lock(&vcpu_lock);
+ CPU_SET(vcpu, &vcpus_active);
+
+ if (checkpoint_active) {
+ CPU_SET(vcpu, &vcpus_suspended);
+ while (checkpoint_active)
+ pthread_cond_wait(&vcpus_can_run, &vcpu_lock);
+ CPU_CLR(vcpu, &vcpus_suspended);
+ }
+ pthread_mutex_unlock(&vcpu_lock);
+}
+
+/*
+ * When a vCPU is suspended for any reason, it calls
+ * checkpoint_cpu_suspend(). This records that the vCPU is idle.
+ * Before returning from suspension, checkpoint_cpu_resume() is
+ * called. In suspend we note that the vCPU is idle. In resume we
+ * pause the vCPU thread until the checkpoint is complete. The reason
+ * for the two-step process is that vCPUs might already be stopped in
+ * the debug server when a checkpoint is requested. This approach
+ * allows us to account for and handle those vCPUs.
+ */
+void
+checkpoint_cpu_suspend(int vcpu)
+{
+
+ pthread_mutex_lock(&vcpu_lock);
+ CPU_SET(vcpu, &vcpus_suspended);
+ if (checkpoint_active && CPU_CMP(&vcpus_active, &vcpus_suspended) == 0)
+ pthread_cond_signal(&vcpus_idle);
+ pthread_mutex_unlock(&vcpu_lock);
+}
+
+void
+checkpoint_cpu_resume(int vcpu)
+{
+
+ pthread_mutex_lock(&vcpu_lock);
+ while (checkpoint_active)
+ pthread_cond_wait(&vcpus_can_run, &vcpu_lock);
+ CPU_CLR(vcpu, &vcpus_suspended);
+ pthread_mutex_unlock(&vcpu_lock);
+}
+
+static void
+vm_vcpu_pause(struct vmctx *ctx)
+{
+
+ pthread_mutex_lock(&vcpu_lock);
+ checkpoint_active = true;
+ vm_suspend_cpu(ctx, -1);
+ while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0)
+ pthread_cond_wait(&vcpus_idle, &vcpu_lock);
+ pthread_mutex_unlock(&vcpu_lock);
+}
+
+static void
+vm_vcpu_resume(struct vmctx *ctx)
+{
+
+ pthread_mutex_lock(&vcpu_lock);
+ checkpoint_active = false;
+ pthread_mutex_unlock(&vcpu_lock);
+ vm_resume_cpu(ctx, -1);
+ pthread_cond_broadcast(&vcpus_can_run);
+}
+
+static int
+vm_checkpoint(struct vmctx *ctx, char *checkpoint_file, bool stop_vm)
+{
+ int fd_checkpoint = 0, kdata_fd = 0;
+ int ret = 0;
+ int error = 0;
+ size_t memsz;
+ xo_handle_t *xop = NULL;
+ char *meta_filename = NULL;
+ char *kdata_filename = NULL;
+ FILE *meta_file = NULL;
+
+ kdata_filename = strcat_extension(checkpoint_file, ".kern");
+ if (kdata_filename == NULL) {
+ fprintf(stderr, "Failed to construct kernel data filename.\n");
+ return (-1);
+ }
+
+ kdata_fd = open(kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700);
+ if (kdata_fd < 0) {
+ perror("Failed to open kernel data snapshot file.");
+ error = -1;
+ goto done;
+ }
+
+ fd_checkpoint = open(checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700);
+
+ if (fd_checkpoint < 0) {
+ perror("Failed to create checkpoint file");
+ error = -1;
+ goto done;
+ }
+
+ meta_filename = strcat_extension(checkpoint_file, ".meta");
+ if (meta_filename == NULL) {
+ fprintf(stderr, "Failed to construct vm metadata filename.\n");
+ goto done;
+ }
+
+ meta_file = fopen(meta_filename, "w");
+ if (meta_file == NULL) {
+ perror("Failed to open vm metadata snapshot file.");
+ goto done;
+ }
+
+ xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY);
+ if (xop == NULL) {
+ perror("Failed to get libxo handle on metadata file.");
+ goto done;
+ }
+
+ vm_vcpu_pause(ctx);
+
+ ret = vm_pause_user_devs(ctx);
+ if (ret != 0) {
+ fprintf(stderr, "Could not pause devices\r\n");
+ error = ret;
+ goto done;
+ }
+
+ memsz = vm_snapshot_mem(ctx, fd_checkpoint, 0, true);
+ if (memsz == 0) {
+ perror("Could not write guest memory to file");
+ error = -1;
+ goto done;
+ }
+
+ ret = vm_snapshot_basic_metadata(ctx, xop, memsz);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to snapshot vm basic metadata.\n");
+ error = -1;
+ goto done;
+ }
+
+
+ ret = vm_snapshot_kern_structs(ctx, kdata_fd, xop);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to snapshot vm kernel data.\n");
+ error = -1;
+ goto done;
+ }
+
+ ret = vm_snapshot_user_devs(ctx, kdata_fd, xop);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to snapshot device state.\n");
+ error = -1;
+ goto done;
+ }
+
+ xo_finish_h(xop);
+
+ if (stop_vm) {
+ vm_destroy(ctx);
+ exit(0);
+ }
+
+done:
+ ret = vm_resume_user_devs(ctx);
+ if (ret != 0)
+ fprintf(stderr, "Could not resume devices\r\n");
+ vm_vcpu_resume(ctx);
+ if (fd_checkpoint > 0)
+ close(fd_checkpoint);
+ if (meta_filename != NULL)
+ free(meta_filename);
+ if (kdata_filename != NULL)
+ free(kdata_filename);
+ if (xop != NULL)
+ xo_destroy(xop);
+ if (meta_file != NULL)
+ fclose(meta_file);
+ if (kdata_fd > 0)
+ close(kdata_fd);
+ return (error);
+}
+
+int
+get_checkpoint_msg(int conn_fd, struct vmctx *ctx)
+{
+ unsigned char buf[MAX_MSG_SIZE];
+ struct checkpoint_op *checkpoint_op;
+ int len, recv_len, total_recv = 0;
+ int err = 0;
+
+ len = sizeof(struct checkpoint_op); /* expected length */
+ while ((recv_len = recv(conn_fd, buf + total_recv, len - total_recv, 0)) > 0) {
+ total_recv += recv_len;
+ }
+ if (recv_len < 0) {
+ perror("Error while receiving data from bhyvectl");
+ err = -1;
+ goto done;
+ }
+
+ checkpoint_op = (struct checkpoint_op *)buf;
+ switch (checkpoint_op->op) {
+ case START_CHECKPOINT:
+ err = vm_checkpoint(ctx, checkpoint_op->snapshot_filename, false);
+ break;
+ case START_SUSPEND:
+ err = vm_checkpoint(ctx, checkpoint_op->snapshot_filename, true);
+ break;
+ default:
+ fprintf(stderr, "Unrecognized checkpoint operation.\n");
+ err = -1;
+ }
+
+done:
+ close(conn_fd);
+ return (err);
+}
+
+/*
+ * Listen for commands from bhyvectl
+ */
+void *
+checkpoint_thread(void *param)
+{
+ struct checkpoint_thread_info *thread_info;
+ int conn_fd, ret;
+
+ pthread_set_name_np(pthread_self(), "checkpoint thread");
+ thread_info = (struct checkpoint_thread_info *)param;
+
+ while ((conn_fd = accept(thread_info->socket_fd, NULL, NULL)) > -1) {
+ ret = get_checkpoint_msg(conn_fd, thread_info->ctx);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to read message on checkpoint "
+ "socket. Retrying.\n");
+ }
+ }
+ if (conn_fd < -1) {
+ perror("Failed to accept connection");
+ }
+
+ return (NULL);
+}
+
+/*
+ * Create directory tree to store runtime specific information:
+ * i.e. UNIX sockets for IPC with bhyvectl.
+ */
+static int
+make_checkpoint_dir(void)
+{
+ int err;
+
+ err = mkdir(BHYVE_RUN_DIR, 0755);
+ if (err < 0 && errno != EEXIST)
+ return (err);
+
+ err = mkdir(CHECKPOINT_RUN_DIR, 0755);
+ if (err < 0 && errno != EEXIST)
+ return (err);
+
+ return 0;
+}
+
+/*
+ * Create the listening socket for IPC with bhyvectl
+ */
+int
+init_checkpoint_thread(struct vmctx *ctx)
+{
+ struct checkpoint_thread_info *checkpoint_info = NULL;
+ struct sockaddr_un addr;
+ int socket_fd;
+ pthread_t checkpoint_pthread;
+ char vmname_buf[MAX_VMNAME];
+ int ret, err = 0;
+
+ memset(&addr, 0, sizeof(addr));
+
+ err = pthread_mutex_init(&vcpu_lock, NULL);
+ if (err != 0)
+ errc(1, err, "checkpoint mutex init");
+ err = pthread_cond_init(&vcpus_idle, NULL);
+ if (err == 0)
+ err = pthread_cond_init(&vcpus_can_run, NULL);
+ if (err != 0)
+ errc(1, err, "checkpoint cv init");
+
+ socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
+ if (socket_fd < 0) {
+ perror("Socket creation failed (IPC with bhyvectl");
+ err = -1;
+ goto fail;
+ }
+
+ err = make_checkpoint_dir();
+ if (err < 0) {
+ perror("Failed to create checkpoint runtime directory");
+ goto fail;
+ }
+
+ addr.sun_family = AF_UNIX;
+
+ err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1);
+ if (err != 0) {
+ perror("Failed to get VM name");
+ goto fail;
+ }
+
+ snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s",
+ CHECKPOINT_RUN_DIR, vmname_buf);
+ addr.sun_len = SUN_LEN(&addr);
+ unlink(addr.sun_path);
+
+ if (bind(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) {
+ perror("Failed to bind socket (IPC with bhyvectl)");
+ err = -1;
+ goto fail;
+ }
+
+ if (listen(socket_fd, 10) < 0) {
+ perror("Failed to listen on socket (IPC with bhyvectl)");
+ err = -1;
+ goto fail;
+ }
+
+ checkpoint_info = calloc(1, sizeof(*checkpoint_info));
+ checkpoint_info->ctx = ctx;
+ checkpoint_info->socket_fd = socket_fd;
+
+ ret = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread,
+ checkpoint_info);
+ if (ret < 0) {
+ err = ret;
+ goto fail;
+ }
+
+ return (0);
+fail:
+ free(checkpoint_info);
+ if (socket_fd > 0)
+ close(socket_fd);
+ unlink(addr.sun_path);
+
+ return (err);
+}
+
+void
+vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op)
+{
+ const char *__op;
+
+ if (op == VM_SNAPSHOT_SAVE)
+ __op = "save";
+ else if (op == VM_SNAPSHOT_RESTORE)
+ __op = "restore";
+ else
+ __op = "unknown";
+
+ fprintf(stderr, "%s: snapshot-%s failed for %s\r\n",
+ __func__, __op, bufname);
+}
+
+int
+vm_snapshot_buf(volatile void *data, size_t data_size,
+ struct vm_snapshot_meta *meta)
+{
+ struct vm_snapshot_buffer *buffer;
+ int op;
+
+ buffer = &meta->buffer;
+ op = meta->op;
+
+ if (buffer->buf_rem < data_size) {
+ fprintf(stderr, "%s: buffer too small\r\n", __func__);
+ return (E2BIG);
+ }
+
+ if (op == VM_SNAPSHOT_SAVE)
+ memcpy(buffer->buf, (uint8_t *) data, data_size);
+ else if (op == VM_SNAPSHOT_RESTORE)
+ memcpy((uint8_t *) data, buffer->buf, data_size);
+ else
+ return (EINVAL);
+
+ buffer->buf += data_size;
+ buffer->buf_rem -= data_size;
+
+ return (0);
+}
+
+size_t
+vm_get_snapshot_size(struct vm_snapshot_meta *meta)
+{
+ size_t length;
+ struct vm_snapshot_buffer *buffer;
+
+ buffer = &meta->buffer;
+
+ if (buffer->buf_size < buffer->buf_rem) {
+ fprintf(stderr, "%s: Invalid buffer: size = %zu, rem = %zu\r\n",
+ __func__, buffer->buf_size, buffer->buf_rem);
+ length = 0;
+ } else {
+ length = buffer->buf_size - buffer->buf_rem;
+ }
+
+ return (length);
+}
+
+int
+vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null,
+ struct vm_snapshot_meta *meta)
+{
+ int ret;
+ vm_paddr_t gaddr;
+
+ if (meta->op == VM_SNAPSHOT_SAVE) {
+ gaddr = paddr_host2guest(meta->ctx, *addrp);
+ if (gaddr == (vm_paddr_t) -1) {
+ if (!restore_null ||
+ (restore_null && (*addrp != NULL))) {
+ ret = EFAULT;
+ goto done;
+ }
+ }
+
+ SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done);
+ } else if (meta->op == VM_SNAPSHOT_RESTORE) {
+ SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done);
+ if (gaddr == (vm_paddr_t) -1) {
+ if (!restore_null) {
+ ret = EFAULT;
+ goto done;
+ }
+ }
+
+ *addrp = paddr_guest2host(meta->ctx, gaddr, len);
+ } else {
+ ret = EINVAL;
+ }
+
+done:
+ return (ret);
+}
+
+int
+vm_snapshot_buf_cmp(volatile void *data, size_t data_size,
+ struct vm_snapshot_meta *meta)
+{
+ struct vm_snapshot_buffer *buffer;
+ int op;
+ int ret;
+
+ buffer = &meta->buffer;
+ op = meta->op;
+
+ if (buffer->buf_rem < data_size) {
+ fprintf(stderr, "%s: buffer too small\r\n", __func__);
+ ret = E2BIG;
+ goto done;
+ }
+
+ if (op == VM_SNAPSHOT_SAVE) {
+ ret = 0;
+ memcpy(buffer->buf, (uint8_t *) data, data_size);
+ } else if (op == VM_SNAPSHOT_RESTORE) {
+ ret = memcmp((uint8_t *) data, buffer->buf, data_size);
+ } else {
+ ret = EINVAL;
+ goto done;
+ }
+
+ buffer->buf += data_size;
+ buffer->buf_rem -= data_size;
+
+done:
+ return (ret);
+}
diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h
new file mode 100644
index 000000000000..f9ea3d573089
--- /dev/null
+++ b/usr.sbin/bhyve/snapshot.h
@@ -0,0 +1,105 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Flavius Anton
+ * Copyright (c) 2016 Mihai Tiganus
+ * Copyright (c) 2016-2019 Mihai Carabas
+ * Copyright (c) 2017-2019 Darius Mihai
+ * Copyright (c) 2017-2019 Elena Mihailescu
+ * Copyright (c) 2018-2019 Sergiu Weisz
+ * All rights reserved.
+ * The bhyve-snapshot feature was developed under sponsorships
+ * from Matthew Grooms.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _BHYVE_SNAPSHOT_
+#define _BHYVE_SNAPSHOT_
+
+#include <machine/vmm_snapshot.h>
+#include <libxo/xo.h>
+#include <ucl.h>
+
+struct vmctx;
+
+struct restore_state {
+ int kdata_fd;
+ int vmmem_fd;
+
+ void *kdata_map;
+ size_t kdata_len;
+
+ size_t vmmem_len;
+
+ struct ucl_parser *meta_parser;
+ ucl_object_t *meta_root_obj;
+};
+
+struct checkpoint_thread_info {
+ struct vmctx *ctx;
+ int socket_fd;
+};
+
+typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *);
+typedef int (*vm_pause_dev_cb) (struct vmctx *, const char *);
+typedef int (*vm_resume_dev_cb) (struct vmctx *, const char *);
+
+struct vm_snapshot_dev_info {
+ const char *dev_name; /* device name */
+ vm_snapshot_dev_cb snapshot_cb; /* callback for device snapshot */
+ vm_pause_dev_cb pause_cb; /* callback for device pause */
+ vm_resume_dev_cb resume_cb; /* callback for device resume */
+};
+
+struct vm_snapshot_kern_info {
+ const char *struct_name; /* kernel structure name*/
+ enum snapshot_req req; /* request type */
+};
+
+void destroy_restore_state(struct restore_state *rstate);
+
+const char *lookup_vmname(struct restore_state *rstate);
+int lookup_memflags(struct restore_state *rstate);
+size_t lookup_memsize(struct restore_state *rstate);
+int lookup_guest_ncpus(struct restore_state *rstate);
+
+void checkpoint_cpu_add(int vcpu);
+void checkpoint_cpu_resume(int vcpu);
+void checkpoint_cpu_suspend(int vcpu);
+
+int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate);
+int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate);
+
+int vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate);
+int vm_pause_user_devs(struct vmctx *ctx);
+int vm_resume_user_devs(struct vmctx *ctx);
+
+int get_checkpoint_msg(int conn_fd, struct vmctx *ctx);
+void *checkpoint_thread(void *param);
+int init_checkpoint_thread(struct vmctx *ctx);
+
+int load_restore_file(const char *filename, struct restore_state *rstate);
+
+#endif
diff --git a/usr.sbin/bhyve/uart_emul.c b/usr.sbin/bhyve/uart_emul.c
index 930344a52935..a89974590a1f 100644
--- a/usr.sbin/bhyve/uart_emul.c
+++ b/usr.sbin/bhyve/uart_emul.c
@@ -39,6 +39,8 @@ __FBSDID("$FreeBSD$");
#include <capsicum_helpers.h>
#endif
+#include <machine/vmm_snapshot.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
@@ -719,3 +721,35 @@ uart_set_backend(struct uart_softc *sc, const char *opts)
return (retval);
}
+
+#ifdef BHYVE_SNAPSHOT
+int
+uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->data, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->ier, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->lcr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->mcr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->lsr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->msr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->fcr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->scr, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->dll, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->dlh, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.rindex, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.windex, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.num, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.size, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(sc->rxfifo.buf, sizeof(sc->rxfifo.buf),
+ meta, ret, done);
+
+ sc->thre_int_pending = 1;
+
+done:
+ return (ret);
+}
+#endif
diff --git a/usr.sbin/bhyve/uart_emul.h b/usr.sbin/bhyve/uart_emul.h
index a87202df1f96..5a53294da89e 100644
--- a/usr.sbin/bhyve/uart_emul.h
+++ b/usr.sbin/bhyve/uart_emul.h
@@ -31,10 +31,10 @@
#ifndef _UART_EMUL_H_
#define _UART_EMUL_H_
-
#define UART_IO_BAR_SIZE 8
struct uart_softc;
+struct vm_snapshot_meta;
typedef void (*uart_intr_func_t)(void *arg);
struct uart_softc *uart_init(uart_intr_func_t intr_assert,
@@ -44,4 +44,7 @@ int uart_legacy_alloc(int unit, int *ioaddr, int *irq);
uint8_t uart_read(struct uart_softc *sc, int offset);
void uart_write(struct uart_softc *sc, int offset, uint8_t value);
int uart_set_backend(struct uart_softc *sc, const char *opt);
+#ifdef BHYVE_SNAPSHOT
+int uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta);
+#endif
#endif
diff --git a/usr.sbin/bhyve/usb_emul.h b/usr.sbin/bhyve/usb_emul.h
index c52411dd0650..d6e1e616cd82 100644
--- a/usr.sbin/bhyve/usb_emul.h
+++ b/usr.sbin/bhyve/usb_emul.h
@@ -41,10 +41,10 @@
#define USB_XFER_IN 1
-
struct usb_hci;
struct usb_device_request;
struct usb_data_xfer;
+struct vm_snapshot_meta;
/* Device emulation handlers */
struct usb_devemu {
@@ -62,6 +62,7 @@ struct usb_devemu {
int (*ue_reset)(void *sc);
int (*ue_remove)(void *sc);
int (*ue_stop)(void *sc);
+ int (*ue_snapshot)(void *scarg, struct vm_snapshot_meta *meta);
};
#define USB_EMUL_SET(x) DATA_SET(usb_emu_set, x);
@@ -148,7 +149,6 @@ enum USB_ERRCODE {
pthread_mutex_unlock(&((x)->mtx)); \
} while (0)
-
struct usb_devemu *usb_emu_finddev(char *name);
struct usb_data_xfer_block *usb_data_xfer_append(struct usb_data_xfer *xfer,
diff --git a/usr.sbin/bhyve/usb_mouse.c b/usr.sbin/bhyve/usb_mouse.c
index da3800c11fc2..5398da818c7f 100644
--- a/usr.sbin/bhyve/usb_mouse.c
+++ b/usr.sbin/bhyve/usb_mouse.c
@@ -31,6 +31,8 @@ __FBSDID("$FreeBSD$");
#include <sys/time.h>
+#include <machine/vmm_snapshot.h>
+
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
@@ -788,6 +790,29 @@ umouse_stop(void *scarg)
return (0);
}
+#ifdef BHYVE_SNAPSHOT
+static int
+umouse_snapshot(void *scarg, struct vm_snapshot_meta *meta)
+{
+ int ret;
+ struct umouse_softc *sc;
+
+ sc = scarg;
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->um_report, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->newdata, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->hid.idle, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->hid.protocol, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->hid.feature, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->polling, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->prev_evt.tv_sec, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->prev_evt.tv_usec, meta, ret, done);
+
+done:
+ return (ret);
+}
+#endif
struct usb_devemu ue_mouse = {
.ue_emu = "tablet",
@@ -798,6 +823,9 @@ struct usb_devemu ue_mouse = {
.ue_data = umouse_data_handler,
.ue_reset = umouse_reset,
.ue_remove = umouse_remove,
- .ue_stop = umouse_stop
+ .ue_stop = umouse_stop,
+#ifdef BHYVE_SNAPSHOT
+ .ue_snapshot = umouse_snapshot,
+#endif
};
USB_EMUL_SET(ue_mouse);
diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c
index d899a5779570..f3deb72b081c 100644
--- a/usr.sbin/bhyve/virtio.c
+++ b/usr.sbin/bhyve/virtio.c
@@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
#include <sys/uio.h>
#include <machine/atomic.h>
+#include <machine/vmm_snapshot.h>
#include <stdio.h>
#include <stdint.h>
@@ -806,3 +807,150 @@ done:
if (vs->vs_mtx)
pthread_mutex_unlock(vs->vs_mtx);
}
+
+#ifdef BHYVE_SNAPSHOT
+int
+vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi)
+{
+ struct virtio_softc *vs;
+ struct virtio_consts *vc;
+
+ vs = pi->pi_arg;
+ vc = vs->vs_vc;
+
+ vc = vs->vs_vc;
+ assert(vc->vc_pause != NULL);
+ (*vc->vc_pause)(DEV_SOFTC(vs));
+
+ return (0);
+}
+
+int
+vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi)
+{
+ struct virtio_softc *vs;
+ struct virtio_consts *vc;
+
+ vs = pi->pi_arg;
+ vc = vs->vs_vc;
+
+ vc = vs->vs_vc;
+ assert(vc->vc_resume != NULL);
+ (*vc->vc_resume)(DEV_SOFTC(vs));
+
+ return (0);
+}
+
+static int
+vi_pci_snapshot_softc(struct virtio_softc *vs, struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ SNAPSHOT_VAR_OR_LEAVE(vs->vs_flags, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vs->vs_negotiated_caps, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vs->vs_curq, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vs->vs_status, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vs->vs_isr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vs->vs_msix_cfg_idx, meta, ret, done);
+
+done:
+ return (ret);
+}
+
+static int
+vi_pci_snapshot_consts(struct virtio_consts *vc, struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_nvq, meta, ret, done);
+ SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_cfgsize, meta, ret, done);
+ SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_hv_caps, meta, ret, done);
+
+done:
+ return (ret);
+}
+
+static int
+vi_pci_snapshot_queues(struct virtio_softc *vs, struct vm_snapshot_meta *meta)
+{
+ int i;
+ int ret;
+ struct virtio_consts *vc;
+ struct vqueue_info *vq;
+ uint64_t addr_size;
+
+ vc = vs->vs_vc;
+
+ /* Save virtio queue info */
+ for (i = 0; i < vc->vc_nvq; i++) {
+ vq = &vs->vs_queues[i];
+
+ SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_qsize, meta, ret, done);
+ SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_num, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(vq->vq_flags, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vq->vq_last_avail, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vq->vq_next_used, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vq->vq_save_used, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vq->vq_msix_idx, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(vq->vq_pfn, meta, ret, done);
+
+ addr_size = vq->vq_qsize * sizeof(struct virtio_desc);
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_desc, addr_size,
+ false, meta, ret, done);
+
+ addr_size = (2 + vq->vq_qsize + 1) * sizeof(uint16_t);
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_avail, addr_size,
+ false, meta, ret, done);
+
+ addr_size = (2 + 2 * vq->vq_qsize + 1) * sizeof(uint16_t);
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_used, addr_size,
+ false, meta, ret, done);
+
+ SNAPSHOT_BUF_OR_LEAVE(vq->vq_desc, vring_size(vq->vq_qsize),
+ meta, ret, done);
+ }
+
+done:
+ return (ret);
+}
+
+int
+vi_pci_snapshot(struct vm_snapshot_meta *meta)
+{
+ int ret;
+ struct pci_devinst *pi;
+ struct virtio_softc *vs;
+ struct virtio_consts *vc;
+
+ pi = meta->dev_data;
+ vs = pi->pi_arg;
+ vc = vs->vs_vc;
+
+ /* Save virtio softc */
+ ret = vi_pci_snapshot_softc(vs, meta);
+ if (ret != 0)
+ goto done;
+
+ /* Save virtio consts */
+ ret = vi_pci_snapshot_consts(vc, meta);
+ if (ret != 0)
+ goto done;
+
+ /* Save virtio queue info */
+ ret = vi_pci_snapshot_queues(vs, meta);
+ if (ret != 0)
+ goto done;
+
+ /* Save device softc, if needed */
+ if (vc->vc_snapshot != NULL) {
+ ret = (*vc->vc_snapshot)(DEV_SOFTC(vs), meta);
+ if (ret != 0)
+ goto done;
+ }
+
+done:
+ return (ret);
+}
+#endif
diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h
index ab95d9d213e3..e9432e012b27 100644
--- a/usr.sbin/bhyve/virtio.h
+++ b/usr.sbin/bhyve/virtio.h
@@ -287,6 +287,7 @@ vring_size(u_int qsz)
struct vmctx;
struct pci_devinst;
struct vqueue_info;
+struct vm_snapshot_meta;
/*
* A virtual device, with some number (possibly 0) of virtual
@@ -361,6 +362,10 @@ struct virtio_consts {
void (*vc_apply_features)(void *, uint64_t);
/* called to apply negotiated features */
uint64_t vc_hv_caps; /* hypervisor-provided capabilities */
+ void (*vc_pause)(void *); /* called to pause device activity */
+ void (*vc_resume)(void *); /* called to resume device activity */
+ int (*vc_snapshot)(void *, struct vm_snapshot_meta *);
+ /* called to save / restore device state */
};
/*
@@ -491,4 +496,9 @@ uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size);
void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value);
+#ifdef BHYVE_SNAPSHOT
+int vi_pci_snapshot(struct vm_snapshot_meta *meta);
+int vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi);
+int vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi);
+#endif
#endif /* _VIRTIO_H_ */
diff --git a/usr.sbin/bhyvectl/Makefile b/usr.sbin/bhyvectl/Makefile
index 0ffca5675cb9..58eaf49dae3a 100644
--- a/usr.sbin/bhyvectl/Makefile
+++ b/usr.sbin/bhyvectl/Makefile
@@ -2,6 +2,8 @@
# $FreeBSD$
#
+.include <src.opts.mk>
+
PROG= bhyvectl
SRCS= bhyvectl.c
PACKAGE= bhyve
@@ -14,4 +16,8 @@ WARNS?= 3
CFLAGS+= -I${SRCTOP}/sys/amd64/vmm
+.if ${MK_BHYVE_SNAPSHOT} != "no"
+CFLAGS+= -DBHYVE_SNAPSHOT
+.endif
+
.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyvectl/bhyvectl.8 b/usr.sbin/bhyvectl/bhyvectl.8
index 035f9f6c7586..6adf87ca4537 100644
--- a/usr.sbin/bhyvectl/bhyvectl.8
+++ b/usr.sbin/bhyvectl/bhyvectl.8
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd November 13, 2016
+.Dd May 04, 2020
.Dt BHYVECTL 8
.Os
.Sh NAME
@@ -39,6 +39,8 @@
.Op Fl -inject-nmi
.Op Fl -force-reset
.Op Fl -force-poweroff
+.Op Fl -checkpoint= Ns Ar <filename>
+.Op Fl -suspend= Ns Ar <filename>
.Sh DESCRIPTION
The
.Nm
@@ -72,6 +74,17 @@ Inject a non-maskable interrupt (NMI) into the VM.
Force the VM to reset.
.It Fl -force-poweroff
Force the VM to power off.
+.It Fl -checkpoint= Ns Ar <filename>
+Save a snapshot of a virtual machine.
+The guest memory contents are saved in the file given in
+.Ar <filename> .
+The guest device and vCPU state are saved in the file
+.Ar <filename>.kern .
+.It Fl -suspend= Ns Ar <filename>
+Save a snapshot of a virtual machine similar to
+.Fl -checkpoint .
+The virtual machine will terminate after the snapshot has been
+saved.
.El
.Sh EXIT STATUS
.Ex -std
@@ -79,6 +92,10 @@ Force the VM to power off.
Destroy the VM called fbsd10:
.Pp
.Dl "bhyvectl --vm=fbsd10 --destroy"
+.Sh COMPATIBILITY
+The snapshot file format is not yet stable and is subject to future changes.
+Backwards compatibility support for the current snapshot file format is not
+guaranteed when future changes are made.
.Sh SEE ALSO
.Xr bhyve 8 ,
.Xr bhyveload 8
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
index 8274e6eafccb..d2c4a1488fe8 100644
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -57,6 +57,9 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm_dev.h>
#include <vmmapi.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
#include "amd/vmcb.h"
#include "intel/vmcs.h"
@@ -67,6 +70,9 @@ __FBSDID("$FreeBSD$");
#define NO_ARG no_argument
#define OPT_ARG optional_argument
+#define CHECKPOINT_RUN_DIR "/var/run/bhyve/checkpoint"
+#define MAX_VMNAME 100
+
static const char *progname;
static void
@@ -78,6 +84,10 @@ usage(bool cpu_intel)
" [--cpu=<vcpu_number>]\n"
" [--create]\n"
" [--destroy]\n"
+#ifdef BHYVE_SNAPSHOT
+ " [--checkpoint=<filename>]\n"
+ " [--suspend=<filename>]\n"
+#endif
" [--get-all]\n"
" [--get-stats]\n"
" [--set-desc-ds]\n"
@@ -287,6 +297,10 @@ enum x2apic_state x2apic_state;
static int unassign_pptdev, bus, slot, func;
static int run;
static int get_cpu_topology;
+#ifdef BHYVE_SNAPSHOT
+static int vm_checkpoint_opt;
+static int vm_suspend_opt;
+#endif
/*
* VMCB specific.
@@ -591,6 +605,10 @@ enum {
SET_RTC_TIME,
SET_RTC_NVRAM,
RTC_NVRAM_OFFSET,
+#ifdef BHYVE_SNAPSHOT
+ SET_CHECKPOINT_FILE,
+ SET_SUSPEND_FILE,
+#endif
};
static void
@@ -1461,6 +1479,10 @@ setup_options(bool cpu_intel)
{ "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 },
{ "get-intinfo", NO_ARG, &get_intinfo, 1 },
{ "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 },
+#ifdef BHYVE_SNAPSHOT
+ { "checkpoint", REQ_ARG, 0, SET_CHECKPOINT_FILE},
+ { "suspend", REQ_ARG, 0, SET_SUSPEND_FILE},
+#endif
};
const struct option intel_opts[] = {
@@ -1678,6 +1700,82 @@ show_memseg(struct vmctx *ctx)
}
}
+#ifdef BHYVE_SNAPSHOT
+static int
+send_checkpoint_op_req(struct vmctx *ctx, struct checkpoint_op *op)
+{
+ struct sockaddr_un addr;
+ int socket_fd, len, len_sent, total_sent;
+ int err = 0;
+ char vmname_buf[MAX_VMNAME];
+
+ socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
+ if (socket_fd < 0) {
+ perror("Error creating bhyvectl socket");
+ err = -1;
+ goto done;
+ }
+
+ memset(&addr, 0, sizeof(struct sockaddr_un));
+ addr.sun_family = AF_UNIX;
+
+ err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1);
+ if (err != 0) {
+ perror("Failed to get VM name");
+ goto done;
+ }
+
+ snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s", CHECKPOINT_RUN_DIR, vmname_buf);
+
+ if (connect(socket_fd, (struct sockaddr *)&addr,
+ sizeof(struct sockaddr_un)) != 0) {
+ perror("Connect to VM socket failed");
+ err = -1;
+ goto done;
+ }
+
+ len = sizeof(*op);
+ total_sent = 0;
+ while ((len_sent = send(socket_fd, (char *)op + total_sent, len - total_sent, 0)) > 0) {
+ total_sent += len_sent;
+ }
+
+ if (len_sent < 0) {
+ perror("Failed to send checkpoint operation request");
+ err = -1;
+ }
+
+done:
+ if (socket_fd > 0)
+ close(socket_fd);
+ return (err);
+}
+
+static int
+send_start_checkpoint(struct vmctx *ctx, const char *checkpoint_file)
+{
+ struct checkpoint_op op;
+
+ op.op = START_CHECKPOINT;
+ strncpy(op.snapshot_filename, checkpoint_file, MAX_SNAPSHOT_VMNAME);
+ op.snapshot_filename[MAX_SNAPSHOT_VMNAME - 1] = 0;
+
+ return (send_checkpoint_op_req(ctx, &op));
+}
+
+static int
+send_start_suspend(struct vmctx *ctx, const char *suspend_file)
+{
+ struct checkpoint_op op;
+
+ op.op = START_SUSPEND;
+ strncpy(op.snapshot_filename, suspend_file, MAX_SNAPSHOT_VMNAME);
+ op.snapshot_filename[MAX_SNAPSHOT_VMNAME - 1] = 0;
+
+ return (send_checkpoint_op_req(ctx, &op));
+}
+#endif
+
int
main(int argc, char *argv[])
{
@@ -1694,6 +1792,9 @@ main(int argc, char *argv[])
uint64_t cs, ds, es, fs, gs, ss, tr, ldtr;
struct tm tm;
struct option *opts;
+#ifdef BHYVE_SNAPSHOT
+ char *checkpoint_file, *suspend_file;
+#endif
cpu_intel = cpu_vendor_intel();
opts = setup_options(cpu_intel);
@@ -1860,6 +1961,16 @@ main(int argc, char *argv[])
case ASSERT_LAPIC_LVT:
assert_lapic_lvt = atoi(optarg);
break;
+#ifdef BHYVE_SNAPSHOT
+ case SET_CHECKPOINT_FILE:
+ vm_checkpoint_opt = 1;
+ checkpoint_file = optarg;
+ break;
+ case SET_SUSPEND_FILE:
+ vm_suspend_opt = 1;
+ suspend_file = optarg;
+ break;
+#endif
default:
usage(cpu_intel);
}
@@ -2345,6 +2456,14 @@ main(int argc, char *argv[])
if (!error && destroy)
vm_destroy(ctx);
+#ifdef BHYVE_SNAPSHOT
+ if (!error && vm_checkpoint_opt)
+ error = send_start_checkpoint(ctx, checkpoint_file);
+
+ if (!error && vm_suspend_opt)
+ error = send_start_suspend(ctx, suspend_file);
+#endif
+
free (opts);
exit(error);
}