aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Baldwin <jhb@FreeBSD.org>2020-05-05 00:02:04 +0000
committerJohn Baldwin <jhb@FreeBSD.org>2020-05-05 00:02:04 +0000
commit483d953a86a2507355f8287c5107dc827a0ff516 (patch)
treeb74b1559959b80cd48bffb9763b67959fd4c182b
parent51a5392297a7a014b1cf367922359cd54fb7a393 (diff)
downloadsrc-483d953a86a.tar.gz
src-483d953a86a.zip
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot to be taken of a guest's state that can later be resumed. In the current implementation, bhyve(8) creates a UNIX domain socket that is used by bhyvectl(8) to send a request to save a snapshot (and optionally exit after the snapshot has been taken). A snapshot currently consists of two files: the first holds a copy of guest RAM, and the second file holds other guest state such as vCPU register values and device model state. To resume a guest, bhyve(8) must be started with a matching pair of command line arguments to instantiate the same set of device models as well as a pointer to the saved snapshot. While the current implementation is useful for several use cases, it has a few limitations. The file format for saving the guest state is tied to the ABI of internal bhyve structures and is not self-describing (in that it does not communicate the set of device models present in the system). In addition, the state saved for some device models closely matches the internal data structures which might prove a challenge for compatibility of snapshot files across a range of bhyve versions. The file format also does not currently support versioning of individual chunks of state. As a result, the current file format is not a fixed binary format and future revisions to save and restore will break binary compatibility of snapshot files. The goal is to move to a more flexible format that adds versioning, etc. and at that point to commit to providing a reasonable level of compatibility. As a result, the current implementation is not enabled by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option for userland builds, and the kernel option BHYVE_SNAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz Relnotes: yes Sponsored by: University Politehnica of Bucharest Sponsored by: Matthew Grooms (student scholarships) Sponsored by: iXsystems Differential Revision: https://reviews.freebsd.org/D19495
Notes
Notes: svn path=/head/; revision=360648
-rw-r--r--lib/libvmmapi/vmmapi.c65
-rw-r--r--lib/libvmmapi/vmmapi.h29
-rw-r--r--share/man/man5/src.conf.59
-rw-r--r--share/mk/src.opts.mk1
-rw-r--r--sys/amd64/include/vmm.h24
-rw-r--r--sys/amd64/include/vmm_dev.h11
-rw-r--r--sys/amd64/include/vmm_snapshot.h156
-rw-r--r--sys/amd64/vmm/amd/svm.c357
-rw-r--r--sys/amd64/vmm/amd/svm.h4
-rw-r--r--sys/amd64/vmm/amd/svm_msr.c7
-rw-r--r--sys/amd64/vmm/amd/vmcb.c106
-rw-r--r--sys/amd64/vmm/amd/vmcb.h14
-rw-r--r--sys/amd64/vmm/intel/vmcs.c124
-rw-r--r--sys/amd64/vmm/intel/vmcs.h13
-rw-r--r--sys/amd64/vmm/intel/vmx.c164
-rw-r--r--sys/amd64/vmm/io/vatpic.c43
-rw-r--r--sys/amd64/vmm/io/vatpic.h6
-rw-r--r--sys/amd64/vmm/io/vatpit.c42
-rw-r--r--sys/amd64/vmm/io/vatpit.h5
-rw-r--r--sys/amd64/vmm/io/vhpet.c49
-rw-r--r--sys/amd64/vmm/io/vhpet.h6
-rw-r--r--sys/amd64/vmm/io/vioapic.c22
-rw-r--r--sys/amd64/vmm/io/vioapic.h7
-rw-r--r--sys/amd64/vmm/io/vlapic.c106
-rw-r--r--sys/amd64/vmm/io/vlapic.h6
-rw-r--r--sys/amd64/vmm/io/vpmtmr.c16
-rw-r--r--sys/amd64/vmm/io/vpmtmr.h5
-rw-r--r--sys/amd64/vmm/io/vrtc.c45
-rw-r--r--sys/amd64/vmm/io/vrtc.h5
-rw-r--r--sys/amd64/vmm/vmm.c194
-rw-r--r--sys/amd64/vmm/vmm_dev.c17
-rw-r--r--sys/amd64/vmm/vmm_snapshot.c141
-rw-r--r--sys/conf/config.mk7
-rw-r--r--sys/conf/kern.opts.mk1
-rw-r--r--sys/conf/options.amd641
-rw-r--r--sys/modules/vmm/Makefile9
-rw-r--r--tools/build/options/WITH_BHYVE_SNAPSHOT7
-rw-r--r--usr.sbin/bhyve/Makefile15
-rw-r--r--usr.sbin/bhyve/Makefile.depend2
-rw-r--r--usr.sbin/bhyve/atkbdc.c46
-rw-r--r--usr.sbin/bhyve/atkbdc.h5
-rw-r--r--usr.sbin/bhyve/bhyve.818
-rw-r--r--usr.sbin/bhyve/bhyverun.c196
-rw-r--r--usr.sbin/bhyve/bhyverun.h5
-rw-r--r--usr.sbin/bhyve/block_if.c143
-rw-r--r--usr.sbin/bhyve/block_if.h11
-rw-r--r--usr.sbin/bhyve/mevent.c2
-rw-r--r--usr.sbin/bhyve/pci_ahci.c302
-rw-r--r--usr.sbin/bhyve/pci_e82545.c161
-rw-r--r--usr.sbin/bhyve/pci_emul.c208
-rw-r--r--usr.sbin/bhyve/pci_emul.h11
-rw-r--r--usr.sbin/bhyve/pci_fbuf.c19
-rw-r--r--usr.sbin/bhyve/pci_lpc.c26
-rw-r--r--usr.sbin/bhyve/pci_virtio_block.c51
-rw-r--r--usr.sbin/bhyve/pci_virtio_net.c83
-rw-r--r--usr.sbin/bhyve/pci_xhci.c264
-rw-r--r--usr.sbin/bhyve/ps2kbd.c17
-rw-r--r--usr.sbin/bhyve/ps2kbd.h5
-rw-r--r--usr.sbin/bhyve/ps2mouse.c24
-rw-r--r--usr.sbin/bhyve/ps2mouse.h5
-rw-r--r--usr.sbin/bhyve/snapshot.c1742
-rw-r--r--usr.sbin/bhyve/snapshot.h105
-rw-r--r--usr.sbin/bhyve/uart_emul.c34
-rw-r--r--usr.sbin/bhyve/uart_emul.h5
-rw-r--r--usr.sbin/bhyve/usb_emul.h4
-rw-r--r--usr.sbin/bhyve/usb_mouse.c30
-rw-r--r--usr.sbin/bhyve/virtio.c148
-rw-r--r--usr.sbin/bhyve/virtio.h10
-rw-r--r--usr.sbin/bhyvectl/Makefile6
-rw-r--r--usr.sbin/bhyvectl/bhyvectl.819
-rw-r--r--usr.sbin/bhyvectl/bhyvectl.c119
71 files changed, 5616 insertions, 49 deletions
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 11f38c926cc3..7347c41dd311 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>
#include <errno.h>
+#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
@@ -53,8 +54,10 @@ __FBSDID("$FreeBSD$");
#include <libutil.h>
+#include <vm/vm.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
+#include <machine/vmm_snapshot.h>
#include "vmmapi.h"
@@ -238,6 +241,17 @@ vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
}
int
+vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
+ size_t *lowmem_size, size_t *highmem_size)
+{
+ /* Report ctx's guest memory base address and its low/high memory sizes through the out-parameters; always returns 0. */
+ *guest_baseaddr = ctx->baseaddr;
+ *lowmem_size = ctx->lowmem;
+ *highmem_size = ctx->highmem;
+ return (0);
+}
+
+int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
@@ -448,6 +462,34 @@ vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
return (NULL);
}
+vm_paddr_t
+vm_rev_map_gpa(struct vmctx *ctx, void *addr)
+{
+ vm_paddr_t offaddr;
+ /* Inverse of vm_map_gpa(): turn a host pointer into a guest physical address, or (vm_paddr_t)-1 if it is outside guest RAM. */
+ offaddr = (char *)addr - ctx->baseaddr;
+ /* Low memory covers offsets [0, lowmem); the bound is exclusive, like the high-memory test below, and no ">= 0" test is needed on an unsigned offset. */
+ if (ctx->lowmem > 0)
+ if (offaddr < ctx->lowmem)
+ return (offaddr);
+ /* High memory covers offsets [4GB, 4GB + highmem). */
+ if (ctx->highmem > 0)
+ if (offaddr >= 4*GB && offaddr < 4*GB + ctx->highmem)
+ return (offaddr);
+
+ return ((vm_paddr_t)-1);
+}
+
+/* Copy the VM name held in ctx into buf; returns EINVAL if it does not fit in max_len bytes, else 0. TODO: maximum size for vmname */
+int
+vm_get_name(struct vmctx *ctx, char *buf, size_t max_len)
+{
+ /* strlcpy() returns the length of the source string, so a result >= max_len means the copy was truncated. */
+ if (strlcpy(buf, ctx->name, max_len) >= max_len)
+ return (EINVAL);
+ return (0);
+}
+
size_t
vm_get_lowmem_size(struct vmctx *ctx)
{
@@ -1502,6 +1544,29 @@ vm_restart_instruction(void *arg, int vcpu)
}
int
+vm_snapshot_req(struct vm_snapshot_meta *meta)
+{
+ /* Issue the VM_SNAPSHOT_REQ ioctl on the fd of meta->ctx, passing meta itself; returns 0 on success and -1 if the ioctl fails. */
+ if (ioctl(meta->ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
+#ifdef SNAPSHOT_DEBUG
+ fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
+ __func__, meta->dev_name, errno);
+#endif
+ return (-1);
+ }
+ return (0);
+}
+
+int
+vm_restore_time(struct vmctx *ctx)
+{
+ int dummy;
+ /* VM_RESTORE_TIME is declared with an int argument, so a zeroed placeholder is passed; the ioctl() result is returned unchanged. */
+ dummy = 0;
+ return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
+}
+
+int
vm_set_topology(struct vmctx *ctx,
uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 9819cda16bd9..2b026031b50f 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -33,6 +33,7 @@
#include <sys/param.h>
#include <sys/cpuset.h>
+#include <machine/vmm_dev.h>
/*
* API version for out-of-tree consumers like grub-bhyve for making compile
@@ -42,6 +43,7 @@
struct iovec;
struct vmctx;
+struct vm_snapshot_meta;
enum x2apic_state;
/*
@@ -88,6 +90,10 @@ int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name,
*/
int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
+
+int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
+ size_t *lowmem_size, size_t *highmem_size);
+
/*
* Create a device memory segment identified by 'segid'.
*
@@ -110,6 +116,8 @@ void vm_destroy(struct vmctx *ctx);
int vm_parse_memsize(const char *optarg, size_t *memsize);
int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
+/* inverse operation to vm_map_gpa - extract guest address from host pointer */
+vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr);
int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, int prot, uint64_t *gpa, int *fault);
@@ -120,6 +128,7 @@ uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
void vm_set_memflags(struct vmctx *ctx, int flags);
int vm_get_memflags(struct vmctx *ctx);
+int vm_get_name(struct vmctx *ctx, char *buffer, size_t max_len);
size_t vm_get_lowmem_size(struct vmctx *ctx);
size_t vm_get_highmem_size(struct vmctx *ctx);
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
@@ -237,4 +246,24 @@ int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu,
uint32_t eip, uint32_t gdtbase,
uint32_t esp);
void vm_setup_freebsd_gdt(uint64_t *gdtr);
+
+/*
+ * Save and restore
+ */
+
+#define MAX_SNAPSHOT_VMNAME 100 /* byte size of checkpoint_op.snapshot_filename */
+/* Request codes; presumably carried in checkpoint_op.op (snapshot only vs. snapshot and then stop) -- TODO confirm against the sender in bhyvectl. */
+enum checkpoint_opcodes {
+ START_CHECKPOINT = 0,
+ START_SUSPEND = 1,
+};
+
+struct checkpoint_op {
+ unsigned int op; /* presumably an enum checkpoint_opcodes value -- TODO confirm */
+ char snapshot_filename[MAX_SNAPSHOT_VMNAME]; /* NOTE(review): holds a file name but is sized by a "VMNAME" constant */
+};
+
+int vm_snapshot_req(struct vm_snapshot_meta *meta);
+int vm_restore_time(struct vmctx *ctx);
+
#endif /* _VMMAPI_H_ */
diff --git a/share/man/man5/src.conf.5 b/share/man/man5/src.conf.5
index da3a1f9c3044..4d28f019e100 100644
--- a/share/man/man5/src.conf.5
+++ b/share/man/man5/src.conf.5
@@ -1,6 +1,6 @@
.\" DO NOT EDIT-- this file is @generated by tools/build/options/makeman.
.\" $FreeBSD$
-.Dd April 30, 2020
+.Dd May 4, 2020
.Dt SRC.CONF 5
.Os
.Sh NAME
@@ -170,6 +170,13 @@ Set to not build or install
associated utilities, and examples.
.Pp
This option only affects amd64/amd64.
+.It Va WITH_BHYVE_SNAPSHOT
+Set to include support for save and restore (snapshots) in
+.Xr bhyve 8
+and
+.Xr bhyvectl 8 .
+.Pp
+This option only affects amd64/amd64.
.It Va WITH_BIND_NOW
Build all binaries with the
.Dv DF_BIND_NOW
diff --git a/share/mk/src.opts.mk b/share/mk/src.opts.mk
index fc03abf50c61..5b2c76452a27 100644
--- a/share/mk/src.opts.mk
+++ b/share/mk/src.opts.mk
@@ -200,6 +200,7 @@ __DEFAULT_YES_OPTIONS = \
__DEFAULT_NO_OPTIONS = \
BEARSSL \
+ BHYVE_SNAPSHOT \
BSD_GREP \
CLANG_EXTRAS \
DTRACE_TESTS \
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index a08c90ed20be..70909510c983 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -34,6 +34,8 @@
#include <sys/sdt.h>
#include <x86/segments.h>
+struct vm_snapshot_meta;
+
#ifdef _KERNEL
SDT_PROVIDER_DECLARE(vmm);
#endif
@@ -152,6 +154,7 @@ struct vmspace;
struct vm_object;
struct vm_guest_paging;
struct pmap;
+enum snapshot_req;
struct vm_eventinfo {
void *rptr; /* rendezvous cookie */
@@ -180,6 +183,10 @@ typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu);
typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);
+typedef int (*vmi_snapshot_t)(void *vmi, struct vm_snapshot_meta *meta);
+typedef int (*vmi_snapshot_vmcx_t)(void *vmi, struct vm_snapshot_meta *meta,
+ int vcpu);
+typedef int (*vmi_restore_tsc_t)(void *vmi, int vcpuid, uint64_t now);
struct vmm_ops {
vmm_init_func_t init; /* module wide initialization */
@@ -199,6 +206,11 @@ struct vmm_ops {
vmi_vmspace_free vmspace_free;
vmi_vlapic_init vlapic_init;
vmi_vlapic_cleanup vlapic_cleanup;
+
+ /* checkpoint operations */
+ vmi_snapshot_t vmsnapshot;
+ vmi_snapshot_vmcx_t vmcx_snapshot;
+ vmi_restore_tsc_t vm_restore_tsc;
};
extern struct vmm_ops vmm_ops_intel;
@@ -272,6 +284,9 @@ void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);
+int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta);
+int vm_restore_time(struct vm *vm);
+
#ifdef _SYS__CPUSET_H_
/*
@@ -409,6 +424,15 @@ int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
+/*
+ * Function used to keep track of the guest's TSC offset. The
+ * offset is used by the virtualization extensions to provide a consistent
+ * value for the Time Stamp Counter to the guest.
+ *
+ * Return value is 0 on success and non-zero on failure.
+ */
+int vm_set_tsc_offset(struct vm *vm, int vcpu_id, uint64_t offset);
+
enum vm_reg_name vm_segment_name(int seg_encoding);
struct vm_copyinfo {
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index bd806e7678f4..21775b70718e 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -31,6 +31,8 @@
#ifndef _VMM_DEV_H_
#define _VMM_DEV_H_
+struct vm_snapshot_meta;
+
#ifdef _KERNEL
void vmmdev_init(void);
int vmmdev_cleanup(void);
@@ -312,6 +314,11 @@ enum {
IOCNUM_RTC_WRITE = 101,
IOCNUM_RTC_SETTIME = 102,
IOCNUM_RTC_GETTIME = 103,
+
+ /* checkpoint */
+ IOCNUM_SNAPSHOT_REQ = 113,
+
+ IOCNUM_RESTORE_TIME = 115
};
#define VM_RUN \
@@ -422,4 +429,8 @@ enum {
_IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time)
#define VM_RESTART_INSTRUCTION \
_IOW('v', IOCNUM_RESTART_INSTRUCTION, int)
+#define VM_SNAPSHOT_REQ \
+ _IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta)
+#define VM_RESTORE_TIME \
+ _IOWR('v', IOCNUM_RESTORE_TIME, int)
#endif
diff --git a/sys/amd64/include/vmm_snapshot.h b/sys/amd64/include/vmm_snapshot.h
new file mode 100644
index 000000000000..6ba25a5dae2e
--- /dev/null
+++ b/sys/amd64/include/vmm_snapshot.h
@@ -0,0 +1,156 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Flavius Anton
+ * Copyright (c) 2016 Mihai Tiganus
+ * Copyright (c) 2016-2019 Mihai Carabas
+ * Copyright (c) 2017-2019 Darius Mihai
+ * Copyright (c) 2017-2019 Elena Mihailescu
+ * Copyright (c) 2018-2019 Sergiu Weisz
+ * All rights reserved.
+ * The bhyve-snapshot feature was developed under sponsorships
+ * from Matthew Grooms.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_SNAPSHOT_
+#define _VMM_SNAPSHOT_
+
+#include <sys/errno.h>
+#include <sys/types.h>
+#ifndef _KERNEL
+#include <stdbool.h>
+#endif
+
+struct vmctx;
+
+enum snapshot_req { /* names the kernel-side state a request targets; stored in vm_snapshot_meta.dev_req */
+ STRUCT_VMX, /* NOTE(review): values are implicit, so inserting or reordering entries renumbers the ones after it */
+ STRUCT_VIOAPIC,
+ STRUCT_VM,
+ STRUCT_VLAPIC,
+ VM_MEM,
+ STRUCT_VHPET,
+ STRUCT_VMCX,
+ STRUCT_VATPIC,
+ STRUCT_VATPIT,
+ STRUCT_VPMTMR,
+ STRUCT_VRTC,
+};
+
+struct vm_snapshot_buffer {
+ /*
+ * R/O for device-specific functions;
+ * written by generic snapshot functions.
+ */
+ uint8_t *const buf_start; /* start of the buffer */
+ const size_t buf_size; /* total size of the buffer */
+
+ /*
+ * R/W for device-specific functions used to keep track of buffer
+ * current position and remaining size.
+ */
+ uint8_t *buf; /* current position within the buffer */
+ size_t buf_rem; /* size remaining from the current position */
+
+ /*
+ * Length of the snapshot is either determined as (buf_size - buf_rem)
+ * or (buf - buf_start) -- the second variation returns a signed value
+ * so it may not be appropriate.
+ *
+ * Use vm_get_snapshot_size(meta).
+ */
+};
+
+enum vm_snapshot_op {
+ VM_SNAPSHOT_SAVE, /* current state is read and handed to the snapshot buffer */
+ VM_SNAPSHOT_RESTORE, /* state is taken from the snapshot buffer and applied */
+};
+
+struct vm_snapshot_meta {
+ struct vmctx *ctx; /* VM context; userland vm_snapshot_req() issues its ioctl on ctx->fd */
+ void *dev_data; /* opaque pointer; presumably the device's own state object -- TODO confirm */
+ const char *dev_name; /* identify userspace devices */
+ enum snapshot_req dev_req; /* identify kernel structs */
+
+ struct vm_snapshot_buffer buffer; /* data buffer plus position tracking for this request */
+
+ enum vm_snapshot_op op; /* direction of the request: save or restore */
+};
+
+
+void vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op);
+int vm_snapshot_buf(volatile void *data, size_t data_size,
+ struct vm_snapshot_meta *meta);
+size_t vm_get_snapshot_size(struct vm_snapshot_meta *meta);
+int vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null,
+ struct vm_snapshot_meta *meta);
+int vm_snapshot_buf_cmp(volatile void *data, size_t data_size,
+ struct vm_snapshot_meta *meta);
+/* Run vm_snapshot_buf() on LEN bytes at DATA; on a non-zero result (left in RES) report it via vm_snapshot_buf_err() and jump to LABEL. */
+#define SNAPSHOT_BUF_OR_LEAVE(DATA, LEN, META, RES, LABEL) \
+do { \
+ (RES) = vm_snapshot_buf((DATA), (LEN), (META)); \
+ if ((RES) != 0) { \
+ vm_snapshot_buf_err(#DATA, (META)->op); \
+ goto LABEL; \
+ } \
+} while (0)
+/* Single-variable form of SNAPSHOT_BUF_OR_LEAVE: passes the address and sizeof of DATA, so DATA must be an addressable lvalue. */
+#define SNAPSHOT_VAR_OR_LEAVE(DATA, META, RES, LABEL) \
+ SNAPSHOT_BUF_OR_LEAVE(&(DATA), sizeof(DATA), (META), (RES), LABEL)
+
+/*
+ * Address variables are pointers to guest memory.
+ * An EFAULT result is reported with fprintf(stderr, ...) before leaving.
+ * When RNULL != 0, do not enforce invalid address checks; instead, make the
+ * pointer NULL at restore time.
+ */
+#define SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ADDR, LEN, RNULL, META, RES, LABEL) \
+do { \
+ (RES) = vm_snapshot_guest2host_addr((void **)&(ADDR), (LEN), (RNULL), \
+ (META)); \
+ if ((RES) != 0) { \
+ if ((RES) == EFAULT) \
+ fprintf(stderr, "%s: invalid address: %s\r\n", \
+ __func__, #ADDR); \
+ goto LABEL; \
+ } \
+} while (0)
+
+/* Compare LEN bytes at DATA with the value in the meta buffer; a non-zero result is reported and leaves via LABEL. */
+#define SNAPSHOT_BUF_CMP_OR_LEAVE(DATA, LEN, META, RES, LABEL) \
+do { \
+ (RES) = vm_snapshot_buf_cmp((DATA), (LEN), (META)); \
+ if ((RES) != 0) { \
+ vm_snapshot_buf_err(#DATA, (META)->op); \
+ goto LABEL; \
+ } \
+} while (0)
+/* Single-variable form of SNAPSHOT_BUF_CMP_OR_LEAVE. */
+#define SNAPSHOT_VAR_CMP_OR_LEAVE(DATA, META, RES, LABEL) \
+ SNAPSHOT_BUF_CMP_OR_LEAVE(&(DATA), sizeof(DATA), (META), (RES), LABEL)
+
+#endif
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
index d3ba62b4b19c..f9660024fe0c 100644
--- a/sys/amd64/vmm/amd/svm.c
+++ b/sys/amd64/vmm/amd/svm.c
@@ -29,6 +29,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
@@ -50,6 +52,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
+#include <machine/vmm_snapshot.h>
#include "vmm_lapic.h"
#include "vmm_stat.h"
@@ -276,6 +279,25 @@ svm_restore(void)
svm_enable(NULL);
}
+#ifdef BHYVE_SNAPSHOT
+int
+svm_set_tsc_offset(struct svm_softc *sc, int vcpu, uint64_t offset)
+{
+ int error;
+ struct vmcb_ctrl *ctrl;
+ /* Write the new TSC offset into this vCPU's VMCB control area. */
+ ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+ ctrl->tsc_offset = offset;
+ /* Mark VMCB_CACHE_I dirty for this vCPU -- presumably so the updated control field is not masked by VMCB state caching (TODO confirm). */
+ svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
+ VCPU_CTR1(sc->vm, vcpu, "tsc offset changed to %#lx", offset);
+ /* Also hand the offset to the generic VM layer, which keeps track of it; its status is this function's return value. */
+ error = vm_set_tsc_offset(sc->vm, vcpu, offset);
+
+ return (error);
+}
+#endif
+
/* Pentium compatible MSRs */
#define MSR_PENTIUM_START 0
#define MSR_PENTIUM_END 0x1FFF
@@ -2203,6 +2225,36 @@ svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
return (EINVAL);
}
+#ifdef BHYVE_SNAPSHOT
+static int
+svm_snapshot_reg(void *arg, int vcpu, int ident,
+ struct vm_snapshot_meta *meta)
+{
+ int ret;
+ uint64_t val;
+ /* Move one 64-bit register value between the vCPU and the snapshot buffer; direction follows meta->op, any other op yields EINVAL. */
+ if (meta->op == VM_SNAPSHOT_SAVE) {
+ ret = svm_getreg(arg, vcpu, ident, &val);
+ if (ret != 0)
+ goto done;
+ /* Save: hand the value just read to the snapshot buffer. */
+ SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
+ } else if (meta->op == VM_SNAPSHOT_RESTORE) {
+ SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
+ /* Restore: val is expected to have been filled from the buffer by the macro above before it is written to the register. */
+ ret = svm_setreg(arg, vcpu, ident, val);
+ if (ret != 0)
+ goto done;
+ } else {
+ ret = EINVAL;
+ goto done;
+ }
+
+done:
+ return (ret);
+}
+#endif
+
static int
svm_setcap(void *arg, int vcpu, int type, int val)
{
@@ -2285,6 +2337,306 @@ svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
free(vlapic, M_SVM_VLAPIC);
}
+#ifdef BHYVE_SNAPSHOT
+static int
+svm_snapshot_vmi(void *arg, struct vm_snapshot_meta *meta)
+{
+ /* struct svm_softc is AMD's representation for SVM softc */
+ struct svm_softc *sc;
+ struct svm_vcpu *vcpu;
+ struct vmcb *vmcb;
+ uint64_t val;
+ int i;
+ int ret;
+ /* Pass the softc-wide nptp value and the state of all VM_MAXCPU vCPUs through the snapshot buffer; the same sequence serves both save and restore. */
+ sc = arg;
+
+ KASSERT(sc != NULL, ("%s: arg was NULL", __func__));
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->nptp, meta, ret, done);
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vcpu = &sc->vcpu[i];
+ vmcb = &vcpu->vmcb;
+
+ /* VMCB fields for virtual cpu i */
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.v_tpr, meta, ret, done); /* NOTE(review): v_tpr also goes through 'val' just below, so it is passed to the buffer twice */
+ val = vmcb->ctrl.v_tpr;
+ SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
+ vmcb->ctrl.v_tpr = val;
+ /* np_enable and intr_shadow are routed through 'val' instead of being passed directly -- presumably bit-fields, whose address cannot be taken (confirm in vmcb.h). */
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.asid, meta, ret, done);
+ val = vmcb->ctrl.np_enable;
+ SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
+ vmcb->ctrl.np_enable = val;
+
+ val = vmcb->ctrl.intr_shadow;
+ SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
+ vmcb->ctrl.intr_shadow = val;
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.tlb_ctrl, meta, ret, done);
+
+ SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad1,
+ sizeof(vmcb->state.pad1),
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cpl, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad2,
+ sizeof(vmcb->state.pad2),
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.efer, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad3,
+ sizeof(vmcb->state.pad3),
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr4, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr3, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr0, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dr7, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dr6, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rflags, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rip, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad4,
+ sizeof(vmcb->state.pad4),
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rsp, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad5,
+ sizeof(vmcb->state.pad5),
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rax, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.star, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.lstar, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cstar, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sfmask, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.kernelgsbase,
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_cs, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_esp,
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_eip,
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr2, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad6,
+ sizeof(vmcb->state.pad6),
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.g_pat, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dbgctl, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.br_from, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.br_to, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.int_from, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vmcb->state.int_to, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad7,
+ sizeof(vmcb->state.pad7),
+ meta, ret, done);
+
+ /* Snapshot swctx for virtual cpu i */
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbp, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbx, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rcx, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdx, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdi, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rsi, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r8, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r9, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r10, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r11, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r12, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r13, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r14, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r15, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr0, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr1, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr2, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr3, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr0, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr1, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr2, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr3, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr6, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr7, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_debugctl, meta, ret,
+ done);
+
+ /* Save/restore other svm_vcpu struct fields */
+
+ /* Save/restore NEXTRIP field */
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done);
+
+ /* Save/restore lastcpu field */
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->lastcpu, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->dirty, meta, ret, done);
+
+ /* Save/restore EPTGEN field - EPT is Extended Page Table */
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->eptgen, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.gen, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.num, meta, ret, done);
+
+ /* After a restore, mark each VMCB cache listed below dirty */
+ if (meta->op == VM_SNAPSHOT_RESTORE) {
+ svm_set_dirty(sc, i, VMCB_CACHE_ASID);
+ svm_set_dirty(sc, i, VMCB_CACHE_IOPM);
+ svm_set_dirty(sc, i, VMCB_CACHE_I);
+ svm_set_dirty(sc, i, VMCB_CACHE_TPR);
+ svm_set_dirty(sc, i, VMCB_CACHE_CR2);
+ svm_set_dirty(sc, i, VMCB_CACHE_CR);
+ svm_set_dirty(sc, i, VMCB_CACHE_DT);
+ svm_set_dirty(sc, i, VMCB_CACHE_SEG);
+ svm_set_dirty(sc, i, VMCB_CACHE_NP);
+ }
+ }
+
+ if (meta->op == VM_SNAPSHOT_RESTORE)
+ flush_by_asid();
+
+done:
+ return (ret);
+}
+
+static int
+svm_snapshot_vmcx(void *arg, struct vm_snapshot_meta *meta, int vcpu)
+{
+ struct vmcb *vmcb;
+ struct svm_softc *sc;
+ int err, running, hostcpu;
+ /* Pass one vCPU's registers, segment descriptors and selected VMCB fields through the snapshot buffer; the same sequence serves save and restore. */
+ sc = (struct svm_softc *)arg;
+ err = 0;
+
+ KASSERT(arg != NULL, ("%s: arg was NULL", __func__));
+ vmcb = svm_get_vmcb(sc, vcpu); /* NOTE(review): vmcb is not read again in this function */
+ /* Refuse with EINVAL while the vCPU is running on a host CPU other than the current one. */
+ running = vcpu_is_running(sc->vm, vcpu, &hostcpu);
+ if (running && hostcpu != curcpu) {
+ printf("%s: %s%d is running\n", __func__, vm_name(sc->vm), vcpu);
+ return (EINVAL);
+ }
+ /* Step results are summed: the return value is 0 only if every step succeeded and is not a meaningful errno otherwise. */
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR0, meta);
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR2, meta);
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR3, meta);
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR4, meta);
+
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_DR7, meta);
+
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RAX, meta);
+
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RSP, meta);
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RIP, meta);
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RFLAGS, meta);
+
+ /* Guest segments */
+ /* ES */
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_ES, meta);
+ err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_ES, meta);
+
+ /* CS */
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CS, meta);
+ err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_CS, meta);
+
+ /* SS */
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_SS, meta);
+ err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_SS, meta);
+
+ /* DS */
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_DS, meta);
+ err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_DS, meta);
+
+ /* FS */
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_FS, meta);
+ err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_FS, meta);
+
+ /* GS */
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_GS, meta);
+ err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_GS, meta);
+
+ /* TR */
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_TR, meta);
+ err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_TR, meta);
+
+ /* LDTR */
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_LDTR, meta);
+ err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_LDTR, meta);
+
+ /* EFER */
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_EFER, meta);
+
+ /* IDTR and GDTR */
+ err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_IDTR, meta);
+ err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_GDTR, meta);
+
+ /* Specific AMD registers */
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_SYSENTER_CS, 8), meta);
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_SYSENTER_ESP, 8), meta);
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_SYSENTER_EIP, 8), meta);
+
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_NPT_BASE, 8), meta);
+
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_CR_INTERCEPT, 4), meta);
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_DR_INTERCEPT, 4), meta);
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_EXC_INTERCEPT, 4), meta);
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_INST1_INTERCEPT, 4), meta);
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_INST2_INTERCEPT, 4), meta);
+
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_TLB_CTRL, 4), meta);
+
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_EXITINFO1, 8), meta);
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_EXITINFO2, 8), meta);
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_EXITINTINFO, 8), meta);
+
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_VIRQ, 8), meta);
+
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_GUEST_PAT, 8), meta);
+
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_AVIC_BAR, 8), meta);
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_AVIC_PAGE, 8), meta);
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_AVIC_LT, 8), meta);
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_AVIC_PT, 8), meta);
+
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_IO_PERM, 8), meta);
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_MSR_PERM, 8), meta);
+
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_ASID, 4), meta);
+
+ err += vmcb_snapshot_any(sc, vcpu,
+ VMCB_ACCESS(VMCB_OFF_EXIT_REASON, 8), meta);
+
+ err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_INTR_SHADOW, meta);
+
+ return (err);
+}
+
+static int
+svm_restore_tsc(void *arg, int vcpu, uint64_t offset)
+{
+ int err;
+ /* vmm_ops .vm_restore_tsc hook for SVM: forwards to svm_set_tsc_offset(). NOTE(review): the vmi_restore_tsc_t typedef names this third argument 'now', not 'offset' -- confirm which value the caller passes. */
+ err = svm_set_tsc_offset(arg, vcpu, offset);
+
+ return (err);
+}
+#endif
+
struct vmm_ops vmm_ops_amd = {
.init = svm_init,
.cleanup = svm_cleanup,
@@ -2302,4 +2654,9 @@ struct vmm_ops vmm_ops_amd = {
.vmspace_free = svm_npt_free,
.vlapic_init = svm_vlapic_init,
.vlapic_cleanup = svm_vlapic_cleanup,
+#ifdef BHYVE_SNAPSHOT
+ .vmsnapshot = svm_snapshot_vmi,
+ .vmcx_snapshot = svm_snapshot_vmcx,
+ .vm_restore_tsc = svm_restore_tsc,
+#endif
};
diff --git a/sys/amd64/vmm/amd/svm.h b/sys/amd64/vmm/amd/svm.h
index 66b584fc95b1..30e58b9e130f 100644
--- a/sys/amd64/vmm/amd/svm.h
+++ b/sys/amd64/vmm/amd/svm.h
@@ -32,6 +32,7 @@
#define _SVM_H_
struct pcpu;
+struct svm_softc;
/*
* Guest register state that is saved outside the VMCB.
@@ -66,5 +67,8 @@ struct svm_regctx {
};
void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu);
+#ifdef BHYVE_SNAPSHOT
+int svm_set_tsc_offset(struct svm_softc *sc, int vcpu, uint64_t offset);
+#endif
#endif /* _SVM_H_ */
diff --git a/sys/amd64/vmm/amd/svm_msr.c b/sys/amd64/vmm/amd/svm_msr.c
index 67c43100f168..12046de4dbb9 100644
--- a/sys/amd64/vmm/amd/svm_msr.c
+++ b/sys/amd64/vmm/amd/svm_msr.c
@@ -29,6 +29,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
@@ -162,6 +164,11 @@ svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu)
* Ignore writes to microcode update register.
*/
break;
+#ifdef BHYVE_SNAPSHOT
+ case MSR_TSC:
+ error = svm_set_tsc_offset(sc, vcpu, val - rdtsc());
+ break;
+#endif
case MSR_EXTFEATURES:
break;
default:
diff --git a/sys/amd64/vmm/amd/vmcb.c b/sys/amd64/vmm/amd/vmcb.c
index 5075b6986730..59baa06112f2 100644
--- a/sys/amd64/vmm/amd/vmcb.c
+++ b/sys/amd64/vmm/amd/vmcb.c
@@ -29,12 +29,15 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include "vmm_ktr.h"
@@ -452,3 +455,106 @@ vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
return (0);
}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Fetch the value identified by 'ident' for 'vcpu' via vm_get_register().
+ *
+ * Returns EINVAL if 'vcpu' is outside [0, VM_MAXCPU) or if 'ident' is
+ * VM_REG_LAST or larger; otherwise returns the result of
+ * vm_get_register(), which stores the value through 'val'.
+ */
+int
+vmcb_getany(struct svm_softc *sc, int vcpu, int ident, uint64_t *val)
+{
+
+	if (vcpu < 0 || vcpu >= VM_MAXCPU)
+		return (EINVAL);
+	if (ident >= VM_REG_LAST)
+		return (EINVAL);
+
+	return (vm_get_register(sc->vm, vcpu, ident, val));
+}
+
+/*
+ * Store 'val' into the field identified by 'ident' for 'vcpu' via
+ * vm_set_register().
+ *
+ * Returns EINVAL if 'vcpu' is outside [0, VM_MAXCPU) or if 'ident' is
+ * VM_REG_LAST or larger; otherwise returns the result of
+ * vm_set_register().
+ */
+int
+vmcb_setany(struct svm_softc *sc, int vcpu, int ident, uint64_t val)
+{
+
+	if (vcpu < 0 || vcpu >= VM_MAXCPU)
+		return (EINVAL);
+	if (ident >= VM_REG_LAST)
+		return (EINVAL);
+
+	return (vm_set_register(sc->vm, vcpu, ident, val));
+}
+
+/*
+ * Save or restore one segment descriptor (base, limit, access) of 'vcpu'.
+ *
+ * On VM_SNAPSHOT_SAVE the descriptor is read with vmcb_getdesc() and then
+ * passed field by field to the snapshot macros; on VM_SNAPSHOT_RESTORE the
+ * fields are passed through the snapshot macros first and the result is
+ * installed with vmcb_setdesc().  Any other operation yields EINVAL.
+ * Returns 0 on success or the first non-zero error encountered.
+ */
+int
+vmcb_snapshot_desc(void *arg, int vcpu, int reg, struct vm_snapshot_meta *meta)
+{
+	struct seg_desc desc;
+	int error, restoring, saving;
+
+	saving = (meta->op == VM_SNAPSHOT_SAVE);
+	restoring = (meta->op == VM_SNAPSHOT_RESTORE);
+	if (!saving && !restoring)
+		return (EINVAL);
+
+	/* When saving, the live descriptor must be fetched first. */
+	if (saving) {
+		error = vmcb_getdesc(arg, vcpu, reg, &desc);
+		if (error != 0)
+			return (error);
+	}
+
+	SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, error, out);
+
+	/* When restoring, 'desc' now holds the values to install. */
+	if (restoring)
+		error = vmcb_setdesc(arg, vcpu, reg, &desc);
+
+out:
+	return (error);
+}
+
+/*
+ * Save or restore a single 64-bit value identified by 'ident' for 'vcpu'.
+ *
+ * On VM_SNAPSHOT_SAVE the value is obtained with vmcb_getany() and then
+ * passed to the snapshot macro; on VM_SNAPSHOT_RESTORE it is passed through
+ * the snapshot macro first and then written with vmcb_setany().  Any other
+ * operation yields EINVAL.  Returns 0 on success or the first non-zero
+ * error.
+ */
+int
+vmcb_snapshot_any(struct svm_softc *sc, int vcpu, int ident,
+    struct vm_snapshot_meta *meta)
+{
+	uint64_t val;
+	int error, restoring, saving;
+
+	saving = (meta->op == VM_SNAPSHOT_SAVE);
+	restoring = (meta->op == VM_SNAPSHOT_RESTORE);
+	if (!saving && !restoring)
+		return (EINVAL);
+
+	if (saving) {
+		error = vmcb_getany(sc, vcpu, ident, &val);
+		if (error != 0)
+			return (error);
+	}
+
+	SNAPSHOT_VAR_OR_LEAVE(val, meta, error, out);
+
+	if (restoring)
+		error = vmcb_setany(sc, vcpu, ident, val);
+
+out:
+	return (error);
+}
+#endif
diff --git a/sys/amd64/vmm/amd/vmcb.h b/sys/amd64/vmm/amd/vmcb.h
index ec7caa91f95e..dd2c90cf25ea 100644
--- a/sys/amd64/vmm/amd/vmcb.h
+++ b/sys/amd64/vmm/amd/vmcb.h
@@ -31,8 +31,6 @@
#ifndef _VMCB_H_
#define _VMCB_H_
-struct svm_softc;
-
#define BIT(n) (1ULL << n)
/*
@@ -209,6 +207,10 @@ struct svm_softc;
#define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF)
#ifdef _KERNEL
+
+struct svm_softc;
+struct vm_snapshot_meta;
+
/* VMCB save state area segment format */
struct vmcb_segment {
uint16_t selector;
@@ -331,6 +333,14 @@ int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val);
int vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc);
int vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc);
int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg);
+#ifdef BHYVE_SNAPSHOT
+int vmcb_getany(struct svm_softc *sc, int vcpu, int ident, uint64_t *val);
+int vmcb_setany(struct svm_softc *sc, int vcpu, int ident, uint64_t val);
+int vmcb_snapshot_desc(void *arg, int vcpu, int reg,
+ struct vm_snapshot_meta *meta);
+int vmcb_snapshot_any(struct svm_softc *sc, int vcpu, int ident,
+ struct vm_snapshot_meta *meta);
+#endif
#endif /* _KERNEL */
#endif /* _VMCB_H_ */
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
index 7632ba930f37..4ccdc1f61f34 100644
--- a/sys/amd64/vmm/intel/vmcs.c
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -28,6 +28,7 @@
* $FreeBSD$
*/
+#include "opt_bhyve_snapshot.h"
#include "opt_ddb.h"
#include <sys/cdefs.h>
@@ -43,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <machine/segments.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include "vmm_host.h"
#include "vmx_cpufunc.h"
#include "vmcs.h"
@@ -430,6 +432,128 @@ done:
return (error);
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Read the VMCS field 'ident' into '*val' using vmread().
+ *
+ * When 'running' is zero the access is bracketed by VMPTRLD(vmcs) and
+ * VMCLEAR(vmcs); when it is non-zero vmread() is issued directly.
+ * Returns the result of vmread().
+ */
+int
+vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val)
+{
+	int rc;
+
+	if (running)
+		return (vmread(ident, val));
+
+	VMPTRLD(vmcs);
+	rc = vmread(ident, val);
+	VMCLEAR(vmcs);
+
+	return (rc);
+}
+
+/*
+ * Write 'val' to the VMCS field 'ident' using vmwrite().
+ *
+ * When 'running' is zero the access is bracketed by VMPTRLD(vmcs) and
+ * VMCLEAR(vmcs); when it is non-zero vmwrite() is issued directly.
+ * Returns the result of vmwrite().
+ */
+int
+vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val)
+{
+	int rc;
+
+	if (running)
+		return (vmwrite(ident, val));
+
+	VMPTRLD(vmcs);
+	rc = vmwrite(ident, val);
+	VMCLEAR(vmcs);
+
+	return (rc);
+}
+
+/*
+ * Save or restore one guest register through vmcs_getreg()/vmcs_setreg().
+ *
+ * On VM_SNAPSHOT_SAVE the register is read and then passed to the snapshot
+ * macro; on VM_SNAPSHOT_RESTORE the value is passed through the snapshot
+ * macro and then written back.  Any other operation yields EINVAL.
+ * Returns 0 on success or the first non-zero error.
+ */
+int
+vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident,
+    struct vm_snapshot_meta *meta)
+{
+	uint64_t val;
+	int error, restoring, saving;
+
+	saving = (meta->op == VM_SNAPSHOT_SAVE);
+	restoring = (meta->op == VM_SNAPSHOT_RESTORE);
+	if (!saving && !restoring)
+		return (EINVAL);
+
+	if (saving) {
+		error = vmcs_getreg(vmcs, running, ident, &val);
+		if (error != 0)
+			return (error);
+	}
+
+	SNAPSHOT_VAR_OR_LEAVE(val, meta, error, out);
+
+	if (restoring)
+		error = vmcs_setreg(vmcs, running, ident, val);
+
+out:
+	return (error);
+}
+
+/*
+ * Save or restore one segment descriptor (base, limit, access) through
+ * vmcs_getdesc()/vmcs_setdesc().
+ *
+ * On VM_SNAPSHOT_SAVE the descriptor is read first and its fields are then
+ * passed to the snapshot macros; on VM_SNAPSHOT_RESTORE the fields are
+ * passed through the snapshot macros first and then written back.  Any
+ * other operation yields EINVAL.  Returns 0 on success or the first
+ * non-zero error.
+ */
+int
+vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg,
+    struct vm_snapshot_meta *meta)
+{
+	struct seg_desc desc;
+	int error, restoring, saving;
+
+	saving = (meta->op == VM_SNAPSHOT_SAVE);
+	restoring = (meta->op == VM_SNAPSHOT_RESTORE);
+	if (!saving && !restoring)
+		return (EINVAL);
+
+	if (saving) {
+		error = vmcs_getdesc(vmcs, running, seg, &desc);
+		if (error != 0)
+			return (error);
+	}
+
+	SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, error, out);
+
+	if (restoring)
+		error = vmcs_setdesc(vmcs, running, seg, &desc);
+
+out:
+	return (error);
+}
+
+/*
+ * Save or restore an arbitrary VMCS field through
+ * vmcs_getany()/vmcs_setany().
+ *
+ * On VM_SNAPSHOT_SAVE the field is read and then passed to the snapshot
+ * macro; on VM_SNAPSHOT_RESTORE the value is passed through the snapshot
+ * macro and then written back.  Any other operation yields EINVAL.
+ * Returns 0 on success or the first non-zero error.
+ */
+int
+vmcs_snapshot_any(struct vmcs *vmcs, int running, int ident,
+    struct vm_snapshot_meta *meta)
+{
+	uint64_t val;
+	int error, restoring, saving;
+
+	saving = (meta->op == VM_SNAPSHOT_SAVE);
+	restoring = (meta->op == VM_SNAPSHOT_RESTORE);
+	if (!saving && !restoring)
+		return (EINVAL);
+
+	if (saving) {
+		error = vmcs_getany(vmcs, running, ident, &val);
+		if (error != 0)
+			return (error);
+	}
+
+	SNAPSHOT_VAR_OR_LEAVE(val, meta, error, out);
+
+	if (restoring)
+		error = vmcs_setany(vmcs, running, ident, val);
+
+out:
+	return (error);
+}
+#endif
+
#ifdef DDB
extern int vmxon_enabled[];
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 29e0263fb9f2..8aa7b1e8fc08 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -32,6 +32,9 @@
#define _VMCS_H_
#ifdef _KERNEL
+
+struct vm_snapshot_meta;
+
struct vmcs {
uint32_t identifier;
uint32_t abort_code;
@@ -55,6 +58,16 @@ int vmcs_getdesc(struct vmcs *vmcs, int running, int ident,
struct seg_desc *desc);
int vmcs_setdesc(struct vmcs *vmcs, int running, int ident,
struct seg_desc *desc);
+#ifdef BHYVE_SNAPSHOT
+int vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val);
+int vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val);
+int vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident,
+ struct vm_snapshot_meta *meta);
+int vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg,
+ struct vm_snapshot_meta *meta);
+int vmcs_snapshot_any(struct vmcs *vmcs, int running, int ident,
+ struct vm_snapshot_meta *meta);
+#endif
/*
* Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 9f610ea50852..21a1b9fdefc4 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -32,6 +32,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
@@ -56,6 +58,8 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
+#include <machine/vmm_snapshot.h>
+
#include "vmm_lapic.h"
#include "vmm_host.h"
#include "vmm_ioport.h"
@@ -295,6 +299,9 @@ static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
static void vmx_inject_pir(struct vlapic *vlapic);
+#ifdef BHYVE_SNAPSHOT
+static int vmx_restore_tsc(void *arg, int vcpu, uint64_t now);
+#endif
#ifdef KTR
static const char *
@@ -1299,7 +1306,10 @@ vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
}
error = vmwrite(VMCS_TSC_OFFSET, offset);
-
+#ifdef BHYVE_SNAPSHOT
+ if (error == 0)
+ error = vm_set_tsc_offset(vmx->vm, vcpu, offset);
+#endif
return (error);
}
@@ -3876,6 +3886,153 @@ vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
free(vlapic, M_VLAPIC);
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * vmsnapshot hook for VT-x.
+ *
+ * For every vcpu slot in [0, VM_MAXCPU) this passes the guest MSR array
+ * and the software-held register context (general purpose registers, %cr2
+ * and %dr0-%dr3/%dr6) through the snapshot macros, in a fixed order.
+ * Returns 0 on success or the error of the first field that failed.
+ */
+static int
+vmx_snapshot_vmi(void *arg, struct vm_snapshot_meta *meta)
+{
+	struct vmxctx *vmxctx;
+	struct vmx *vmx;
+	int error, i;
+
+	vmx = arg;
+
+	KASSERT(vmx != NULL, ("%s: arg was NULL", __func__));
+
+	for (i = 0; i < VM_MAXCPU; i++) {
+		vmxctx = &vmx->ctx[i];
+
+		/* Guest MSRs kept by the hypervisor for this vcpu. */
+		SNAPSHOT_BUF_OR_LEAVE(vmx->guest_msrs[i],
+		    sizeof(vmx->guest_msrs[i]), meta, error, out);
+
+		/* General purpose registers. */
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdi, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rsi, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdx, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rcx, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r8, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r9, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rax, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbx, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbp, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r10, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r11, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r12, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r13, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r14, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r15, meta, error, out);
+
+		/* Page-fault address and debug registers. */
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_cr2, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr0, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr1, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr2, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr3, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr6, meta, error, out);
+	}
+
+out:
+	return (error);
+}
+
+/*
+ * vmcx_snapshot hook for VT-x: save or restore the VMCS-resident guest
+ * state of one vcpu.
+ *
+ * Fails with EINVAL if the vcpu is currently running on a host CPU other
+ * than the calling one.  Otherwise every field is processed, in a fixed
+ * order, even if an earlier one failed.
+ *
+ * NOTE: the return value is the arithmetic sum of the individual results,
+ * so callers should only test it against zero; a non-zero value is not
+ * necessarily a meaningful errno.
+ */
+static int
+vmx_snapshot_vmcx(void *arg, struct vm_snapshot_meta *meta, int vcpu)
+{
+	/* Plain registers that precede the segment state. */
+	static const int gen_regs[] = {
+		VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4,
+		VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP,
+		VM_REG_GUEST_RFLAGS,
+	};
+	/* Segments stored as a selector followed by a descriptor. */
+	static const int seg_regs[] = {
+		VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS,
+		VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS,
+		VM_REG_GUEST_TR, VM_REG_GUEST_LDTR,
+	};
+	/* Guest page tables. */
+	static const int pdpte_regs[] = {
+		VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1,
+		VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3,
+	};
+	/* Other guest state, addressed by raw VMCS field encoding. */
+	static const int vmcs_fields[] = {
+		VMCS_GUEST_IA32_SYSENTER_CS, VMCS_GUEST_IA32_SYSENTER_ESP,
+		VMCS_GUEST_IA32_SYSENTER_EIP, VMCS_GUEST_INTERRUPTIBILITY,
+		VMCS_GUEST_ACTIVITY, VMCS_ENTRY_CTLS, VMCS_EXIT_CTLS,
+	};
+	struct vmcs *vmcs;
+	struct vmx *vmx;
+	size_t i;
+	int err, run, hostcpu;
+
+	vmx = (struct vmx *)arg;
+	err = 0;
+
+	KASSERT(arg != NULL, ("%s: arg was NULL", __func__));
+	vmcs = &vmx->vmcs[vcpu];
+
+	run = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+	if (run && hostcpu != curcpu) {
+		/* Terminate the line so later console output is not glued on. */
+		printf("%s: %s%d is running\n", __func__, vm_name(vmx->vm),
+		    vcpu);
+		return (EINVAL);
+	}
+
+	for (i = 0; i < nitems(gen_regs); i++)
+		err += vmcs_snapshot_reg(vmcs, run, gen_regs[i], meta);
+
+	/* Guest segments */
+	for (i = 0; i < nitems(seg_regs); i++) {
+		err += vmcs_snapshot_reg(vmcs, run, seg_regs[i], meta);
+		err += vmcs_snapshot_desc(vmcs, run, seg_regs[i], meta);
+	}
+
+	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_EFER, meta);
+
+	/* Descriptor tables have no selector, only a descriptor. */
+	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_IDTR, meta);
+	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GDTR, meta);
+
+	/* Guest page tables */
+	for (i = 0; i < nitems(pdpte_regs); i++)
+		err += vmcs_snapshot_reg(vmcs, run, pdpte_regs[i], meta);
+
+	/* Other guest state */
+	for (i = 0; i < nitems(vmcs_fields); i++)
+		err += vmcs_snapshot_any(vmcs, run, vmcs_fields[i], meta);
+
+	return (err);
+}
+
+/*
+ * vm_restore_tsc hook for VT-x: program 'offset' as the TSC offset of
+ * 'vcpu' via vmx_set_tsc_offset().
+ *
+ * Fails with EINVAL if the vcpu is currently running on a host CPU other
+ * than the calling one.  If the vcpu is not running, the call is bracketed
+ * by VMPTRLD()/VMCLEAR() on its VMCS.  Returns the result of
+ * vmx_set_tsc_offset() otherwise.
+ */
+static int
+vmx_restore_tsc(void *arg, int vcpu, uint64_t offset)
+{
+	struct vmcs *vmcs;
+	struct vmx *vmx = (struct vmx *)arg;
+	int error, running, hostcpu;
+
+	KASSERT(arg != NULL, ("%s: arg was NULL", __func__));
+	vmcs = &vmx->vmcs[vcpu];
+
+	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+	if (running && hostcpu != curcpu) {
+		/* Terminate the line so later console output is not glued on. */
+		printf("%s: %s%d is running\n", __func__, vm_name(vmx->vm),
+		    vcpu);
+		return (EINVAL);
+	}
+
+	if (running)
+		return (vmx_set_tsc_offset(vmx, vcpu, offset));
+
+	VMPTRLD(vmcs);
+	error = vmx_set_tsc_offset(vmx, vcpu, offset);
+	VMCLEAR(vmcs);
+
+	return (error);
+}
+#endif
+
struct vmm_ops vmm_ops_intel = {
.init = vmx_init,
.cleanup = vmx_cleanup,
@@ -3893,4 +4050,9 @@ struct vmm_ops vmm_ops_intel = {
.vmspace_free = ept_vmspace_free,
.vlapic_init = vmx_vlapic_init,
.vlapic_cleanup = vmx_vlapic_cleanup,
+#ifdef BHYVE_SNAPSHOT
+ .vmsnapshot = vmx_snapshot_vmi,
+ .vmcx_snapshot = vmx_snapshot_vmcx,
+ .vm_restore_tsc = vmx_restore_tsc,
+#endif
};
diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c
index ba4cd7785e7d..1e053d26c182 100644
--- a/sys/amd64/vmm/io/vatpic.c
+++ b/sys/amd64/vmm/io/vatpic.c
@@ -29,6 +29,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/types.h>
#include <sys/queue.h>
@@ -42,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <dev/ic/i8259.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include "vmm_ktr.h"
#include "vmm_lapic.h"
@@ -808,3 +811,43 @@ vatpic_cleanup(struct vatpic *vatpic)
{
free(vatpic, M_VATPIC);
}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the state of the virtual PIC.
+ *
+ * Each entry of vatpic->atpic is passed field by field through the
+ * snapshot macros, followed by the vatpic->elc array.  Returns 0 on
+ * success or the error of the first field that failed.
+ */
+int
+vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta)
+{
+	struct atpic *atpic;
+	int error, i;
+
+	for (i = 0; i < nitems(vatpic->atpic); i++) {
+		atpic = &vatpic->atpic[i];
+
+		/* Initialisation / command state. */
+		SNAPSHOT_VAR_OR_LEAVE(atpic->ready, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->icw_num, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->rd_cmd_reg, meta, error, out);
+
+		/* Mode flags and interrupt registers. */
+		SNAPSHOT_VAR_OR_LEAVE(atpic->aeoi, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->poll, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->rotate, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->sfn, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->irq_base, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->request, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->service, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->mask, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->smm, meta, error, out);
+
+		/* Remaining per-chip bookkeeping. */
+		SNAPSHOT_BUF_OR_LEAVE(atpic->acnt, sizeof(atpic->acnt),
+		    meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->lowprio, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->intr_raised, meta, error, out);
+	}
+
+	SNAPSHOT_BUF_OR_LEAVE(vatpic->elc, sizeof(vatpic->elc),
+	    meta, error, out);
+
+out:
+	return (error);
+}
+#endif
diff --git a/sys/amd64/vmm/io/vatpic.h b/sys/amd64/vmm/io/vatpic.h
index d4a1be18208d..8990a2a5fcb0 100644
--- a/sys/amd64/vmm/io/vatpic.h
+++ b/sys/amd64/vmm/io/vatpic.h
@@ -36,6 +36,8 @@
#define IO_ELCR1 0x4d0
#define IO_ELCR2 0x4d1
+struct vm_snapshot_meta;
+
struct vatpic *vatpic_init(struct vm *vm);
void vatpic_cleanup(struct vatpic *vatpic);
@@ -54,4 +56,8 @@ int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger)
void vatpic_pending_intr(struct vm *vm, int *vecptr);
void vatpic_intr_accepted(struct vm *vm, int vector);
+#ifdef BHYVE_SNAPSHOT
+int vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta);
+#endif
+
#endif /* _VATPIC_H_ */
diff --git a/sys/amd64/vmm/io/vatpit.c b/sys/amd64/vmm/io/vatpit.c
index 91d64af21233..4718a0557065 100644
--- a/sys/amd64/vmm/io/vatpit.c
+++ b/sys/amd64/vmm/io/vatpit.c
@@ -29,6 +29,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/types.h>
#include <sys/queue.h>
@@ -39,6 +41,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include "vmm_ktr.h"
#include "vatpic.h"
@@ -472,3 +475,42 @@ vatpit_cleanup(struct vatpit *vatpit)
free(vatpit, M_VATPIT);
}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the state of the virtual PIT.
+ *
+ * The frequency bintime is passed through the snapshot macros first,
+ * followed by every entry of vatpit->channel.  No callout is re-armed
+ * here; a console message saying so is printed on every call.  Returns 0
+ * on success or the error of the first field that failed.
+ */
+int
+vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta)
+{
+	struct channel *channel;
+	int error, i;
+
+	SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_bt.sec, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_bt.frac, meta, error, out);
+
+	/* TODO: properly restore timers; they will NOT work currently. */
+	printf("%s: snapshot restore does not reset timers!\r\n", __func__);
+
+	for (i = 0; i < nitems(vatpit->channel); i++) {
+		channel = &vatpit->channel[i];
+
+		SNAPSHOT_VAR_OR_LEAVE(channel->mode, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(channel->initial, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(channel->now_bt.sec, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(channel->now_bt.frac, meta, error, out);
+		SNAPSHOT_BUF_OR_LEAVE(channel->cr, sizeof(channel->cr),
+		    meta, error, out);
+		SNAPSHOT_BUF_OR_LEAVE(channel->ol, sizeof(channel->ol),
+		    meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(channel->slatched, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(channel->status, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(channel->crbyte, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(channel->frbyte, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(channel->callout_bt.sec, meta, error,
+		    out);
+		SNAPSHOT_VAR_OR_LEAVE(channel->callout_bt.frac, meta, error,
+		    out);
+	}
+
+out:
+	return (error);
+}
+#endif
diff --git a/sys/amd64/vmm/io/vatpit.h b/sys/amd64/vmm/io/vatpit.h
index 090d1a6172a7..65e06ec9bf58 100644
--- a/sys/amd64/vmm/io/vatpit.h
+++ b/sys/amd64/vmm/io/vatpit.h
@@ -36,6 +36,8 @@
#define NMISC_PORT 0x61
+struct vm_snapshot_meta;
+
struct vatpit *vatpit_init(struct vm *vm);
void vatpit_cleanup(struct vatpit *vatpit);
@@ -43,5 +45,8 @@ int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
uint32_t *eax);
int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port,
int bytes, uint32_t *eax);
+#ifdef BHYVE_SNAPSHOT
+int vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta);
+#endif
#endif /* _VATPIT_H_ */
diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c
index 8f91f9fe6d78..530f5d49f8f1 100644
--- a/sys/amd64/vmm/io/vhpet.c
+++ b/sys/amd64/vmm/io/vhpet.c
@@ -32,6 +32,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
@@ -43,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
+#include <machine/vmm_snapshot.h>
#include "vmm_lapic.h"
#include "vatpic.h"
@@ -761,3 +764,49 @@ vhpet_getcap(struct vm_hpet_cap *cap)
cap->capabilities = vhpet_capabilities();
return (0);
}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the state of the virtual HPET.
+ *
+ * Returns 0 on success or the error of the first field that failed.
+ */
+int
+vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta)
+{
+	uint32_t countbase;
+	int error, i;
+
+	SNAPSHOT_VAR_OR_LEAVE(vhpet->freq_sbt, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(vhpet->config, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(vhpet->isr, meta, error, out);
+
+	/*
+	 * At restore time the countbase should have the value it had when
+	 * the snapshot was created.  That value is not kept directly in
+	 * vhpet->countbase but is computed relative to the current system
+	 * uptime using countbase_sbt, so the value returned by
+	 * vhpet_counter() is what gets saved, and on restore it becomes the
+	 * new vhpet->countbase.
+	 */
+	if (meta->op == VM_SNAPSHOT_SAVE)
+		countbase = vhpet_counter(vhpet, NULL);
+	SNAPSHOT_VAR_OR_LEAVE(countbase, meta, error, out);
+	if (meta->op == VM_SNAPSHOT_RESTORE)
+		vhpet->countbase = countbase;
+
+	/* Per-timer registers. */
+	for (i = 0; i < nitems(vhpet->timer); i++) {
+		SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].cap_config,
+		    meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].msireg, meta, error,
+		    out);
+		SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].compval, meta, error,
+		    out);
+		SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].comprate, meta, error,
+		    out);
+		SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].callout_sbt,
+		    meta, error, out);
+	}
+
+out:
+	return (error);
+}
+
+/*
+ * Calls vhpet_start_counting() if vhpet_counter_enabled() reports that the
+ * counter is enabled.  Always returns 0.
+ */
+int
+vhpet_restore_time(struct vhpet *vhpet)
+{
+
+	if (!vhpet_counter_enabled(vhpet))
+		return (0);
+
+	vhpet_start_counting(vhpet);
+	return (0);
+}
+#endif
diff --git a/sys/amd64/vmm/io/vhpet.h b/sys/amd64/vmm/io/vhpet.h
index 3d6b653055c9..113683c09b33 100644
--- a/sys/amd64/vmm/io/vhpet.h
+++ b/sys/amd64/vmm/io/vhpet.h
@@ -35,6 +35,8 @@
#define VHPET_BASE 0xfed00000
#define VHPET_SIZE 1024
+struct vm_snapshot_meta;
+
struct vhpet *vhpet_init(struct vm *vm);
void vhpet_cleanup(struct vhpet *vhpet);
int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val,
@@ -42,5 +44,9 @@ int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val,
int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val,
int size, void *arg);
int vhpet_getcap(struct vm_hpet_cap *cap);
+#ifdef BHYVE_SNAPSHOT
+int vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta);
+int vhpet_restore_time(struct vhpet *vhpet);
+#endif
#endif /* _VHPET_H_ */
diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c
index 31c1cabab094..a8117da4b879 100644
--- a/sys/amd64/vmm/io/vioapic.c
+++ b/sys/amd64/vmm/io/vioapic.c
@@ -32,6 +32,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/lock.h>
@@ -42,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <x86/apicreg.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include "vmm_ktr.h"
#include "vmm_lapic.h"
@@ -499,3 +502,22 @@ vioapic_pincount(struct vm *vm)
return (REDIR_ENTRIES);
}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the state of the virtual I/O APIC: the register select
+ * value followed by the 'reg' and 'acnt' members of every redirection
+ * table entry.  Returns 0 on success or the error of the first field that
+ * failed.
+ */
+int
+vioapic_snapshot(struct vioapic *vioapic, struct vm_snapshot_meta *meta)
+{
+	int error, i;
+
+	SNAPSHOT_VAR_OR_LEAVE(vioapic->ioregsel, meta, error, out);
+
+	for (i = 0; i < nitems(vioapic->rtbl); i++) {
+		SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].reg, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].acnt, meta, error, out);
+	}
+
+out:
+	return (error);
+}
+#endif
diff --git a/sys/amd64/vmm/io/vioapic.h b/sys/amd64/vmm/io/vioapic.h
index 730c4b3f2ad9..19dbffe3ec24 100644
--- a/sys/amd64/vmm/io/vioapic.h
+++ b/sys/amd64/vmm/io/vioapic.h
@@ -32,6 +32,8 @@
#ifndef _VIOAPIC_H_
#define _VIOAPIC_H_
+struct vm_snapshot_meta;
+
#define VIOAPIC_BASE 0xFEC00000
#define VIOAPIC_SIZE 4096
@@ -49,4 +51,9 @@ int vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa,
int vioapic_pincount(struct vm *vm);
void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector);
+#ifdef BHYVE_SNAPSHOT
+int vioapic_snapshot(struct vioapic *vioapic,
+ struct vm_snapshot_meta *meta);
+#endif
+
#endif
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index 069989f12386..be944bf097d2 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -32,6 +32,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/kernel.h>
@@ -47,6 +49,7 @@ __FBSDID("$FreeBSD$");
#include <machine/smp.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include "vmm_lapic.h"
#include "vmm_ktr.h"
@@ -1650,3 +1653,106 @@ vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector);
vlapic_set_tmr(vlapic, vector, true);
}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Re-arm the local APIC timer callout from a saved current-count value.
+ *
+ * The implementation is similar to vlapic_icrtmr_write_handler().  With a
+ * non-zero 'ccr' the callout is scheduled 'ccr' ticks of 'timer_freq_bt'
+ * from now.  With a zero 'ccr' nothing is scheduled unless the timer is
+ * periodic, in which case a full 'timer_period_bt' is used.  The timer
+ * lock is held for the duration.
+ */
+static void
+vlapic_reset_callout(struct vlapic *vlapic, uint32_t ccr)
+{
+	struct bintime delta;
+	sbintime_t when;
+
+	VLAPIC_TIMER_LOCK(vlapic);
+
+	if (ccr != 0) {
+		/* Remaining time: 'ccr' ticks at the timer frequency. */
+		delta = vlapic->timer_freq_bt;
+		bintime_mul(&delta, ccr);
+
+		binuptime(&vlapic->timer_fire_bt);
+		bintime_add(&vlapic->timer_fire_bt, &delta);
+
+		when = bttosbt(delta);
+		callout_reset_sbt(&vlapic->callout, when, 0,
+		    vlapic_callout_handler, vlapic, 0);
+	} else if (vlapic_periodic_timer(vlapic)) {
+		/* Even if the CCR was 0, periodic timers should be reset. */
+		binuptime(&vlapic->timer_fire_bt);
+		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
+		when = bttosbt(vlapic->timer_period_bt);
+
+		callout_stop(&vlapic->callout);
+		callout_reset_sbt(&vlapic->callout, when, 0,
+		    vlapic_callout_handler, vlapic, 0);
+	}
+
+	VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+/*
+ * Save or restore the virtual local APIC of every vcpu slot in
+ * [0, VM_MAXCPU).
+ *
+ * On restore, 'timer_period_bt' is recomputed from the restored APIC page
+ * and the timer callout is re-armed from the saved current-count value.
+ * Returns 0 on success or the error of the first field that failed.
+ */
+int
+vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+	struct vlapic *vlapic;
+	struct LAPIC *lapic;
+	uint32_t ccr;
+	int error, i;
+
+	KASSERT(vm != NULL, ("%s: arg was NULL", __func__));
+
+	error = 0;
+
+	for (i = 0; i < VM_MAXCPU; i++) {
+		vlapic = vm_lapic(vm, i);
+
+		/* Snapshot the page first; timer period depends on icr_timer. */
+		lapic = vlapic->apic_page;
+		SNAPSHOT_BUF_OR_LEAVE(lapic, PAGE_SIZE, meta, error, out);
+
+		SNAPSHOT_VAR_OR_LEAVE(vlapic->esr_pending, meta, error, out);
+
+		SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.sec,
+		    meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.frac,
+		    meta, error, out);
+
+		/*
+		 * The timer period equals 'icr_timer' ticks at a frequency
+		 * of 'timer_freq_bt'; rebuild it rather than storing it.
+		 */
+		if (meta->op == VM_SNAPSHOT_RESTORE) {
+			vlapic->timer_period_bt = vlapic->timer_freq_bt;
+			bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer);
+		}
+
+		SNAPSHOT_BUF_OR_LEAVE(vlapic->isrvec_stk,
+		    sizeof(vlapic->isrvec_stk), meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vlapic->isrvec_stk_top, meta, error,
+		    out);
+		SNAPSHOT_VAR_OR_LEAVE(vlapic->boot_state, meta, error, out);
+
+		SNAPSHOT_BUF_OR_LEAVE(vlapic->lvt_last,
+		    sizeof(vlapic->lvt_last), meta, error, out);
+
+		/* The current count is sampled at save time. */
+		if (meta->op == VM_SNAPSHOT_SAVE)
+			ccr = vlapic_get_ccr(vlapic);
+
+		SNAPSHOT_VAR_OR_LEAVE(ccr, meta, error, out);
+
+		/*
+		 * On restore, reset 'timer_fire_bt' and the vlapic callout
+		 * from the current count register value that was saved when
+		 * the snapshot was created.
+		 */
+		if (meta->op == VM_SNAPSHOT_RESTORE)
+			vlapic_reset_callout(vlapic, ccr);
+	}
+
+out:
+	return (error);
+}
+#endif
diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h
index bd650efa8cc1..b87657c8bb51 100644
--- a/sys/amd64/vmm/io/vlapic.h
+++ b/sys/amd64/vmm/io/vlapic.h
@@ -32,6 +32,7 @@
#define _VLAPIC_H_
struct vm;
+struct vm_snapshot_meta;
enum x2apic_state;
int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset,
@@ -109,4 +110,9 @@ void vlapic_icrtmr_write_handler(struct vlapic *vlapic);
void vlapic_dcr_write_handler(struct vlapic *vlapic);
void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset);
void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val);
+
+#ifdef BHYVE_SNAPSHOT
+int vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta);
+#endif
+
#endif /* _VLAPIC_H_ */
diff --git a/sys/amd64/vmm/io/vpmtmr.c b/sys/amd64/vmm/io/vpmtmr.c
index 4df909777d88..f79e94f6d0fe 100644
--- a/sys/amd64/vmm/io/vpmtmr.c
+++ b/sys/amd64/vmm/io/vpmtmr.c
@@ -29,6 +29,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/kernel.h>
@@ -36,6 +38,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include "vpmtmr.h"
@@ -103,3 +106,16 @@ vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
return (0);
}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the virtual PM timer; its only snapshotted field is
+ * 'baseval'.  Returns 0 on success or the snapshot macro's error.
+ */
+int
+vpmtmr_snapshot(struct vpmtmr *vpmtmr, struct vm_snapshot_meta *meta)
+{
+	int error;
+
+	SNAPSHOT_VAR_OR_LEAVE(vpmtmr->baseval, meta, error, out);
+
+out:
+	return (error);
+}
+#endif
diff --git a/sys/amd64/vmm/io/vpmtmr.h b/sys/amd64/vmm/io/vpmtmr.h
index e6562da5c02e..a10c0b4e8309 100644
--- a/sys/amd64/vmm/io/vpmtmr.h
+++ b/sys/amd64/vmm/io/vpmtmr.h
@@ -34,6 +34,7 @@
#define IO_PMTMR 0x408
struct vpmtmr;
+struct vm_snapshot_meta;
struct vpmtmr *vpmtmr_init(struct vm *vm);
void vpmtmr_cleanup(struct vpmtmr *pmtmr);
@@ -41,4 +42,8 @@ void vpmtmr_cleanup(struct vpmtmr *pmtmr);
int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
uint32_t *val);
+#ifdef BHYVE_SNAPSHOT
+int vpmtmr_snapshot(struct vpmtmr *vpmtmr, struct vm_snapshot_meta *meta);
+#endif
+
#endif
diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c
index 954a78efb588..5d6968e3583e 100644
--- a/sys/amd64/vmm/io/vrtc.c
+++ b/sys/amd64/vmm/io/vrtc.c
@@ -29,6 +29,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/queue.h>
@@ -40,6 +42,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include <isa/rtc.h>
@@ -1020,3 +1023,45 @@ vrtc_cleanup(struct vrtc *vrtc)
callout_drain(&vrtc->callout);
free(vrtc, M_VRTC);
}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the virtual RTC state through the snapshot buffer in
+ * 'meta'.
+ *
+ * The whole transfer runs with the vrtc lock held.  On restore the uptime
+ * base is re-sampled from the current host uptime, and after a successful
+ * transfer the periodic callout is re-armed.
+ *
+ * Returns 0 on success or the error reported by the failing
+ * SNAPSHOT_*_OR_LEAVE step.  The lock is released on every path.
+ */
+int
+vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta)
+{
+	int ret;
+
+	VRTC_LOCK(vrtc);
+
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->addr, meta, ret, done);
+	if (meta->op == VM_SNAPSHOT_RESTORE)
+		vrtc->base_uptime = sbinuptime();
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->base_rtctime, meta, ret, done);
+
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.sec, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_sec, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.min, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_min, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.hour, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_hour, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_week, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_month, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.month, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.year, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_a, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_b, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_c, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_d, meta, ret, done);
+	SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram, sizeof(vrtc->rtcdev.nvram),
+	    meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.century, meta, ret, done);
+	SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram2, sizeof(vrtc->rtcdev.nvram2),
+	    meta, ret, done);
+
+	/* Only re-arm the callout once every field transferred cleanly. */
+	vrtc_callout_reset(vrtc, vrtc_freq(vrtc));
+
+done:
+	/*
+	 * Shared exit for success and for every early "leave": the lock
+	 * taken at the top must always be dropped before returning.
+	 */
+	VRTC_UNLOCK(vrtc);
+	return (ret);
+}
+#endif
diff --git a/sys/amd64/vmm/io/vrtc.h b/sys/amd64/vmm/io/vrtc.h
index 836561c7b93b..791fb7db3e26 100644
--- a/sys/amd64/vmm/io/vrtc.h
+++ b/sys/amd64/vmm/io/vrtc.h
@@ -34,6 +34,7 @@
#include <isa/isareg.h>
struct vrtc;
+struct vm_snapshot_meta;
struct vrtc *vrtc_init(struct vm *vm);
void vrtc_cleanup(struct vrtc *vrtc);
@@ -49,4 +50,8 @@ int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
uint32_t *val);
+#ifdef BHYVE_SNAPSHOT
+int vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta);
+#endif
+
#endif
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 0f6b803098d6..b2f5fa62efe5 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -31,6 +31,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -44,7 +46,7 @@ __FBSDID("$FreeBSD$");
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
-#include <sys/systm.h>
+#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
@@ -53,6 +55,11 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_kern.h>
+#include <vm/vnode_pager.h>
+#include <vm/swap_pager.h>
+#include <vm/uma.h>
#include <machine/cpu.h>
#include <machine/pcb.h>
@@ -64,6 +71,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
+#include <machine/vmm_snapshot.h>
#include "vmm_ioport.h"
#include "vmm_ktr.h"
@@ -111,6 +119,7 @@ struct vcpu {
void *stats; /* (a,i) statistics */
struct vm_exit exitinfo; /* (x) exit reason and collateral */
uint64_t nextrip; /* (x) next instruction to execute */
+ uint64_t tsc_offset; /* (o) TSC offsetting */
};
#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
@@ -204,6 +213,14 @@ static struct vmm_ops *ops;
(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define VLAPIC_CLEANUP(vmi, vlapic) \
(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Snapshot-related hooks dispatched through the 'ops' table.  Each macro
+ * evaluates to ENXIO when no ops table is registered.
+ */
+#define	VM_SNAPSHOT_VMI(vmi, meta) \
+	(ops == NULL ? ENXIO : (*ops->vmsnapshot)(vmi, meta))
+#define	VM_SNAPSHOT_VMCX(vmi, meta, vcpuid) \
+	(ops == NULL ? ENXIO : (*ops->vmcx_snapshot)(vmi, meta, vcpuid))
+#define	VM_RESTORE_TSC(vmi, vcpuid, offset) \
+	(ops == NULL ? ENXIO : (*ops->vm_restore_tsc)(vmi, vcpuid, offset))
+#endif
#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
#define fpu_stop_emulating() clts()
@@ -290,6 +307,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create)
vcpu->hostcpu = NOCPU;
vcpu->guestfpu = fpu_save_area_alloc();
vcpu->stats = vmm_stat_alloc();
+ vcpu->tsc_offset = 0;
}
vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
@@ -2730,3 +2748,177 @@ vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Transfer the per-vCPU state kept in 'struct vcpu' for every slot in
+ * vm->vcpu[] through the snapshot buffer in 'meta'.  The field order below
+ * defines the on-disk layout and must not be changed.
+ *
+ * Returns the status of the last SNAPSHOT_VAR_OR_LEAVE step executed.
+ */
+static int
+vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+	struct vcpu *vcpu;
+	int error, vcpuid;
+
+	for (vcpuid = 0; vcpuid < VM_MAXCPU; vcpuid++) {
+		vcpu = &vm->vcpu[vcpuid];
+
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, error, out);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, error, out);
+		/*
+		 * XXX This is a cheat: the value stored under tsc_offset is
+		 * really the guest's TSC value, not an offset.
+		 *
+		 * It is turned back into an actual offset when the TSC
+		 * restore function is called.
+		 */
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->tsc_offset, meta, error, out);
+	}
+
+out:
+	return (error);
+}
+
+/*
+ * Save or restore the generic (backend-independent) per-vCPU state.
+ *
+ * On save, each vCPU's tsc_offset is temporarily biased by the current host
+ * TSC so that the value written to the snapshot is the TSC as seen by the
+ * guest.  The bias is removed again before returning, whether or not the
+ * transfer succeeded, so a failed save never leaves the live VM with a
+ * corrupted TSC offset.
+ *
+ * Returns 0 on success or the error from vm_snapshot_vcpus().
+ */
+static int
+vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+	uint64_t now;
+	int i, ret;
+
+	now = rdtsc();
+
+	if (meta->op == VM_SNAPSHOT_SAVE) {
+		/*
+		 * XXX make tsc_offset take the value of the TSC proper as
+		 * seen by the guest.
+		 */
+		for (i = 0; i < VM_MAXCPU; i++)
+			vm->vcpu[i].tsc_offset += now;
+	}
+
+	ret = vm_snapshot_vcpus(vm, meta);
+
+	if (meta->op == VM_SNAPSHOT_SAVE) {
+		/*
+		 * XXX turn tsc_offset back into an offset; the actual value
+		 * is only required for restore and using it otherwise would
+		 * be wrong.  This must also happen on the failure path.
+		 */
+		for (i = 0; i < VM_MAXCPU; i++)
+			vm->vcpu[i].tsc_offset -= now;
+	}
+
+	if (ret != 0)
+		printf("%s: failed to copy vm data to user buffer\n",
+		    __func__);
+
+	return (ret);
+}
+
+/*
+ * Ask the backend to snapshot its per-vCPU control structure for each of
+ * the VM_MAXCPU vCPU slots, stopping at the first failure.
+ *
+ * Returns 0 if every vCPU succeeded, otherwise the first backend error.
+ */
+static int
+vm_snapshot_vmcx(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+	int rc, vcpuid;
+
+	for (vcpuid = 0; vcpuid < VM_MAXCPU; vcpuid++) {
+		rc = VM_SNAPSHOT_VMCX(vm->cookie, meta, vcpuid);
+		if (rc == 0)
+			continue;
+
+		printf("%s: failed to snapshot vmcs/vmcb data for "
+		    "vCPU: %d; error: %d\n", __func__, vcpuid, rc);
+		return (rc);
+	}
+
+	return (0);
+}
+
+/*
+ * Entry point for a snapshot request: dispatch on meta->dev_req to the
+ * snapshot routine of the requested kernel-side structure.
+ *
+ * Returns the handler's result, or EINVAL for an unknown request type.
+ */
+int
+vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+	switch (meta->dev_req) {
+	case STRUCT_VMX:
+		return (VM_SNAPSHOT_VMI(vm->cookie, meta));
+	case STRUCT_VMCX:
+		return (vm_snapshot_vmcx(vm, meta));
+	case STRUCT_VM:
+		return (vm_snapshot_vm(vm, meta));
+	case STRUCT_VIOAPIC:
+		return (vioapic_snapshot(vm_ioapic(vm), meta));
+	case STRUCT_VLAPIC:
+		return (vlapic_snapshot(vm, meta));
+	case STRUCT_VHPET:
+		return (vhpet_snapshot(vm_hpet(vm), meta));
+	case STRUCT_VATPIC:
+		return (vatpic_snapshot(vm_atpic(vm), meta));
+	case STRUCT_VATPIT:
+		return (vatpit_snapshot(vm_atpit(vm), meta));
+	case STRUCT_VPMTMR:
+		return (vpmtmr_snapshot(vm_pmtmr(vm), meta));
+	case STRUCT_VRTC:
+		return (vrtc_snapshot(vm_rtc(vm), meta));
+	default:
+		break;
+	}
+
+	printf("%s: failed to find the requested type %#x\n",
+	    __func__, meta->dev_req);
+	return (EINVAL);
+}
+
+/*
+ * Record 'offset' as the TSC offset of vCPU 'vcpuid'.
+ *
+ * Returns 0 on success, or EINVAL if 'vcpuid' is outside [0, VM_MAXCPU).
+ */
+int
+vm_set_tsc_offset(struct vm *vm, int vcpuid, uint64_t offset)
+{
+	if (vcpuid >= 0 && vcpuid < VM_MAXCPU) {
+		vm->vcpu[vcpuid].tsc_offset = offset;
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Re-base the guest's time sources after a restore.
+ *
+ * The host TSC is sampled first, then the HPET is restored, and finally
+ * each vCPU's saved tsc_offset minus the sampled host TSC is handed to the
+ * backend via VM_RESTORE_TSC.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int
+vm_restore_time(struct vm *vm)
+{
+	uint64_t host_tsc;
+	int rc, vcpuid;
+
+	host_tsc = rdtsc();
+
+	rc = vhpet_restore_time(vm_hpet(vm));
+	if (rc)
+		return (rc);
+
+	for (vcpuid = 0; vcpuid < nitems(vm->vcpu); vcpuid++) {
+		rc = VM_RESTORE_TSC(vm->cookie, vcpuid,
+		    vm->vcpu[vcpuid].tsc_offset - host_tsc);
+		if (rc)
+			return (rc);
+	}
+
+	return (0);
+}
+#endif
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index 9818f300efec..e47b7081b795 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -31,6 +31,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
@@ -53,8 +55,9 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
#include <machine/vmm.h>
-#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>
+#include <machine/vmm_instruction_emul.h>
+#include <machine/vmm_snapshot.h>
#include "vmm_lapic.h"
#include "vmm_stat.h"
@@ -381,6 +384,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_cpu_topology *topology;
uint64_t *regvals;
int *regnums;
+#ifdef BHYVE_SNAPSHOT
+ struct vm_snapshot_meta *snapshot_meta;
+#endif
error = vmm_priv_check(curthread->td_ucred);
if (error)
@@ -784,6 +790,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
&topology->threads, &topology->maxcpus);
error = 0;
break;
+#ifdef BHYVE_SNAPSHOT
+ case VM_SNAPSHOT_REQ:
+ snapshot_meta = (struct vm_snapshot_meta *)data;
+ error = vm_snapshot_req(sc->vm, snapshot_meta);
+ break;
+ case VM_RESTORE_TIME:
+ error = vm_restore_time(sc->vm);
+ break;
+#endif
default:
error = ENOTTY;
break;
diff --git a/sys/amd64/vmm/vmm_snapshot.c b/sys/amd64/vmm/vmm_snapshot.c
new file mode 100644
index 000000000000..c77bb05f76b7
--- /dev/null
+++ b/sys/amd64/vmm/vmm_snapshot.c
@@ -0,0 +1,141 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Flavius Anton
+ * Copyright (c) 2016 Mihai Tiganus
+ * Copyright (c) 2016-2019 Mihai Carabas
+ * Copyright (c) 2017-2019 Darius Mihai
+ * Copyright (c) 2017-2019 Elena Mihailescu
+ * Copyright (c) 2018-2019 Sergiu Weisz
+ * All rights reserved.
+ * The bhyve-snapshot feature was developed under sponsorships
+ * from Matthew Grooms.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/systm.h>
+
+#include <machine/vmm_snapshot.h>
+
+/*
+ * Log that transferring the snapshot item named 'bufname' failed, tagging
+ * the message with the operation ("save", "restore" or "unknown").
+ */
+void
+vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op)
+{
+	const char *what;
+
+	switch (op) {
+	case VM_SNAPSHOT_SAVE:
+		what = "save";
+		break;
+	case VM_SNAPSHOT_RESTORE:
+		what = "restore";
+		break;
+	default:
+		what = "unknown";
+		break;
+	}
+
+	printf("%s: snapshot-%s failed for %s\r\n", __func__, what, bufname);
+}
+
+/*
+ * Transfer 'data_size' bytes between the kernel object at 'data' and the
+ * snapshot buffer described by 'meta'.
+ *
+ * VM_SNAPSHOT_SAVE copies the object out to the buffer; VM_SNAPSHOT_RESTORE
+ * copies from the buffer into the object.  On success the buffer cursor is
+ * advanced and the remaining-space counter reduced by 'data_size'.
+ *
+ * Returns 0 on success, E2BIG if fewer than 'data_size' bytes remain,
+ * EINVAL for an unknown operation, or the error from copyout()/copyin().
+ * The cursor is left untouched on any failure.
+ */
+int
+vm_snapshot_buf(volatile void *data, size_t data_size,
+    struct vm_snapshot_meta *meta)
+{
+	struct vm_snapshot_buffer *buffer;
+	int error, op;
+	void *nv_data;
+
+	nv_data = __DEVOLATILE(void *, data);
+	buffer = &meta->buffer;
+	op = meta->op;
+
+	if (buffer->buf_rem < data_size) {
+		printf("%s: buffer too small\r\n", __func__);
+		return (E2BIG);
+	}
+
+	if (op == VM_SNAPSHOT_SAVE)
+		error = copyout(nv_data, buffer->buf, data_size);
+	else if (op == VM_SNAPSHOT_RESTORE)
+		error = copyin(buffer->buf, nv_data, data_size);
+	else
+		error = EINVAL;
+
+	/* A faulting user buffer must not be reported as a good transfer. */
+	if (error != 0)
+		return (error);
+
+	buffer->buf += data_size;
+	buffer->buf_rem -= data_size;
+
+	return (0);
+}
+
+/*
+ * Return how many bytes of the snapshot buffer in 'meta' have been consumed
+ * so far (buf_size - buf_rem).  If the counters are inconsistent
+ * (buf_rem > buf_size) a diagnostic is printed and 0 is returned.
+ */
+size_t
+vm_get_snapshot_size(struct vm_snapshot_meta *meta)
+{
+	struct vm_snapshot_buffer *buffer = &meta->buffer;
+
+	if (buffer->buf_size >= buffer->buf_rem)
+		return (buffer->buf_size - buffer->buf_rem);
+
+	printf("%s: Invalid buffer: size = %zu, rem = %zu\r\n",
+	    __func__, buffer->buf_size, buffer->buf_rem);
+	return (0);
+}
+
+/*
+ * Save 'data_size' bytes of the kernel object at 'data' to the snapshot
+ * buffer (VM_SNAPSHOT_SAVE), or compare the object against the buffer
+ * contents (VM_SNAPSHOT_RESTORE).
+ *
+ * The snapshot buffer is accessed with copyout()/copyin() only; on restore
+ * it is pulled into a small on-stack bounce buffer chunk by chunk and
+ * compared there, never dereferenced directly.
+ *
+ * Returns 0 on success (or on a matching compare), a non-zero memcmp()
+ * result when the contents differ, E2BIG if fewer than 'data_size' bytes
+ * remain, EINVAL for an unknown operation, or the copyout()/copyin() error.
+ * As before, the cursor is advanced after a completed save or compare
+ * (matching or not); it is left untouched on any other failure.
+ */
+int
+vm_snapshot_buf_cmp(volatile void *data, size_t data_size,
+    struct vm_snapshot_meta *meta)
+{
+	struct vm_snapshot_buffer *buffer;
+	const uint8_t *kdata;
+	uint8_t chunk[64];
+	size_t len, off;
+	int error, op, ret;
+
+	kdata = __DEVOLATILE(const uint8_t *, data);
+	buffer = &meta->buffer;
+	op = meta->op;
+
+	if (buffer->buf_rem < data_size) {
+		printf("%s: buffer too small\r\n", __func__);
+		return (E2BIG);
+	}
+
+	ret = 0;
+	if (op == VM_SNAPSHOT_SAVE) {
+		error = copyout(kdata, buffer->buf, data_size);
+		if (error != 0)
+			return (error);
+	} else if (op == VM_SNAPSHOT_RESTORE) {
+		/* Stop at the first differing chunk; ret keeps its sign. */
+		for (off = 0; off < data_size && ret == 0; off += len) {
+			len = data_size - off;
+			if (len > sizeof(chunk))
+				len = sizeof(chunk);
+
+			error = copyin((const uint8_t *)buffer->buf + off,
+			    chunk, len);
+			if (error != 0)
+				return (error);
+
+			ret = memcmp(kdata + off, chunk, len);
+		}
+	} else {
+		return (EINVAL);
+	}
+
+	buffer->buf += data_size;
+	buffer->buf_rem -= data_size;
+
+	return (ret);
+}
diff --git a/sys/conf/config.mk b/sys/conf/config.mk
index 6b405890458e..50188eee923b 100644
--- a/sys/conf/config.mk
+++ b/sys/conf/config.mk
@@ -15,6 +15,10 @@ opt_global.h:
@echo "#define MAC 1" >> ${.TARGET}
@echo "#define VIMAGE 1" >> ${.TARGET}
.endif
+.if ${MK_BHYVE_SNAPSHOT} != "no"
+opt_bhyve_snapshot.h:
+ @echo "#define BHYVE_SNAPSHOT 1" > ${.TARGET}
+.endif
opt_bpf.h:
echo "#define DEV_BPF 1" > ${.TARGET}
.if ${MK_INET_SUPPORT} != "no"
@@ -45,6 +49,9 @@ KERN_OPTS.powerpc=NEW_PCIB DEV_PCI
KERN_OPTS=MROUTING IEEE80211_DEBUG \
IEEE80211_SUPPORT_MESH DEV_BPF \
${KERN_OPTS.${MACHINE}} ${KERN_OPTS_EXTRA}
+.if ${MK_BHYVE_SNAPSHOT} != "no"
+KERN_OPTS+= BHYVE_SNAPSHOT
+.endif
.if ${MK_INET_SUPPORT} != "no"
KERN_OPTS+= INET TCP_OFFLOAD
.endif
diff --git a/sys/conf/kern.opts.mk b/sys/conf/kern.opts.mk
index 078d79913634..bc7ddfd06e7c 100644
--- a/sys/conf/kern.opts.mk
+++ b/sys/conf/kern.opts.mk
@@ -49,6 +49,7 @@ __DEFAULT_YES_OPTIONS = \
ZFS
__DEFAULT_NO_OPTIONS = \
+ BHYVE_SNAPSHOT \
EXTRA_TCP_STACKS \
KERNEL_RETPOLINE \
OFED \
diff --git a/sys/conf/options.amd64 b/sys/conf/options.amd64
index 8939ddaf6246..cd90747ba732 100644
--- a/sys/conf/options.amd64
+++ b/sys/conf/options.amd64
@@ -3,6 +3,7 @@
AUTO_EOI_1 opt_auto_eoi.h
AUTO_EOI_2 opt_auto_eoi.h
+BHYVE_SNAPSHOT
COUNT_XINVLTLB_HITS opt_smp.h
COUNT_IPIS opt_smp.h
MAXMEM
diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile
index 9471fc9074dc..b5d62c358272 100644
--- a/sys/modules/vmm/Makefile
+++ b/sys/modules/vmm/Makefile
@@ -1,8 +1,11 @@
# $FreeBSD$
+.include <kmod.opts.mk>
+
KMOD= vmm
-SRCS= opt_acpi.h opt_ddb.h device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h
+SRCS= opt_acpi.h opt_bhyve_snapshot.h opt_ddb.h
+SRCS+= device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h vnode_if.h
DPSRCS+= vmx_assym.h svm_assym.h
DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc
@@ -55,6 +58,10 @@ SRCS+= vmcb.c \
amdvi_hw.c \
svm_msr.c
+.if ${KERN_OPTS:MBHYVE_SNAPSHOT} != ""
+SRCS+= vmm_snapshot.c
+.endif
+
CLEANFILES= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o
OBJS_DEPEND_GUESS.vmx_support.o+= vmx_assym.h
diff --git a/tools/build/options/WITH_BHYVE_SNAPSHOT b/tools/build/options/WITH_BHYVE_SNAPSHOT
new file mode 100644
index 000000000000..7e673f51c8bb
--- /dev/null
+++ b/tools/build/options/WITH_BHYVE_SNAPSHOT
@@ -0,0 +1,7 @@
+.\" $FreeBSD$
+Set to include support for save and restore (snapshots) in
+.Xr bhyve 8
+and
+.Xr bhyvectl 8 .
+.Pp
+This option only affects amd64/amd64.
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
index 12bd477825bf..9a4460a1b90f 100644
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -72,10 +72,17 @@ SRCS= \
spinup_ap.c \
iov.c
+.if ${MK_BHYVE_SNAPSHOT} != "no"
+SRCS+= snapshot.c
+.endif
+
.PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm
SRCS+= vmm_instruction_emul.c
LIBADD= vmmapi md pthread z util sbuf cam
+.if ${MK_BHYVE_SNAPSHOT} != "no"
+LIBADD+= ucl xo
+.endif
.if ${MK_INET_SUPPORT} != "no"
CFLAGS+=-DINET
@@ -92,6 +99,14 @@ LIBADD+= crypto
CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000
CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii
CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller
+.if ${MK_BHYVE_SNAPSHOT} != "no"
+CFLAGS+= -I${SRCTOP}/contrib/libucl/include
+
+# Temporarily disable Capsicum until the checkpoint code is integrated with it.
+CFLAGS+= -DWITHOUT_CAPSICUM
+
+CFLAGS+= -DBHYVE_SNAPSHOT
+.endif
.ifdef GDB_LOG
CFLAGS+=-DGDB_LOG
diff --git a/usr.sbin/bhyve/Makefile.depend b/usr.sbin/bhyve/Makefile.depend
index 8d3ff079d277..8222ceb6ad25 100644
--- a/usr.sbin/bhyve/Makefile.depend
+++ b/usr.sbin/bhyve/Makefile.depend
@@ -13,8 +13,10 @@ DIRDEPS = \
lib/libcompiler_rt \
lib/libsbuf \
lib/libthr \
+ lib/libucl \
lib/libutil \
lib/libvmmapi \
+ lib/libxo \
lib/libz \
diff --git a/usr.sbin/bhyve/atkbdc.c b/usr.sbin/bhyve/atkbdc.c
index 1c1838c2e80c..a08f58f84b22 100644
--- a/usr.sbin/bhyve/atkbdc.c
+++ b/usr.sbin/bhyve/atkbdc.c
@@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include <vmmapi.h>
@@ -137,6 +138,10 @@ struct atkbdc_softc {
struct aux_dev aux;
};
+#ifdef BHYVE_SNAPSHOT
+static struct atkbdc_softc *atkbdc_sc = NULL;
+#endif
+
static void
atkbdc_assert_kbd_intr(struct atkbdc_softc *sc)
{
@@ -548,7 +553,48 @@ atkbdc_init(struct vmctx *ctx)
sc->ps2kbd_sc = ps2kbd_init(sc);
sc->ps2mouse_sc = ps2mouse_init(sc);
+
+#ifdef BHYVE_SNAPSHOT
+ assert(atkbdc_sc == NULL);
+ atkbdc_sc = sc;
+#endif
+}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the AT keyboard controller state held in the file-scope
+ * 'atkbdc_sc' softc, followed by the attached PS/2 keyboard and mouse.
+ *
+ * Returns 0 on success or the status of the first failing step.
+ */
+int
+atkbdc_snapshot(struct vm_snapshot_meta *meta)
+{
+	int error;
+
+	SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->status, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->outport, meta, error, out);
+	SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->ram,
+	    sizeof(atkbdc_sc->ram), meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->curcmd, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->ctrlbyte, meta, error, out);
+	/*
+	 * NOTE(review): 'kbd' is transferred as a whole here and then field
+	 * by field below.  This looks redundant, but it is part of the
+	 * snapshot layout, so it is kept as is.
+	 */
+	SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd, meta, error, out);
+
+	SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq_active, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq, meta, error, out);
+	SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->kbd.buffer,
+	    sizeof(atkbdc_sc->kbd.buffer), meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.brd, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bwr, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bcnt, meta, error, out);
+
+	SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq_active, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq, meta, error, out);
+
+	/* The mouse is only handled once the keyboard succeeded. */
+	error = ps2kbd_snapshot(atkbdc_sc->ps2kbd_sc, meta);
+	if (error == 0)
+		error = ps2mouse_snapshot(atkbdc_sc->ps2mouse_sc, meta);
+
+out:
+	return (error);
}
+#endif
static void
atkbdc_dsdt(void)
diff --git a/usr.sbin/bhyve/atkbdc.h b/usr.sbin/bhyve/atkbdc.h
index 85c8a7141eb2..14c00ed9ae88 100644
--- a/usr.sbin/bhyve/atkbdc.h
+++ b/usr.sbin/bhyve/atkbdc.h
@@ -30,9 +30,14 @@
#define _ATKBDC_H_
struct atkbdc_softc;
+struct vm_snapshot_meta;
struct vmctx;
void atkbdc_init(struct vmctx *ctx);
void atkbdc_event(struct atkbdc_softc *sc, int iskbd);
+#ifdef BHYVE_SNAPSHOT
+int atkbdc_snapshot(struct vm_snapshot_meta *meta);
+#endif
+
#endif /* _ATKBDC_H_ */
diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
index 5d329a54491e..85e5f0256fbf 100644
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd April 22, 2020
+.Dd May 04, 2020
.Dt BHYVE 8
.Os
.Sh NAME
@@ -61,6 +61,7 @@
.Sm on
.Oc
.Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu
+.Op Fl r Ar file
.Oo Fl s
.Sm off
.Cm help | Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf
@@ -191,6 +192,21 @@ to
.Em hostcpu .
.It Fl P
Force the guest virtual CPU to exit when a PAUSE instruction is detected.
+.It Fl r Ar file
+Resume a guest from a snapshot.
+The guest memory contents are restored from
+.Ar file ,
+and the guest device and vCPU state are restored from the file
+.Dq Ar file Ns .kern .
+.Pp
+Note that the current snapshot file format requires that the configuration of
+devices in the new VM match the VM from which the snapshot was taken by specifying the
+same
+.Op Fl s
+and
+.Op Fl l
+options.
+The count of vCPUs and memory configuration are read from the snapshot.
.It Fl s Op Ar help|slot,emulation Ns Op , Ns Ar conf
Configure a virtual PCI slot and function.
.Pp
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index 324e40d2cda2..8d73bd38cae4 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -36,7 +36,14 @@ __FBSDID("$FreeBSD$");
#include <sys/capsicum.h>
#endif
#include <sys/mman.h>
+#ifdef BHYVE_SNAPSHOT
+#include <sys/socket.h>
+#include <sys/stat.h>
+#endif
#include <sys/time.h>
+#ifdef BHYVE_SNAPSHOT
+#include <sys/un.h>
+#endif
#include <amd64/vmm/intel/vmcs.h>
@@ -51,6 +58,9 @@ __FBSDID("$FreeBSD$");
#include <string.h>
#include <err.h>
#include <errno.h>
+#ifdef BHYVE_SNAPSHOT
+#include <fcntl.h>
+#endif
#include <libgen.h>
#include <unistd.h>
#include <assert.h>
@@ -59,6 +69,12 @@ __FBSDID("$FreeBSD$");
#include <sysexits.h>
#include <stdbool.h>
#include <stdint.h>
+#ifdef BHYVE_SNAPSHOT
+#include <ucl.h>
+#include <unistd.h>
+
+#include <libxo/xo.h>
+#endif
#include <machine/vmm.h>
#ifndef WITHOUT_CAPSICUM
@@ -83,6 +99,9 @@ __FBSDID("$FreeBSD$");
#include "pci_irq.h"
#include "pci_lpc.h"
#include "smbiostbl.h"
+#ifdef BHYVE_SNAPSHOT
+#include "snapshot.h"
+#endif
#include "xmsr.h"
#include "spinup_ap.h"
#include "rtc.h"
@@ -163,7 +182,7 @@ static const char * const vmx_exit_reason_desc[] = {
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
-char *vmname;
+const char *vmname;
int guest_ncpus;
uint16_t cores, maxcpus, sockets, threads;
@@ -229,6 +248,9 @@ usage(int code)
" -H: vmexit from the guest on hlt\n"
" -l: LPC device configuration\n"
" -m: memory size in MB\n"
+#ifdef BHYVE_SNAPSHOT
+ " -r: path to checkpoint file\n"
+#endif
" -p: pin 'vcpu' to 'hostcpu'\n"
" -P: vmexit from the guest on pause\n"
" -s: <slot,driver,configinfo> PCI slot config\n"
@@ -388,6 +410,14 @@ paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
return (vm_map_gpa(ctx, gaddr, len));
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Translate the host address 'addr' into a guest address by delegating to
+ * vm_rev_map_gpa(); the inverse direction of paddr_guest2host().
+ */
+uintptr_t
+paddr_host2guest(struct vmctx *ctx, void *addr)
+{
+	uintptr_t gaddr;
+
+	gaddr = vm_rev_map_gpa(ctx, addr);
+	return (gaddr);
+}
+#endif
+
int
fbsdrun_vmexit_on_pause(void)
{
@@ -422,6 +452,9 @@ fbsdrun_start_thread(void *param)
snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
pthread_set_name_np(mtp->mt_thr, tname);
+#ifdef BHYVE_SNAPSHOT
+ checkpoint_cpu_add(vcpu);
+#endif
if (gdb_port != 0)
gdb_cpu_add(vcpu);
@@ -697,11 +730,15 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
stats.vmexit_mtrap++;
- if (gdb_port == 0) {
- fprintf(stderr, "vm_loop: unexpected VMEXIT_MTRAP\n");
- exit(4);
- }
- gdb_cpu_mtrap(*pvcpu);
+#ifdef BHYVE_SNAPSHOT
+ checkpoint_cpu_suspend(*pvcpu);
+#endif
+ if (gdb_port != 0)
+ gdb_cpu_mtrap(*pvcpu);
+#ifdef BHYVE_SNAPSHOT
+ checkpoint_cpu_resume(*pvcpu);
+#endif
+
return (VMEXIT_CONTINUE);
}
@@ -778,11 +815,14 @@ static int
vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
- if (gdb_port == 0) {
- fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n");
- exit(4);
- }
- gdb_cpu_suspend(*pvcpu);
+#ifdef BHYVE_SNAPSHOT
+ checkpoint_cpu_suspend(*pvcpu);
+#endif
+ if (gdb_port != 0)
+ gdb_cpu_suspend(*pvcpu);
+#ifdef BHYVE_SNAPSHOT
+ checkpoint_cpu_resume(*pvcpu);
+#endif
return (VMEXIT_CONTINUE);
}
@@ -997,6 +1037,22 @@ do_open(const char *vmname)
return (ctx);
}
+/*
+ * Start vCPU 'vcpu' at the RIP currently stored in its register state.
+ *
+ * The vCPU's capabilities are configured, unrestricted-guest mode is
+ * enabled, and the vCPU is then added with the BSP as its originator.
+ * Any failure to read the RIP or set the capability trips an assertion.
+ */
+void
+spinup_vcpu(struct vmctx *ctx, int vcpu)
+{
+	uint64_t start_rip;
+	int error;
+
+	/* Fetch the instruction pointer the vCPU will start from. */
+	error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &start_rip);
+	assert(error == 0);
+
+	fbsdrun_set_capabilities(ctx, vcpu);
+
+	error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
+	assert(error == 0);
+
+	fbsdrun_addcpu(ctx, BSP, vcpu, start_rip);
+}
+
int
main(int argc, char *argv[])
{
@@ -1008,6 +1064,13 @@ main(int argc, char *argv[])
uint64_t rip;
size_t memsize;
char *optstr;
+#ifdef BHYVE_SNAPSHOT
+ char *restore_file;
+ struct restore_state rstate;
+ int vcpu;
+
+ restore_file = NULL;
+#endif
bvmcons = 0;
progname = basename(argv[0]);
@@ -1021,7 +1084,11 @@ main(int argc, char *argv[])
rtc_localtime = 1;
memflags = 0;
+#ifdef BHYVE_SNAPSHOT
+ optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:r:";
+#else
optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:";
+#endif
while ((c = getopt(argc, argv, optstr)) != -1) {
switch (c) {
case 'a':
@@ -1067,6 +1134,11 @@ main(int argc, char *argv[])
"configuration '%s'", optarg);
}
break;
+#ifdef BHYVE_SNAPSHOT
+ case 'r':
+ restore_file = optarg;
+ break;
+#endif
case 's':
if (strncmp(optarg, "help", strlen(optarg)) == 0) {
pci_print_supported_devices();
@@ -1128,12 +1200,50 @@ main(int argc, char *argv[])
argc -= optind;
argv += optind;
+#ifdef BHYVE_SNAPSHOT
+ if (argc > 1 || (argc == 0 && restore_file == NULL))
+ usage(1);
+
+ if (restore_file != NULL) {
+ error = load_restore_file(restore_file, &rstate);
+ if (error) {
+ fprintf(stderr, "Failed to read checkpoint info from "
+ "file: '%s'.\n", restore_file);
+ exit(1);
+ }
+ }
+
+ if (argc == 1) {
+ vmname = argv[0];
+ } else {
+ vmname = lookup_vmname(&rstate);
+ if (vmname == NULL) {
+ fprintf(stderr, "Cannot find VM name in restore file. "
+ "Please specify one.\n");
+ exit(1);
+ }
+ }
+#else
if (argc != 1)
usage(1);
vmname = argv[0];
+#endif
ctx = do_open(vmname);
+#ifdef BHYVE_SNAPSHOT
+ if (restore_file != NULL) {
+ guest_ncpus = lookup_guest_ncpus(&rstate);
+ memflags = lookup_memflags(&rstate);
+ memsize = lookup_memsize(&rstate);
+ }
+
+ if (guest_ncpus < 1) {
+ fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
+ exit(1);
+ }
+#endif
+
max_vcpus = num_vcpus_allowed(ctx);
if (guest_ncpus > max_vcpus) {
fprintf(stderr, "%d vCPUs requested but only %d available\n",
@@ -1200,6 +1310,40 @@ main(int argc, char *argv[])
assert(error == 0);
}
+#ifdef BHYVE_SNAPSHOT
+ if (restore_file != NULL) {
+ fprintf(stdout, "Pausing pci devs...\r\n");
+ if (vm_pause_user_devs(ctx) != 0) {
+ fprintf(stderr, "Failed to pause PCI device state.\n");
+ exit(1);
+ }
+
+ fprintf(stdout, "Restoring vm mem...\r\n");
+ if (restore_vm_mem(ctx, &rstate) != 0) {
+ fprintf(stderr, "Failed to restore VM memory.\n");
+ exit(1);
+ }
+
+ fprintf(stdout, "Restoring pci devs...\r\n");
+ if (vm_restore_user_devs(ctx, &rstate) != 0) {
+ fprintf(stderr, "Failed to restore PCI device state.\n");
+ exit(1);
+ }
+
+ fprintf(stdout, "Restoring kernel structs...\r\n");
+ if (vm_restore_kern_structs(ctx, &rstate) != 0) {
+ fprintf(stderr, "Failed to restore kernel structs.\n");
+ exit(1);
+ }
+
+ fprintf(stdout, "Resuming pci devs...\r\n");
+ if (vm_resume_user_devs(ctx) != 0) {
+ fprintf(stderr, "Failed to resume PCI device state.\n");
+ exit(1);
+ }
+ }
+#endif
+
error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
@@ -1240,11 +1384,41 @@ main(int argc, char *argv[])
errx(EX_OSERR, "cap_enter() failed");
#endif
+#ifdef BHYVE_SNAPSHOT
+ if (restore_file != NULL)
+ destroy_restore_state(&rstate);
+
+ /*
+ * checkpointing thread for communication with bhyvectl
+ */
+ if (init_checkpoint_thread(ctx) < 0)
+ printf("Failed to start checkpoint thread!\r\n");
+
+ if (restore_file != NULL)
+ vm_restore_time(ctx);
+#endif
+
/*
* Add CPU 0
*/
fbsdrun_addcpu(ctx, BSP, BSP, rip);
+#ifdef BHYVE_SNAPSHOT
+ /*
+ * If we restore a VM, start all vCPUs now (including APs), otherwise,
+ * let the guest OS to spin them up later via vmexits.
+ */
+ if (restore_file != NULL) {
+ for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
+ if (vcpu == BSP)
+ continue;
+
+ fprintf(stdout, "spinning up vcpu no %d...\r\n", vcpu);
+ spinup_vcpu(ctx, vcpu);
+ }
+ }
+#endif
+
/*
* Head off to the main event dispatch loop
*/
diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h
index 0b23a6a5c3ae..0177baca14e9 100644
--- a/usr.sbin/bhyve/bhyverun.h
+++ b/usr.sbin/bhyve/bhyverun.h
@@ -38,9 +38,12 @@ struct vmctx;
extern int guest_ncpus;
extern uint16_t cores, sockets, threads;
extern char *guest_uuid_str;
-extern char *vmname;
+extern const char *vmname;
void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len);
+#ifdef BHYVE_SNAPSHOT
+uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr);
+#endif
void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu);
void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip);
diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c
index 50e1eed12f90..4c91038ca765 100644
--- a/usr.sbin/bhyve/block_if.c
+++ b/usr.sbin/bhyve/block_if.c
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include <unistd.h>
#include <machine/atomic.h>
+#include <machine/vmm_snapshot.h>
#include "bhyverun.h"
#include "debug.h"
@@ -105,9 +106,13 @@ struct blockif_ctxt {
int bc_psectsz;
int bc_psectoff;
int bc_closing;
+ int bc_paused;
+ int bc_work_count;
pthread_t bc_btid[BLOCKIF_NUMTHR];
pthread_mutex_t bc_mtx;
pthread_cond_t bc_cond;
+ pthread_cond_t bc_paused_cond;
+ pthread_cond_t bc_work_done_cond;
/* Request elements and free/pending/busy queues */
TAILQ_HEAD(, blockif_elem) bc_freeq;
@@ -210,6 +215,18 @@ blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}
+/*
+ * Flush the backing store of 'bc': DIOCGFLUSH for a character device,
+ * fsync() otherwise.
+ *
+ * Returns 0 on success or the errno of the failed call.
+ */
+static int
+blockif_flush_bc(struct blockif_ctxt *bc)
+{
+	int rc;
+
+	if (bc->bc_ischr)
+		rc = ioctl(bc->bc_fd, DIOCGFLUSH);
+	else
+		rc = fsync(bc->bc_fd);
+
+	return (rc != 0 ? errno : 0);
+}
+
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
@@ -300,11 +317,7 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
}
break;
case BOP_FLUSH:
- if (bc->bc_ischr) {
- if (ioctl(bc->bc_fd, DIOCGFLUSH))
- err = errno;
- } else if (fsync(bc->bc_fd))
- err = errno;
+ err = blockif_flush_bc(bc);
break;
case BOP_DELETE:
if (!bc->bc_candelete)
@@ -348,15 +361,30 @@ blockif_thr(void *arg)
pthread_mutex_lock(&bc->bc_mtx);
for (;;) {
- while (blockif_dequeue(bc, t, &be)) {
+ bc->bc_work_count++;
+
+ /* We cannot process work if the interface is paused */
+ while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) {
pthread_mutex_unlock(&bc->bc_mtx);
blockif_proc(bc, be, buf);
pthread_mutex_lock(&bc->bc_mtx);
blockif_complete(bc, be);
}
+
+ bc->bc_work_count--;
+
+ /* If none of the workers are busy, notify the main thread */
+ if (bc->bc_work_count == 0)
+ pthread_cond_broadcast(&bc->bc_work_done_cond);
+
/* Check ctxt status here to see if exit requested */
if (bc->bc_closing)
break;
+
+ /* Make all worker threads wait here if the device is paused */
+ while (bc->bc_paused)
+ pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx);
+
pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
}
pthread_mutex_unlock(&bc->bc_mtx);
@@ -565,6 +593,10 @@ blockif_open(const char *optstr, const char *ident)
bc->bc_psectoff = psectoff;
pthread_mutex_init(&bc->bc_mtx, NULL);
pthread_cond_init(&bc->bc_cond, NULL);
+ bc->bc_paused = 0;
+ bc->bc_work_count = 0;
+ pthread_cond_init(&bc->bc_paused_cond, NULL);
+ pthread_cond_init(&bc->bc_work_done_cond, NULL);
TAILQ_INIT(&bc->bc_freeq);
TAILQ_INIT(&bc->bc_pendq);
TAILQ_INIT(&bc->bc_busyq);
@@ -657,6 +689,8 @@ blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
assert(bc->bc_magic == BLOCKIF_SIG);
pthread_mutex_lock(&bc->bc_mtx);
+ /* XXX: not waiting while paused */
+
/*
* Check pending requests.
*/
@@ -855,3 +889,100 @@ blockif_candelete(struct blockif_ctxt *bc)
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_candelete);
}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Pause a block interface: mark it paused, wait until no worker thread is
+ * still counted in bc_work_count, then flush the backing store.  A flush
+ * failure is only reported on stderr; the function itself returns nothing.
+ */
+void
+blockif_pause(struct blockif_ctxt *bc)
+{
+	int flush_error;
+
+	assert(bc != NULL);
+	assert(bc->bc_magic == BLOCKIF_SIG);
+
+	pthread_mutex_lock(&bc->bc_mtx);
+
+	/* Set the flag first so that workers stop dequeueing new requests. */
+	bc->bc_paused = 1;
+
+	/* Sleep until the workers signal that bc_work_count dropped to 0. */
+	while (bc->bc_work_count != 0)
+		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
+
+	pthread_mutex_unlock(&bc->bc_mtx);
+
+	flush_error = blockif_flush_bc(bc);
+	if (flush_error != 0) {
+		fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
+		    __func__);
+	}
+}
+
+/*
+ * Resume a block interface: clear bc_paused under bc_mtx and broadcast on
+ * both bc_paused_cond and bc_cond so that waiting worker threads run again.
+ */
+void
+blockif_resume(struct blockif_ctxt *bc)
+{
+ assert(bc != NULL);
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ bc->bc_paused = 0;
+ /* resume the threads waiting for paused */
+ pthread_cond_broadcast(&bc->bc_paused_cond);
+ /* kick the threads after restore */
+ pthread_cond_broadcast(&bc->bc_cond);
+ pthread_mutex_unlock(&bc->bc_mtx);
+}
+
+/*
+ * Save or restore a single block request through the snapshot macros.
+ *
+ * The fields are processed in a fixed order (br_iovcnt, br_offset,
+ * br_resid, then each iovec's length and base); that order defines the
+ * layout of the request inside the snapshot and must not be changed.
+ * Returns the value left in 'ret' by the snapshot macros.
+ * NOTE(review): the SNAPSHOT_*_OR_LEAVE macros are defined outside this
+ * file; presumably they set 'ret' and jump to the label on failure.
+ */
+int
+blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta)
+{
+ int i;
+ struct iovec *iov;
+ int ret;
+
+ SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done);
+
+ /*
+ * XXX: The callback and parameter must be filled by the virtualized
+ * device that uses the interface, during its init; we're not touching
+ * them here.
+ */
+
+ /* Snapshot the iovecs. */
+ for (i = 0; i < br->br_iovcnt; i++) {
+ iov = &br->br_iov[i];
+
+ /* The length goes first: the address macro below needs it. */
+ SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done);
+
+ /* We assume the iov is a guest-mapped address. */
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len,
+ false, meta, ret, done);
+ }
+
+done:
+ return (ret);
+}
+
+/*
+ * Save or restore the state of a block interface context.
+ *
+ * The interface must have been paused with blockif_pause() beforehand;
+ * otherwise ENXIO is returned and nothing is written or read.  On the
+ * normal path the value left in 'ret' by the snapshot macros is returned.
+ * The order of the fields below defines the snapshot layout and must not
+ * be changed.
+ */
+int
+blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
+{
+	int ret;
+
+	pthread_mutex_lock(&bc->bc_mtx);
+
+	/*
+	 * bc_paused is only ever written with bc_mtx held, so it has to be
+	 * read with the mutex held as well.
+	 */
+	if (bc->bc_paused == 0) {
+		fprintf(stderr, "%s: Snapshot failed: "
+		    "interface not paused.\r\n", __func__);
+		ret = ENXIO;
+		goto done;
+	}
+
+	SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done);
+
+done:
+	pthread_mutex_unlock(&bc->bc_mtx);
+	return (ret);
+}
+#endif
diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h
index 75c016447ac2..f3b5b6938ef1 100644
--- a/usr.sbin/bhyve/block_if.h
+++ b/usr.sbin/bhyve/block_if.h
@@ -41,6 +41,9 @@
#include <sys/uio.h>
#include <sys/unistd.h>
+struct vm_snapshot_meta;
+
+
/*
* BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in
* a single request. BLOCKIF_RING_MAX is the maxmimum number of
@@ -74,5 +77,13 @@ int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_close(struct blockif_ctxt *bc);
+#ifdef BHYVE_SNAPSHOT
+void blockif_pause(struct blockif_ctxt *bc);
+void blockif_resume(struct blockif_ctxt *bc);
+int blockif_snapshot_req(struct blockif_req *br,
+ struct vm_snapshot_meta *meta);
+int blockif_snapshot(struct blockif_ctxt *bc,
+ struct vm_snapshot_meta *meta);
+#endif
#endif /* _BLOCK_IF_H_ */
diff --git a/usr.sbin/bhyve/mevent.c b/usr.sbin/bhyve/mevent.c
index c0c69d37f311..649a6b09cb34 100644
--- a/usr.sbin/bhyve/mevent.c
+++ b/usr.sbin/bhyve/mevent.c
@@ -63,7 +63,7 @@ __FBSDID("$FreeBSD$");
#define MEVENT_MAX 64
-extern char *vmname;
+extern const char *vmname;
static pthread_t mevent_tid;
static int mevent_timid = 43;
diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c
index 23aee0fdac6b..49e6452a355d 100644
--- a/usr.sbin/bhyve/pci_ahci.c
+++ b/usr.sbin/bhyve/pci_ahci.c
@@ -41,6 +41,8 @@ __FBSDID("$FreeBSD$");
#include <sys/ata.h>
#include <sys/endian.h>
+#include <machine/vmm_snapshot.h>
+
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
@@ -131,6 +133,7 @@ struct ahci_ioreq {
uint32_t done;
int slot;
int more;
+ int readop;
};
struct ahci_port {
@@ -724,6 +727,7 @@ ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
aior->slot = slot;
aior->len = len;
aior->done = done;
+ aior->readop = readop;
breq = &aior->io_req;
breq->br_offset = lba + done;
ahci_build_iov(p, aior, prdt, hdr->prdtl);
@@ -1420,6 +1424,7 @@ atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
aior->slot = slot;
aior->len = len;
aior->done = done;
+ aior->readop = 1;
breq = &aior->io_req;
breq->br_offset = lba + done;
ahci_build_iov(p, aior, prdt, hdr->prdtl);
@@ -2446,6 +2451,282 @@ pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
return (pci_ahci_init(ctx, pi, opts, 1));
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save the free and busy request queues of an AHCI port.
+ *
+ * Each queue is written as a list of indices into port->ioreq[] and is
+ * terminated by the sentinel index -1.  Every entry of the busy queue is
+ * additionally followed by its block request, written by
+ * blockif_snapshot_req().  Returns 0 on success or the error left in
+ * 'ret' by the failing step.
+ */
+static int
+pci_ahci_snapshot_save_queues(struct ahci_port *port,
+    struct vm_snapshot_meta *meta)
+{
+	int ret;
+	int idx;
+	struct ahci_ioreq *ioreq;
+
+	STAILQ_FOREACH(ioreq, &port->iofhd, io_flist) {
+		/*
+		 * Subtracting typed pointers yields the element index
+		 * directly and avoids non-standard void pointer arithmetic.
+		 */
+		idx = ioreq - port->ioreq;
+		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+	}
+
+	/* End-of-list marker for the free queue. */
+	idx = -1;
+	SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+
+	TAILQ_FOREACH(ioreq, &port->iobhd, io_blist) {
+		idx = ioreq - port->ioreq;
+		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+
+		/*
+		 * Snapshot only the busy requests; other requests are
+		 * not valid.
+		 */
+		ret = blockif_snapshot_req(&ioreq->io_req, meta);
+		if (ret != 0) {
+			fprintf(stderr, "%s: failed to snapshot req\r\n",
+			    __func__);
+			goto done;
+		}
+	}
+
+	/* End-of-list marker for the busy queue. */
+	idx = -1;
+	SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+
+done:
+	return (ret);
+}
+
+/*
+ * Restore the free and busy request queues of an AHCI port.
+ *
+ * The snapshot holds two lists of indices into port->ioreq[], each one
+ * terminated by -1.  The free queue is emptied and rebuilt from the first
+ * list.  Every entry of the second list is appended to the busy queue,
+ * its block request is restored with blockif_snapshot_req(), and the
+ * request is handed to the block interface again as a read or a write
+ * depending on ioreq->readop.
+ *
+ * Indices come from the snapshot file, so each one is checked against
+ * port->ioqsz before it is used; an out-of-range index fails the restore
+ * with EINVAL.  Returns 0 on success or the error of the failing step.
+ */
+static int
+pci_ahci_snapshot_restore_queues(struct ahci_port *port,
+    struct vm_snapshot_meta *meta)
+{
+	int ret;
+	int idx;
+	struct ahci_ioreq *ioreq;
+
+	/* Empty the free queue before restoring. */
+	while (!STAILQ_EMPTY(&port->iofhd))
+		STAILQ_REMOVE_HEAD(&port->iofhd, io_flist);
+
+	/* Restore the free queue. */
+	while (1) {
+		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+		if (idx == -1)
+			break;
+
+		/* Never index port->ioreq[] with an unchecked value. */
+		if (idx < 0 || idx >= (int)port->ioqsz) {
+			fprintf(stderr, "%s: invalid free request index %d\r\n",
+			    __func__, idx);
+			ret = EINVAL;
+			goto done;
+		}
+
+		STAILQ_INSERT_TAIL(&port->iofhd, &port->ioreq[idx], io_flist);
+	}
+
+	/* Restore the busy queue. */
+	while (1) {
+		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+		if (idx == -1)
+			break;
+
+		if (idx < 0 || idx >= (int)port->ioqsz) {
+			fprintf(stderr, "%s: invalid busy request index %d\r\n",
+			    __func__, idx);
+			ret = EINVAL;
+			goto done;
+		}
+
+		ioreq = &port->ioreq[idx];
+		TAILQ_INSERT_TAIL(&port->iobhd, ioreq, io_blist);
+
+		/*
+		 * Restore only the busy requests; other requests are
+		 * not valid.
+		 */
+		ret = blockif_snapshot_req(&ioreq->io_req, meta);
+		if (ret != 0) {
+			fprintf(stderr, "%s: failed to restore request\r\n",
+			    __func__);
+			goto done;
+		}
+
+		/* Re-enqueue the requests in the block interface. */
+		if (ioreq->readop)
+			ret = blockif_read(port->bctx, &ioreq->io_req);
+		else
+			ret = blockif_write(port->bctx, &ioreq->io_req);
+
+		if (ret != 0) {
+			fprintf(stderr,
+			    "%s: failed to re-enqueue request\r\n",
+			    __func__);
+			goto done;
+		}
+	}
+
+done:
+	return (ret);
+}
+
+/*
+ * Save or restore the state of an AHCI controller and all of its ports.
+ *
+ * meta->dev_data holds the pci_devinst of the controller.  The controller
+ * registers are processed first, followed by every port slot up to
+ * MAX_PORTS.  Ports without a block backend are skipped after a
+ * consistency check.  For populated ports the port registers, the
+ * per-request bookkeeping, the request queues and finally the block
+ * interface state are processed, in that order.  The order of the
+ * snapshot macro invocations defines the snapshot layout and must not be
+ * changed.  Returns 0 on success, EINVAL on a port mismatch or an unknown
+ * operation, or the error of the failing step.
+ */
+static int
+pci_ahci_snapshot(struct vm_snapshot_meta *meta)
+{
+ int i, j, ret;
+ void *bctx;
+ struct pci_devinst *pi;
+ struct pci_ahci_softc *sc;
+ struct ahci_port *port;
+ struct ahci_cmd_hdr *hdr;
+ struct ahci_ioreq *ioreq;
+
+ pi = meta->dev_data;
+ sc = pi->pi_arg;
+
+ /* TODO: add mtx lock/unlock */
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done);
+
+ for (i = 0; i < MAX_PORTS; i++) {
+ port = &sc->port[i];
+
+ /*
+ * The backend pointer is stored only as a "port was populated"
+ * marker; on restore the value read here is compared against the
+ * live port->bctx for NULL-ness and never dereferenced.
+ */
+ if (meta->op == VM_SNAPSHOT_SAVE)
+ bctx = port->bctx;
+
+ SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done);
+
+ /* Mostly for restore; save is ensured by the lines above. */
+ if (((bctx == NULL) && (port->bctx != NULL)) ||
+ ((bctx != NULL) && (port->bctx == NULL))) {
+ fprintf(stderr, "%s: ports not matching\r\n", __func__);
+ ret = EINVAL;
+ goto done;
+ }
+
+ /* Nothing more is stored for ports without a backend. */
+ if (port->bctx == NULL)
+ continue;
+
+ if (port->port != i) {
+ fprintf(stderr, "%s: ports not matching: "
+ "actual: %d expected: %d\r\n",
+ __func__, port->port, i);
+ ret = EINVAL;
+ goto done;
+ }
+
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->cmd_lst,
+ AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done);
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->rfis, 256, false, meta,
+ ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(port->ident, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done);
+
+ for (j = 0; j < port->ioqsz; j++) {
+ ioreq = &port->ioreq[j];
+
+ /* blockif_req snapshot done only for busy requests. */
+ /*
+ * NOTE(review): hdr is derived from ioreq->slot before that
+ * field is processed a few lines below; on restore this looks
+ * like it uses the pre-restore slot value - confirm.
+ */
+ hdr = (struct ahci_cmd_hdr *)(port->cmd_lst +
+ ioreq->slot * AHCI_CL_SIZE);
+ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ioreq->cfis,
+ 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry),
+ false, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(ioreq->len, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(ioreq->done, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(ioreq->slot, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(ioreq->more, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(ioreq->readop, meta, ret, done);
+ }
+
+ /* Perform save / restore specific operations. */
+ if (meta->op == VM_SNAPSHOT_SAVE) {
+ ret = pci_ahci_snapshot_save_queues(port, meta);
+ if (ret != 0)
+ goto done;
+ } else if (meta->op == VM_SNAPSHOT_RESTORE) {
+ ret = pci_ahci_snapshot_restore_queues(port, meta);
+ if (ret != 0)
+ goto done;
+ } else {
+ ret = EINVAL;
+ goto done;
+ }
+
+ /*
+ * NOTE(review): this path runs for save as well, although the
+ * message below only mentions restore.
+ */
+ ret = blockif_snapshot(port->bctx, meta);
+ if (ret != 0) {
+ fprintf(stderr, "%s: failed to restore blockif\r\n",
+ __func__);
+ goto done;
+ }
+ }
+
+done:
+ return (ret);
+}
+
+/*
+ * Pause the block interface of every AHCI port that has one attached.
+ * Always returns 0.
+ */
+static int
+pci_ahci_pause(struct vmctx *ctx, struct pci_devinst *pi)
+{
+	struct pci_ahci_softc *softc = pi->pi_arg;
+	struct blockif_ctxt *backend;
+	int portno;
+
+	for (portno = 0; portno < MAX_PORTS; portno++) {
+		backend = softc->port[portno].bctx;
+		/* Unpopulated ports have no backend to pause. */
+		if (backend != NULL)
+			blockif_pause(backend);
+	}
+
+	return (0);
+}
+
+/*
+ * Resume the block interface of every AHCI port that has one attached.
+ * Always returns 0.
+ */
+static int
+pci_ahci_resume(struct vmctx *ctx, struct pci_devinst *pi)
+{
+	struct pci_ahci_softc *softc = pi->pi_arg;
+	struct blockif_ctxt *backend;
+	int portno;
+
+	for (portno = 0; portno < MAX_PORTS; portno++) {
+		backend = softc->port[portno].bctx;
+		/* Unpopulated ports have no backend to resume. */
+		if (backend != NULL)
+			blockif_resume(backend);
+	}
+
+	return (0);
+}
+#endif
+
/*
* Use separate emulation names to distinguish drive and atapi devices
*/
@@ -2453,7 +2734,12 @@ struct pci_devemu pci_de_ahci = {
.pe_emu = "ahci",
.pe_init = pci_ahci_hd_init,
.pe_barwrite = pci_ahci_write,
- .pe_barread = pci_ahci_read
+ .pe_barread = pci_ahci_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_ahci_snapshot,
+ .pe_pause = pci_ahci_pause,
+ .pe_resume = pci_ahci_resume,
+#endif
};
PCI_EMUL_SET(pci_de_ahci);
@@ -2461,7 +2747,12 @@ struct pci_devemu pci_de_ahci_hd = {
.pe_emu = "ahci-hd",
.pe_init = pci_ahci_hd_init,
.pe_barwrite = pci_ahci_write,
- .pe_barread = pci_ahci_read
+ .pe_barread = pci_ahci_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_ahci_snapshot,
+ .pe_pause = pci_ahci_pause,
+ .pe_resume = pci_ahci_resume,
+#endif
};
PCI_EMUL_SET(pci_de_ahci_hd);
@@ -2469,6 +2760,11 @@ struct pci_devemu pci_de_ahci_cd = {
.pe_emu = "ahci-cd",
.pe_init = pci_ahci_atapi_init,
.pe_barwrite = pci_ahci_write,
- .pe_barread = pci_ahci_read
+ .pe_barread = pci_ahci_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_ahci_snapshot,
+ .pe_pause = pci_ahci_pause,
+ .pe_resume = pci_ahci_resume,
+#endif
};
PCI_EMUL_SET(pci_de_ahci_cd);
diff --git a/usr.sbin/bhyve/pci_e82545.c b/usr.sbin/bhyve/pci_e82545.c
index dca981be85fa..c1443b6aa613 100644
--- a/usr.sbin/bhyve/pci_e82545.c
+++ b/usr.sbin/bhyve/pci_e82545.c
@@ -46,6 +46,8 @@ __FBSDID("$FreeBSD$");
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
+#include <machine/vmm_snapshot.h>
+
#include <err.h>
#include <errno.h>
#include <fcntl.h>
@@ -2378,11 +2380,168 @@ e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
return (0);
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the state of an emulated e82545 network adapter.
+ *
+ * meta->dev_data holds the pci_devinst of the device.  The same sequence
+ * of snapshot macro invocations is used for both directions, so their
+ * order defines the snapshot layout and must not be changed.  Returns the
+ * value left in 'ret' by the snapshot macros.
+ */
+static int
+e82545_snapshot(struct vm_snapshot_meta *meta)
+{
+	int i;
+	int ret;
+	struct e82545_softc *sc;
+	struct pci_devinst *pi;
+	uint64_t bitmap_value;
+
+	pi = meta->dev_data;
+	sc = pi->pi_arg;
+
+	/* esc_mevp and esc_mevpitr should be reinitiated at init. */
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_mac, meta, ret, done);
+
+	/* General */
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_CTRL, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAL, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAH, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCT, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_VET, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCTTV, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_LEDCTL, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_PBA, meta, ret, done);
+
+	/* Interrupt control */
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_irq_asserted, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICR, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_ITR, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICS, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMS, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMC, meta, ret, done);
+
+	/*
+	 * Transmit
+	 *
+	 * The fields in the unions are in superposition to access certain
+	 * bytes in the larger uint variables.
+	 * e.g., ip_config = [ipcss|ipcso|ipcse0|ipcse1]
+	 */
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.lower_setup.ip_config, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.upper_setup.tcp_config, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.cmd_and_length, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.tcp_seg_setup.data, meta, ret, done);
+
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_enabled, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_active, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXCW, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_TCTL, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIPG, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_AIT, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_tdba, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAL, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAH, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDLEN, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDH, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDHr, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDT, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIDV, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXDCTL, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_TADV, meta, ret, done);
+
+	/* Has dependency on esc_TDLEN; reorder of fields from struct. */
+	SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_txdesc, sc->esc_TDLEN,
+	    true, meta, ret, done);
+
+	/* L2 frame acceptance */
+	for (i = 0; i < nitems(sc->esc_uni); i++) {
+		SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_valid, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_addrsel, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_eth, meta, ret, done);
+	}
+
+	SNAPSHOT_BUF_OR_LEAVE(sc->esc_fmcast, sizeof(sc->esc_fmcast),
+	    meta, ret, done);
+	SNAPSHOT_BUF_OR_LEAVE(sc->esc_fvlan, sizeof(sc->esc_fvlan),
+	    meta, ret, done);
+
+	/* Receive */
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_enabled, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_active, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_loopback, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_RCTL, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTL, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTH, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_rdba, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAL, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAH, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDLEN, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDH, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDT, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDTR, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXDCTL, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_RADV, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_RSRPD, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXCSUM, meta, ret, done);
+
+	/*
+	 * Has dependency on esc_RDLEN; reorder of fields from struct.
+	 * The receive ring is sized by esc_RDLEN, not by the transmit
+	 * ring length.
+	 */
+	SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_rxdesc, sc->esc_RDLEN,
+	    true, meta, ret, done);
+
+	/* IO Port register access */
+	SNAPSHOT_VAR_OR_LEAVE(sc->io_addr, meta, ret, done);
+
+	/* Shadow copy of MDIC */
+	SNAPSHOT_VAR_OR_LEAVE(sc->mdi_control, meta, ret, done);
+
+	/* Shadow copy of EECD */
+	SNAPSHOT_VAR_OR_LEAVE(sc->eeprom_control, meta, ret, done);
+
+	/* Latest NVM in/out */
+	SNAPSHOT_VAR_OR_LEAVE(sc->nvm_data, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->nvm_opaddr, meta, ret, done);
+
+	/* Stats */
+	SNAPSHOT_VAR_OR_LEAVE(sc->missed_pkt_count, meta, ret, done);
+	SNAPSHOT_BUF_OR_LEAVE(sc->pkt_rx_by_size, sizeof(sc->pkt_rx_by_size),
+	    meta, ret, done);
+	SNAPSHOT_BUF_OR_LEAVE(sc->pkt_tx_by_size, sizeof(sc->pkt_tx_by_size),
+	    meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_rx_count, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_rx_count, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_rx_count, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_tx_count, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_tx_count, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_tx_count, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->oversize_rx_count, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->tso_tx_count, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_rx, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_tx, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->missed_octets, meta, ret, done);
+
+	/*
+	 * The NVM state is copied through a plain integer in both
+	 * directions instead of being handed to the snapshot macro
+	 * directly (presumably because the fields are bit-fields).
+	 */
+	if (meta->op == VM_SNAPSHOT_SAVE)
+		bitmap_value = sc->nvm_bits;
+	SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done);
+	if (meta->op == VM_SNAPSHOT_RESTORE)
+		sc->nvm_bits = bitmap_value;
+
+	/* Second NVM state field: the mode, not nvm_bits a second time. */
+	if (meta->op == VM_SNAPSHOT_SAVE)
+		bitmap_value = sc->nvm_mode;
+	SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done);
+	if (meta->op == VM_SNAPSHOT_RESTORE)
+		sc->nvm_mode = bitmap_value;
+
+	/* EEPROM data */
+	SNAPSHOT_BUF_OR_LEAVE(sc->eeprom_data, sizeof(sc->eeprom_data),
+	    meta, ret, done);
+
+done:
+	return (ret);
+}
+#endif
+
struct pci_devemu pci_de_e82545 = {
.pe_emu = "e1000",
.pe_init = e82545_init,
.pe_barwrite = e82545_write,
- .pe_barread = e82545_read
+ .pe_barread = e82545_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = e82545_snapshot,
+#endif
};
PCI_EMUL_SET(pci_de_e82545);
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index 145b33b5ffd2..e4b83896241e 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <stdbool.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include <vmmapi.h>
#include "acpi.h"
@@ -1962,6 +1963,191 @@ INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Saves/restores PCI device emulated state. Returns 0 on success.
+ *
+ * meta->dev_data must point at the pci_devinst to process.  The MSI and
+ * MSI-X state, the config space bytes, the BAR descriptions and the MSI-X
+ * table entries are processed in that order; the order defines the
+ * snapshot layout and must not be changed.
+ */
+static int
+pci_snapshot_pci_dev(struct vm_snapshot_meta *meta)
+{
+ struct pci_devinst *pi;
+ int i;
+ int ret;
+
+ pi = meta->dev_data;
+
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_page_offset, meta, ret, done);
+
+ SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata),
+ meta, ret, done);
+
+ for (i = 0; i < nitems(pi->pi_bar); i++) {
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done);
+ }
+
+ /*
+ * Save/restore the MSI-X table.  The loop bound table_count was
+ * itself processed above, so on restore it comes from the snapshot.
+ */
+ for (i = 0; i < pi->pi_msix.table_count; i++) {
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr,
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data,
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control,
+ meta, ret, done);
+ }
+
+done:
+ return (ret);
+}
+
+/*
+ * Look up a configured PCI function by name.
+ *
+ * Walks every bus, slot and function and compares fi_name with dev_name.
+ * On the first match *pde is set to the emulation returned by
+ * pci_emul_finddev() and *pdi to the function's device instance, and 0 is
+ * returned.  EINVAL is returned when no function carries that name.
+ */
+static int
+pci_find_slotted_dev(const char *dev_name, struct pci_devemu **pde,
+    struct pci_devinst **pdi)
+{
+	struct businfo *bus_info;
+	struct funcinfo *func_info;
+	int b, s, f;
+
+	assert(dev_name != NULL);
+	assert(pde != NULL);
+	assert(pdi != NULL);
+
+	for (b = 0; b < MAXBUSES; b++) {
+		bus_info = pci_businfo[b];
+		if (bus_info == NULL)
+			continue;
+
+		for (s = 0; s < MAXSLOTS; s++) {
+			for (f = 0; f < MAXFUNCS; f++) {
+				func_info = &bus_info->slotinfo[s].si_funcs[f];
+
+				/* Skip empty functions and other names. */
+				if (func_info->fi_name == NULL ||
+				    strcmp(dev_name, func_info->fi_name) != 0)
+					continue;
+
+				*pde = pci_emul_finddev(func_info->fi_name);
+				assert(*pde != NULL);
+
+				*pdi = func_info->fi_devi;
+				return (0);
+			}
+		}
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Save or restore the PCI device named by meta->dev_name.
+ *
+ * If no such device is configured, the snapshot buffer is zeroed and 0 is
+ * returned.  Otherwise meta->dev_data is pointed at the device instance,
+ * the generic PCI state is processed and the device's own pe_snapshot
+ * callback is invoked; its result is returned.  -1 is returned when the
+ * emulation has no pe_snapshot callback or the generic PCI step fails.
+ */
+int
+pci_snapshot(struct vm_snapshot_meta *meta)
+{
+	struct pci_devemu *emu;
+	struct pci_devinst *inst;
+
+	assert(meta->dev_name != NULL);
+
+	if (pci_find_slotted_dev(meta->dev_name, &emu, &inst) != 0) {
+		fprintf(stderr, "%s: no such name: %s\r\n",
+		    __func__, meta->dev_name);
+		memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+		return (0);
+	}
+
+	/* The per-device callbacks find their instance through dev_data. */
+	meta->dev_data = inst;
+
+	if (emu->pe_snapshot == NULL) {
+		fprintf(stderr, "%s: not implemented yet for: %s\r\n",
+		    __func__, meta->dev_name);
+		return (-1);
+	}
+
+	if (pci_snapshot_pci_dev(meta) != 0) {
+		fprintf(stderr, "%s: failed to snapshot pci dev\r\n",
+		    __func__);
+		return (-1);
+	}
+
+	return ((*emu->pe_snapshot)(meta));
+}
+
+/*
+ * Pause the PCI device called dev_name.
+ *
+ * An unknown device name and an emulation without a pe_pause callback are
+ * both reported on stderr and treated as success (0).  Otherwise the
+ * result of the pe_pause callback is returned.
+ */
+int
+pci_pause(struct vmctx *ctx, const char *dev_name)
+{
+	struct pci_devemu *emu;
+	struct pci_devinst *inst;
+
+	assert(dev_name != NULL);
+
+	if (pci_find_slotted_dev(dev_name, &emu, &inst) != 0) {
+		/*
+		 * Callers may invoke this without first checking that the
+		 * device is actually configured.
+		 */
+		fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name);
+		return (0);
+	}
+
+	/* Pause/resume support is optional for an emulation. */
+	if (emu->pe_pause == NULL) {
+		fprintf(stderr, "%s: not implemented for: %s\n",
+		    __func__, dev_name);
+		return (0);
+	}
+
+	return ((*emu->pe_pause)(ctx, inst));
+}
+
+/*
+ * Resume the PCI device called dev_name.
+ *
+ * An unknown device name and an emulation without a pe_resume callback
+ * are both reported on stderr and treated as success (0).  Otherwise the
+ * result of the pe_resume callback is returned.
+ */
+int
+pci_resume(struct vmctx *ctx, const char *dev_name)
+{
+	struct pci_devemu *emu;
+	struct pci_devinst *inst;
+
+	assert(dev_name != NULL);
+
+	if (pci_find_slotted_dev(dev_name, &emu, &inst) != 0) {
+		/*
+		 * Callers may invoke this without first checking that the
+		 * device is actually configured.
+		 */
+		fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name);
+		return (0);
+	}
+
+	/* Pause/resume support is optional for an emulation. */
+	if (emu->pe_resume == NULL) {
+		fprintf(stderr, "%s: not implemented for: %s\n",
+		    __func__, dev_name);
+		return (0);
+	}
+
+	return ((*emu->pe_resume)(ctx, inst));
+}
+#endif
+
#define PCI_EMUL_TEST
#ifdef PCI_EMUL_TEST
/*
@@ -1970,7 +2156,7 @@ INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
#define DIOSZ 8
#define DMEMSZ 4096
struct pci_emul_dsoftc {
- uint8_t ioregs[DIOSZ];
+ uint8_t ioregs[DIOSZ];
uint8_t memregs[2][DMEMSZ];
};
@@ -2062,7 +2248,7 @@ pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
} else {
printf("diow: memw unknown size %d\n", size);
}
-
+
/*
* magic interrupt ??
*/
@@ -2087,7 +2273,7 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
offset, size);
return (0);
}
-
+
value = 0;
if (size == 1) {
value = sc->ioregs[offset];
@@ -2106,7 +2292,7 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
offset, size);
return (0);
}
-
+
i = baridx - 1; /* 'memregs' index */
if (size == 1) {
@@ -2131,11 +2317,23 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
return (value);
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Snapshot callback of the dummy test device: nothing is saved or
+ * restored, and success is always reported.
+ */
+int
+pci_emul_snapshot(struct vm_snapshot_meta *meta)
+{
+	(void)meta;
+	return (0);
+}
+#endif
+
struct pci_devemu pci_dummy = {
.pe_emu = "dummy",
.pe_init = pci_emul_dinit,
.pe_barwrite = pci_emul_diow,
- .pe_barread = pci_emul_dior
+ .pe_barread = pci_emul_dior,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_emul_snapshot,
+#endif
};
PCI_EMUL_SET(pci_dummy);
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
index fba2e8845af8..1cefa5ed042d 100644
--- a/usr.sbin/bhyve/pci_emul.h
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -45,6 +45,7 @@
struct vmctx;
struct pci_devinst;
struct memory_region;
+struct vm_snapshot_meta;
struct pci_devemu {
char *pe_emu; /* Name of device emulation */
@@ -71,6 +72,11 @@ struct pci_devemu {
uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int baridx,
uint64_t offset, int size);
+
+ /* Save/restore device state */
+ int (*pe_snapshot)(struct vm_snapshot_meta *meta);
+ int (*pe_pause)(struct vmctx *ctx, struct pci_devinst *pi);
+ int (*pe_resume)(struct vmctx *ctx, struct pci_devinst *pi);
};
#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x);
@@ -246,6 +252,11 @@ void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);
void pci_write_dsdt(void);
uint64_t pci_ecfg_base(void);
int pci_bus_configured(int bus);
+#ifdef BHYVE_SNAPSHOT
+int pci_snapshot(struct vm_snapshot_meta *meta);
+int pci_pause(struct vmctx *ctx, const char *dev_name);
+int pci_resume(struct vmctx *ctx, const char *dev_name);
+#endif
static __inline void
pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
diff --git a/usr.sbin/bhyve/pci_fbuf.c b/usr.sbin/bhyve/pci_fbuf.c
index 8961875356da..0bd740a0908c 100644
--- a/usr.sbin/bhyve/pci_fbuf.c
+++ b/usr.sbin/bhyve/pci_fbuf.c
@@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mman.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include <vmmapi.h>
#include <stdio.h>
@@ -440,10 +441,26 @@ done:
return (error);
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the framebuffer contents: FB_SIZE bytes starting at
+ * fbuf_sc->fb_base are passed to the snapshot buffer macro.  Returns the
+ * value left in 'ret' by that macro.
+ */
+static int
+pci_fbuf_snapshot(struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ SNAPSHOT_BUF_OR_LEAVE(fbuf_sc->fb_base, FB_SIZE, meta, ret, err);
+
+/* Both the success and the failure path end up here. */
+err:
+ return (ret);
+}
+#endif
+
struct pci_devemu pci_fbuf = {
.pe_emu = "fbuf",
.pe_init = pci_fbuf_init,
.pe_barwrite = pci_fbuf_write,
- .pe_barread = pci_fbuf_read
+ .pe_barread = pci_fbuf_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_fbuf_snapshot,
+#endif
};
PCI_EMUL_SET(pci_fbuf);
diff --git a/usr.sbin/bhyve/pci_lpc.c b/usr.sbin/bhyve/pci_lpc.c
index 1e4b513ec494..4ebdd7039cbc 100644
--- a/usr.sbin/bhyve/pci_lpc.c
+++ b/usr.sbin/bhyve/pci_lpc.c
@@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
#include <stdio.h>
#include <stdlib.h>
@@ -452,12 +453,35 @@ lpc_pirq_routed(void)
pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5));
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the state of the LPC bridge by calling uart_snapshot()
+ * for each of its LPC_UART_NUM UARTs in turn.  Stops at the first failure
+ * and returns its error code; returns 0 when every UART succeeded.
+ */
+static int
+pci_lpc_snapshot(struct vm_snapshot_meta *meta)
+{
+	int unit, ret;
+	struct uart_softc *sc;
+
+	/* Keep the result defined even if the loop body never runs. */
+	ret = 0;
+
+	for (unit = 0; unit < LPC_UART_NUM; unit++) {
+		sc = lpc_uart_softc[unit].uart_softc;
+
+		ret = uart_snapshot(sc, meta);
+		if (ret != 0)
+			break;
+	}
+
+	return (ret);
+}
+#endif
+
struct pci_devemu pci_de_lpc = {
.pe_emu = "lpc",
.pe_init = pci_lpc_init,
.pe_write_dsdt = pci_lpc_write_dsdt,
.pe_cfgwrite = pci_lpc_cfgwrite,
.pe_barwrite = pci_lpc_write,
- .pe_barread = pci_lpc_read
+ .pe_barread = pci_lpc_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_lpc_snapshot,
+#endif
};
PCI_EMUL_SET(pci_de_lpc);
diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c
index 04ff7718c333..4fd8943efffa 100644
--- a/usr.sbin/bhyve/pci_virtio_block.c
+++ b/usr.sbin/bhyve/pci_virtio_block.c
@@ -39,6 +39,8 @@ __FBSDID("$FreeBSD$");
#include <sys/ioctl.h>
#include <sys/disk.h>
+#include <machine/vmm_snapshot.h>
+
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
@@ -198,6 +200,11 @@ static void pci_vtblk_reset(void *);
static void pci_vtblk_notify(void *, struct vqueue_info *);
static int pci_vtblk_cfgread(void *, int, int, uint32_t *);
static int pci_vtblk_cfgwrite(void *, int, int, uint32_t);
+#ifdef BHYVE_SNAPSHOT
+static void pci_vtblk_pause(void *);
+static void pci_vtblk_resume(void *);
+static int pci_vtblk_snapshot(void *, struct vm_snapshot_meta *);
+#endif
static struct virtio_consts vtblk_vi_consts = {
"vtblk", /* our name */
@@ -209,6 +216,11 @@ static struct virtio_consts vtblk_vi_consts = {
pci_vtblk_cfgwrite, /* write PCI config */
NULL, /* apply negotiated features */
VTBLK_S_HOSTCAPS, /* our capabilities */
+#ifdef BHYVE_SNAPSHOT
+ pci_vtblk_pause, /* pause blockif threads */
+ pci_vtblk_resume, /* resume blockif threads */
+ pci_vtblk_snapshot, /* save / restore device state */
+#endif
};
static void
@@ -241,6 +253,40 @@ pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err)
vq_endchains(&sc->vbsc_vq, 0);
}
+#ifdef BHYVE_SNAPSHOT
+/* Pause hook for virtio-blk: forwards the request to blockif_pause(). */
+static void
+pci_vtblk_pause(void *vsc)
+{
+	struct pci_vtblk_softc *vtblk;
+
+	vtblk = vsc;
+
+	DPRINTF(("vtblk: device pause requested !\n"));
+	blockif_pause(vtblk->bc);
+}
+
+/* Resume hook for virtio-blk: forwards the request to blockif_resume(). */
+static void
+pci_vtblk_resume(void *vsc)
+{
+	struct pci_vtblk_softc *vtblk;
+
+	vtblk = vsc;
+
+	DPRINTF(("vtblk: device resume requested !\n"));
+	blockif_resume(vtblk->bc);
+}
+
+/*
+ * Device-specific snapshot hook for virtio-blk.  Passes the device config
+ * (vbsc_cfg) and then the identification buffer (vbsc_ident) through the
+ * snapshot macros; that order is the on-disk layout and must not change.
+ * Returns the value the macros leave in 'ret'.
+ */
+static int
+pci_vtblk_snapshot(void *vsc, struct vm_snapshot_meta *meta)
+{
+ int ret;
+ struct pci_vtblk_softc *sc = vsc;
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->vbsc_cfg, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(sc->vbsc_ident, sizeof(sc->vbsc_ident),
+ meta, ret, done);
+
+done:
+ return (ret);
+}
+#endif
+
static void
pci_vtblk_done(struct blockif_req *br, int err)
{
@@ -523,6 +569,9 @@ struct pci_devemu pci_de_vblk = {
.pe_emu = "virtio-blk",
.pe_init = pci_vtblk_init,
.pe_barwrite = vi_pci_write,
- .pe_barread = vi_pci_read
+ .pe_barread = vi_pci_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = vi_pci_snapshot,
+#endif
};
PCI_EMUL_SET(pci_de_vblk);
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
index adc273128585..a0fcd9055e65 100644
--- a/usr.sbin/bhyve/pci_virtio_net.c
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
+#include <machine/vmm_snapshot.h>
#include <net/ethernet.h>
#include <net/if.h> /* IFNAMSIZ */
@@ -134,6 +135,11 @@ static void pci_vtnet_reset(void *);
static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
static void pci_vtnet_neg_features(void *, uint64_t);
+#ifdef BHYVE_SNAPSHOT
+static void pci_vtnet_pause(void *);
+static void pci_vtnet_resume(void *);
+static int pci_vtnet_snapshot(void *, struct vm_snapshot_meta *);
+#endif
static struct virtio_consts vtnet_vi_consts = {
"vtnet", /* our name */
@@ -145,6 +151,11 @@ static struct virtio_consts vtnet_vi_consts = {
pci_vtnet_cfgwrite, /* write PCI config */
pci_vtnet_neg_features, /* apply negotiated features */
VTNET_S_HOSTCAPS, /* our capabilities */
+#ifdef BHYVE_SNAPSHOT
+ pci_vtnet_pause, /* pause rx/tx threads */
+ pci_vtnet_resume, /* resume rx/tx threads */
+ pci_vtnet_snapshot, /* save / restore device state */
+#endif
};
static void
@@ -740,10 +751,80 @@ pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
assert(sc->be_vhdrlen == 0 || sc->be_vhdrlen == sc->vhdrlen);
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Pause hook for virtio-net.  Returns with BOTH rx_mtx and tx_mtx held;
+ * pci_vtnet_resume() is the function that releases them.
+ */
+static void
+pci_vtnet_pause(void *vsc)
+{
+	struct pci_vtnet_softc *vtnet = vsc;
+
+	DPRINTF(("vtnet: device pause requested !\n"));
+
+	/* Holding the RX lock keeps RX processing blocked while paused. */
+	pthread_mutex_lock(&vtnet->rx_mtx);
+
+	/*
+	 * Poll, with the TX lock dropped while sleeping, until no transmit
+	 * is in progress.  The loop exits with the TX lock held.
+	 */
+	for (;;) {
+		pthread_mutex_lock(&vtnet->tx_mtx);
+		if (!vtnet->tx_in_progress)
+			break;
+		pthread_mutex_unlock(&vtnet->tx_mtx);
+		usleep(10000);
+	}
+}
+
+/*
+ * Resume hook for virtio-net: drops the TX lock and then the RX lock,
+ * both of which pci_vtnet_pause() left held.
+ */
+static void
+pci_vtnet_resume(void *vsc)
+{
+	struct pci_vtnet_softc *vtnet;
+
+	vtnet = vsc;
+
+	DPRINTF(("vtnet: device resume requested !\n"));
+
+	pthread_mutex_unlock(&vtnet->tx_mtx);
+	/* Taken at the start of pci_vtnet_pause(). */
+	pthread_mutex_unlock(&vtnet->rx_mtx);
+}
+
+/*
+ * Device-specific snapshot hook for virtio-net.  Fields are passed to the
+ * snapshot macros in a fixed order (features, config, rx_merge, vhdrlen,
+ * be_vhdrlen); that order is the on-disk layout.  Returns the value the
+ * macros leave in 'ret'.
+ */
+static int
+pci_vtnet_snapshot(void *vsc, struct vm_snapshot_meta *meta)
+{
+ int ret;
+ struct pci_vtnet_softc *sc = vsc;
+
+ DPRINTF(("vtnet: device snapshot requested !\n"));
+
+ /*
+ * Queues and consts should have been saved by the more generic
+ * vi_pci_snapshot function. We need to save only our features and
+ * config.
+ */
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->vsc_features, meta, ret, done);
+
+ /*
+ * Force re-applying the negotiated features at restore time, and
+ * re-enable RX on the backend.  Note this runs before the remaining
+ * fields below have been restored.
+ */
+ if (meta->op == VM_SNAPSHOT_RESTORE) {
+ pci_vtnet_neg_features(sc, sc->vsc_features);
+ netbe_rx_enable(sc->vsc_be);
+ }
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->vsc_config, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rx_merge, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->vhdrlen, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->be_vhdrlen, meta, ret, done);
+
+done:
+ return (ret);
+}
+#endif
+
+/*
+ * PCI device-emulation descriptor for "virtio-net".  With BHYVE_SNAPSHOT
+ * defined it also registers the generic virtio snapshot, pause and resume
+ * entry points (vi_pci_snapshot / vi_pci_pause / vi_pci_resume).
+ */
static struct pci_devemu pci_de_vnet = {
.pe_emu = "virtio-net",
.pe_init = pci_vtnet_init,
.pe_barwrite = vi_pci_write,
- .pe_barread = vi_pci_read
+ .pe_barread = vi_pci_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = vi_pci_snapshot,
+ .pe_pause = vi_pci_pause,
+ .pe_resume = vi_pci_resume,
+#endif
};
PCI_EMUL_SET(pci_de_vnet);
diff --git a/usr.sbin/bhyve/pci_xhci.c b/usr.sbin/bhyve/pci_xhci.c
index 672f35c91ef8..0847d5bb38b5 100644
--- a/usr.sbin/bhyve/pci_xhci.c
+++ b/usr.sbin/bhyve/pci_xhci.c
@@ -48,6 +48,8 @@ __FBSDID("$FreeBSD$");
#include <pthread.h>
#include <unistd.h>
+#include <machine/vmm_snapshot.h>
+
#include <dev/usb/usbdi.h>
#include <dev/usb/usb.h>
#include <dev/usb/usb_freebsd.h>
@@ -151,6 +153,8 @@ static int xhci_debug = 0;
#define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \
(((b) & ((m) << (s)))))
+#define SNAP_DEV_NAME_LEN 128
+
struct pci_xhci_trb_ring {
uint64_t ringaddr; /* current dequeue guest address */
uint32_t ccs; /* consumer cycle state */
@@ -286,9 +290,10 @@ struct pci_xhci_softc {
#define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH)
+#define XHCI_GADDR_SIZE(a) (XHCI_PADDR_SZ - \
+ (((uint64_t) (a)) & (XHCI_PADDR_SZ - 1)))
#define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \
- (a), \
- XHCI_PADDR_SZ - ((a) & (XHCI_PADDR_SZ-1)))
+ (a), XHCI_GADDR_SIZE(a))
static int xhci_in_use;
@@ -2855,12 +2860,265 @@ done:
return (error);
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Build the slot -> device-instance map used when saving a snapshot.
+ *
+ * 'maps' must have room for XHCI_MAX_SLOTS + 1 entries and is indexed by
+ * slot number (1..XHCI_MAX_SLOTS); entry 0 is unused.  maps[i] receives
+ * the index j (1..XHCI_MAX_DEVS) of the last device instance whose pointer
+ * equals the pointer stored in slot i, and stays 0 when none matches.
+ */
+static void
+pci_xhci_map_devs_slots(struct pci_xhci_softc *sc, int maps[])
+{
+	int i, j;
+	struct pci_xhci_dev_emu *dev, *slot;
+
+	/*
+	 * Indices 0..XHCI_MAX_SLOTS are in use, so XHCI_MAX_SLOTS + 1
+	 * entries have to be cleared; clearing only XHCI_MAX_SLOTS would
+	 * leave the last slot's entry uninitialized.
+	 */
+	memset(maps, 0, sizeof(maps[0]) * (XHCI_MAX_SLOTS + 1));
+
+	for (i = 1; i <= XHCI_MAX_SLOTS; i++) {
+		/* The slot pointer does not depend on j; fetch it once. */
+		slot = XHCI_SLOTDEV_PTR(sc, i);
+
+		for (j = 1; j <= XHCI_MAX_DEVS; j++) {
+			dev = XHCI_DEVINST_PTR(sc, j);
+
+			if (slot == dev)
+				maps[i] = j;
+		}
+	}
+}
+/*
+ * Save or restore the transfer state of endpoint 'idx' of device 'dev'.
+ *
+ * The value of the ep_xfer pointer is written to the snapshot and used on
+ * restore only as a "was there a transfer" flag: when it is non-NULL the
+ * endpoint is re-initialised with pci_xhci_init_ep() and the freshly
+ * created transfer is filled in.  'sc' is not used.
+ *
+ * Returns 0 on success (including "no transfer for this endpoint"),
+ * ENOMEM when the request buffer cannot be allocated on restore, or the
+ * value the snapshot macros leave in 'ret'.
+ */
+static int
+pci_xhci_snapshot_ep(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev,
+    int idx, struct vm_snapshot_meta *meta)
+{
+	int k;
+	int ret;
+	struct usb_data_xfer *xfer;
+	struct usb_data_xfer_block *xfer_block;
+
+	/* Only meaningful on save; on restore it is read from the snapshot. */
+	xfer = NULL;
+	if (meta->op == VM_SNAPSHOT_SAVE)
+		xfer = dev->eps[idx].ep_xfer;
+
+	SNAPSHOT_VAR_OR_LEAVE(xfer, meta, ret, done);
+	if (xfer == NULL) {
+		ret = 0;
+		goto done;
+	}
+
+	if (meta->op == VM_SNAPSHOT_RESTORE) {
+		pci_xhci_init_ep(dev, idx);
+		xfer = dev->eps[idx].ep_xfer;
+	}
+
+	/* save / restore proper */
+	for (k = 0; k < USB_MAX_XFER_BLOCKS; k++) {
+		xfer_block = &xfer->data[k];
+
+		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(xfer_block->buf,
+		    XHCI_GADDR_SIZE(xfer_block->buf), true, meta, ret,
+		    done);
+		SNAPSHOT_VAR_OR_LEAVE(xfer_block->blen, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(xfer_block->bdone, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(xfer_block->processed, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(xfer_block->hci_data, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(xfer_block->ccs, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(xfer_block->streamid, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(xfer_block->trbnext, meta, ret, done);
+	}
+
+	SNAPSHOT_VAR_OR_LEAVE(xfer->ureq, meta, ret, done);
+	if (xfer->ureq) {
+		/* xfer->ureq is not allocated at restore time */
+		if (meta->op == VM_SNAPSHOT_RESTORE) {
+			xfer->ureq = malloc(sizeof(struct usb_device_request));
+			if (xfer->ureq == NULL) {
+				/* Never copy snapshot data through NULL. */
+				ret = ENOMEM;
+				goto done;
+			}
+		}
+
+		SNAPSHOT_BUF_OR_LEAVE(xfer->ureq,
+		    sizeof(struct usb_device_request),
+		    meta, ret, done);
+	}
+
+	SNAPSHOT_VAR_OR_LEAVE(xfer->ndata, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(xfer->head, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(xfer->tail, meta, ret, done);
+
+done:
+	return (ret);
+}
+
+/*
+ * Save or restore the state of the xHCI controller reached through
+ * meta->dev_data: capability/operational/runtime registers, the port
+ * registers of every configured device, and per-slot device state
+ * (device context address, endpoints, slot state, the USB device model's
+ * own state and the HCI address/port).
+ *
+ * The order of the snapshot macro calls is the on-disk layout and must not
+ * be changed.  On restore the set of configured devices has to match the
+ * snapshot: index and emulation name are compared for each device.
+ *
+ * Returns 0 on success, EINVAL on a mismatch or malformed slot map, or the
+ * value the snapshot macros leave in 'ret'.
+ */
+static int
+pci_xhci_snapshot(struct vm_snapshot_meta *meta)
+{
+	int i, j;
+	int ret;
+	int restore_idx;
+	struct pci_devinst *pi;
+	struct pci_xhci_softc *sc;
+	struct pci_xhci_portregs *port;
+	struct pci_xhci_dev_emu *dev;
+	char dname[SNAP_DEV_NAME_LEN];
+	int maps[XHCI_MAX_SLOTS + 1];
+
+	pi = meta->dev_data;
+	sc = pi->pi_arg;
+
+	/* Never let an uninitialized entry reach the snapshot file. */
+	memset(maps, 0, sizeof(maps));
+
+	SNAPSHOT_VAR_OR_LEAVE(sc->caplength, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams1, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams2, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams3, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->hccparams1, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->dboff, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsoff, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->hccparams2, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->regsend, meta, ret, done);
+
+	/* opregs */
+	SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbcmd, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbsts, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->opregs.pgsz, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dnctrl, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->opregs.crcr, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dcbaap, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->opregs.config, meta, ret, done);
+
+	/* opregs.cr_p */
+	SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.cr_p,
+	    XHCI_GADDR_SIZE(sc->opregs.cr_p), false, meta, ret, done);
+
+	/* opregs.dcbaa_p */
+	SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.dcbaa_p,
+	    XHCI_GADDR_SIZE(sc->opregs.dcbaa_p), false, meta, ret, done);
+
+	/* rtsregs */
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.mfindex, meta, ret, done);
+
+	/* rtsregs.intrreg */
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.iman, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.imod, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstsz, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.rsvd, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstba, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erdp, meta, ret, done);
+
+	/* rtsregs.erstba_p */
+	SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erstba_p,
+	    XHCI_GADDR_SIZE(sc->rtsregs.erstba_p), false, meta, ret, done);
+
+	/* rtsregs.erst_p */
+	SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erst_p,
+	    XHCI_GADDR_SIZE(sc->rtsregs.erst_p), false, meta, ret, done);
+
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_deq_seg, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_idx, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_seg, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_events_cnt, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.event_pcs, meta, ret, done);
+
+	/* sanity checking */
+	for (i = 1; i <= XHCI_MAX_DEVS; i++) {
+		dev = XHCI_DEVINST_PTR(sc, i);
+		if (dev == NULL)
+			continue;
+
+		if (meta->op == VM_SNAPSHOT_SAVE)
+			restore_idx = i;
+		SNAPSHOT_VAR_OR_LEAVE(restore_idx, meta, ret, done);
+
+		/* check if the restored device (when restoring) is sane */
+		if (restore_idx != i) {
+			fprintf(stderr, "%s: idx not matching: actual: %d, "
+			    "expected: %d\r\n", __func__, restore_idx, i);
+			ret = EINVAL;
+			goto done;
+		}
+
+		if (meta->op == VM_SNAPSHOT_SAVE) {
+			memset(dname, 0, sizeof(dname));
+			strncpy(dname, dev->dev_ue->ue_emu, sizeof(dname) - 1);
+		}
+
+		SNAPSHOT_BUF_OR_LEAVE(dname, sizeof(dname), meta, ret, done);
+
+		if (meta->op == VM_SNAPSHOT_RESTORE) {
+			dname[sizeof(dname) - 1] = '\0';
+			if (strcmp(dev->dev_ue->ue_emu, dname)) {
+				fprintf(stderr, "%s: device names mismatch: "
+				    "actual: %s, expected: %s\r\n",
+				    __func__, dname, dev->dev_ue->ue_emu);
+
+				ret = EINVAL;
+				goto done;
+			}
+		}
+	}
+
+	/* portregs */
+	for (i = 1; i <= XHCI_MAX_DEVS; i++) {
+		port = XHCI_PORTREG_PTR(sc, i);
+		dev = XHCI_DEVINST_PTR(sc, i);
+
+		if (dev == NULL)
+			continue;
+
+		SNAPSHOT_VAR_OR_LEAVE(port->portsc, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(port->portpmsc, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(port->portli, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(port->porthlpmc, meta, ret, done);
+	}
+
+	/* slots */
+	if (meta->op == VM_SNAPSHOT_SAVE)
+		pci_xhci_map_devs_slots(sc, maps);
+
+	for (i = 1; i <= XHCI_MAX_SLOTS; i++) {
+		SNAPSHOT_VAR_OR_LEAVE(maps[i], meta, ret, done);
+
+		if (meta->op == VM_SNAPSHOT_SAVE) {
+			dev = XHCI_SLOTDEV_PTR(sc, i);
+		} else if (meta->op == VM_SNAPSHOT_RESTORE) {
+			/*
+			 * maps[i] comes from the snapshot file; reject
+			 * anything outside 0..XHCI_MAX_DEVS before it is
+			 * used to index the device table.
+			 */
+			if (maps[i] < 0 || maps[i] > XHCI_MAX_DEVS) {
+				fprintf(stderr, "%s: invalid device index %d "
+				    "for slot %d\r\n", __func__, maps[i], i);
+				ret = EINVAL;
+				goto done;
+			}
+
+			if (maps[i] != 0)
+				dev = XHCI_DEVINST_PTR(sc, maps[i]);
+			else
+				dev = NULL;
+
+			XHCI_SLOTDEV_PTR(sc, i) = dev;
+		} else {
+			/* error */
+			ret = EINVAL;
+			goto done;
+		}
+
+		if (dev == NULL)
+			continue;
+
+		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(dev->dev_ctx,
+		    XHCI_GADDR_SIZE(dev->dev_ctx), false, meta, ret, done);
+
+		for (j = 1; j < XHCI_MAX_ENDPOINTS; j++) {
+			ret = pci_xhci_snapshot_ep(sc, dev, j, meta);
+			if (ret != 0)
+				goto done;
+		}
+
+		SNAPSHOT_VAR_OR_LEAVE(dev->dev_slotstate, meta, ret, done);
+
+		/*
+		 * devices[i]->dev_sc
+		 * NOTE(review): the result of ue_snapshot() is discarded
+		 * here - confirm whether it can fail and should be checked.
+		 */
+		dev->dev_ue->ue_snapshot(dev->dev_sc, meta);
+
+		/* devices[i]->hci */
+		SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_address, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_port, meta, ret, done);
+	}
+
+	SNAPSHOT_VAR_OR_LEAVE(sc->ndevices, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->usb2_port_start, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->usb3_port_start, meta, ret, done);
+
+done:
+	return (ret);
+}
+#endif
+/*
+ * PCI device-emulation descriptor for the "xhci" device.  The snapshot
+ * callback is registered only when BHYVE_SNAPSHOT is defined.
+ */
struct pci_devemu pci_de_xhci = {
.pe_emu = "xhci",
.pe_init = pci_xhci_init,
.pe_barwrite = pci_xhci_write,
- .pe_barread = pci_xhci_read
+ .pe_barread = pci_xhci_read,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_xhci_snapshot,
+#endif
};
PCI_EMUL_SET(pci_de_xhci);
diff --git a/usr.sbin/bhyve/ps2kbd.c b/usr.sbin/bhyve/ps2kbd.c
index 3e6a1b67ca38..ef20fa47e0a9 100644
--- a/usr.sbin/bhyve/ps2kbd.c
+++ b/usr.sbin/bhyve/ps2kbd.c
@@ -32,10 +32,13 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
+#include <machine/vmm_snapshot.h>
+
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include <strings.h>
#include <pthread.h>
#include <pthread_np.h>
@@ -382,3 +385,17 @@ ps2kbd_init(struct atkbdc_softc *atkbdc_sc)
return (sc);
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Snapshot hook for the PS/2 keyboard: passes 'enabled' and then 'curcmd'
+ * through the snapshot macros.  Only these two fields are handled here.
+ * Returns the value the macros leave in 'ret'.
+ */
+int
+ps2kbd_snapshot(struct ps2kbd_softc *sc, struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->enabled, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done);
+
+done:
+ return (ret);
+}
+#endif
+
diff --git a/usr.sbin/bhyve/ps2kbd.h b/usr.sbin/bhyve/ps2kbd.h
index 17be6d046673..3cf87be1b7f3 100644
--- a/usr.sbin/bhyve/ps2kbd.h
+++ b/usr.sbin/bhyve/ps2kbd.h
@@ -32,10 +32,15 @@
#define _PS2KBD_H_
struct atkbdc_softc;
+struct vm_snapshot_meta;
struct ps2kbd_softc *ps2kbd_init(struct atkbdc_softc *sc);
int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val);
void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val);
+#ifdef BHYVE_SNAPSHOT
+int ps2kbd_snapshot(struct ps2kbd_softc *sc, struct vm_snapshot_meta *meta);
+#endif
+
#endif /* _PS2KBD_H_ */
diff --git a/usr.sbin/bhyve/ps2mouse.c b/usr.sbin/bhyve/ps2mouse.c
index f42d2e726023..afe817710f30 100644
--- a/usr.sbin/bhyve/ps2mouse.c
+++ b/usr.sbin/bhyve/ps2mouse.c
@@ -32,10 +32,13 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
+#include <machine/vmm_snapshot.h>
+
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include <strings.h>
#include <pthread.h>
#include <pthread_np.h>
@@ -416,4 +419,23 @@ ps2mouse_init(struct atkbdc_softc *atkbdc_sc)
return (sc);
}
-
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Snapshot hook for the PS/2 mouse: passes the status, resolution,
+ * sampling rate, control-enable flag, current command, and the cursor
+ * position and deltas through the snapshot macros, in that order.
+ * Returns the value the macros leave in 'ret'.
+ */
+int
+ps2mouse_snapshot(struct ps2mouse_softc *sc, struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->status, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->resolution, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->sampling_rate, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->ctrlenable, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->cur_x, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->cur_y, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->delta_x, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->delta_y, meta, ret, done);
+
+done:
+ return (ret);
+}
+#endif
diff --git a/usr.sbin/bhyve/ps2mouse.h b/usr.sbin/bhyve/ps2mouse.h
index 59430b01e2b1..4ae755ef4411 100644
--- a/usr.sbin/bhyve/ps2mouse.h
+++ b/usr.sbin/bhyve/ps2mouse.h
@@ -32,6 +32,7 @@
#define _PS2MOUSE_H_
struct atkbdc_softc;
+struct vm_snapshot_meta;
struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc);
@@ -40,4 +41,8 @@ void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert);
void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable);
int ps2mouse_fifocnt(struct ps2mouse_softc *sc);
+#ifdef BHYVE_SNAPSHOT
+int ps2mouse_snapshot(struct ps2mouse_softc *sc, struct vm_snapshot_meta *meta);
+#endif
+
#endif /* _PS2MOUSE_H_ */
diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c
new file mode 100644
index 000000000000..22bfd8d28a61
--- /dev/null
+++ b/usr.sbin/bhyve/snapshot.c
@@ -0,0 +1,1742 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Flavius Anton
+ * Copyright (c) 2016 Mihai Tiganus
+ * Copyright (c) 2016-2019 Mihai Carabas
+ * Copyright (c) 2017-2019 Darius Mihai
+ * Copyright (c) 2017-2019 Elena Mihailescu
+ * Copyright (c) 2018-2019 Sergiu Weisz
+ * All rights reserved.
+ * The bhyve-snapshot feature was developed under sponsorships
+ * from Matthew Grooms.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/un.h>
+
+#include <machine/atomic.h>
+#include <machine/segments.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <signal.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <sysexits.h>
+#include <stdbool.h>
+#include <sys/ioctl.h>
+
+#include <machine/vmm.h>
+#ifndef WITHOUT_CAPSICUM
+#include <machine/vmm_dev.h>
+#endif
+#include <machine/vmm_snapshot.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "acpi.h"
+#include "atkbdc.h"
+#include "inout.h"
+#include "dbgport.h"
+#include "fwctl.h"
+#include "ioapic.h"
+#include "mem.h"
+#include "mevent.h"
+#include "mptbl.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+#include "smbiostbl.h"
+#include "snapshot.h"
+#include "xmsr.h"
+#include "spinup_ap.h"
+#include "rtc.h"
+
+#include <libxo/xo.h>
+#include <ucl.h>
+
+/*
+ * Argument block for the progress-spinner thread (snapshot_spinner_cb).
+ */
+struct spinner_info {
+ const size_t *crtval; /* current value; re-read through the pointer */
+ const size_t maxval;
+ const size_t total; /* passed to print_progress() as the maximum */
+};
+
+extern int guest_ncpus;
+
+static struct winsize winsize;
+static sig_t old_winch_handler;
+
+#define KB (1024UL)
+#define MB (1024UL * KB)
+#define GB (1024UL * MB)
+
+#define SNAPSHOT_CHUNK (4 * MB)
+#define PROG_BUF_SZ (8192)
+
+#define BHYVE_RUN_DIR "/var/run/bhyve"
+#define CHECKPOINT_RUN_DIR BHYVE_RUN_DIR "/checkpoint"
+#define MAX_VMNAME 100
+
+#define MAX_MSG_SIZE 1024
+
+#define SNAPSHOT_BUFFER_SIZE (20 * MB)
+
+#define JSON_STRUCT_ARR_KEY "structs"
+#define JSON_DEV_ARR_KEY "devices"
+#define JSON_BASIC_METADATA_KEY "basic metadata"
+#define JSON_SNAPSHOT_REQ_KEY "snapshot_req"
+#define JSON_SIZE_KEY "size"
+#define JSON_FILE_OFFSET_KEY "file_offset"
+
+#define JSON_NCPUS_KEY "ncpus"
+#define JSON_VMNAME_KEY "vmname"
+#define JSON_MEMSIZE_KEY "memsize"
+#define JSON_MEMFLAGS_KEY "memflags"
+
+/*
+ * Minimum of two values.  Each argument is evaluated exactly once; relies
+ * on the GNU statement-expression and __typeof__ extensions.
+ */
+#define min(a,b) \
+({ \
+ __typeof__ (a) _a = (a); \
+ __typeof__ (b) _b = (b); \
+ _a < _b ? _a : _b; \
+ })
+
+/*
+ * Table of device models that take part in snapshots.  Each entry gives
+ * the device name followed by three callbacks - presumably snapshot,
+ * pause and resume, judging by the values used (TODO confirm against
+ * struct vm_snapshot_dev_info).  NULL means the device has no such hook.
+ */
+const struct vm_snapshot_dev_info snapshot_devs[] = {
+ { "atkbdc", atkbdc_snapshot, NULL, NULL },
+ { "virtio-net", pci_snapshot, pci_pause, pci_resume },
+ { "virtio-blk", pci_snapshot, pci_pause, pci_resume },
+ { "lpc", pci_snapshot, NULL, NULL },
+ { "fbuf", pci_snapshot, NULL, NULL },
+ { "xhci", pci_snapshot, NULL, NULL },
+ { "e1000", pci_snapshot, NULL, NULL },
+ { "ahci", pci_snapshot, pci_pause, pci_resume },
+ { "ahci-hd", pci_snapshot, pci_pause, pci_resume },
+ { "ahci-cd", pci_snapshot, NULL, NULL },
+};
+
+/*
+ * Table pairing a printable name with the STRUCT_* request identifier of
+ * each kernel-side structure handled by the snapshot code.
+ */
+const struct vm_snapshot_kern_info snapshot_kern_structs[] = {
+ { "vhpet", STRUCT_VHPET },
+ { "vm", STRUCT_VM },
+ { "vmx", STRUCT_VMX },
+ { "vioapic", STRUCT_VIOAPIC },
+ { "vlapic", STRUCT_VLAPIC },
+ { "vmcx", STRUCT_VMCX },
+ { "vatpit", STRUCT_VATPIT },
+ { "vatpic", STRUCT_VATPIC },
+ { "vpmtmr", STRUCT_VPMTMR },
+ { "vrtc", STRUCT_VRTC },
+};
+
+static cpuset_t vcpus_active, vcpus_suspended;
+static pthread_mutex_t vcpu_lock;
+static pthread_cond_t vcpus_idle, vcpus_can_run;
+static bool checkpoint_active;
+
+/*
+ * Return a newly allocated string holding 'base_str' followed by 'ext'.
+ *
+ * Both lengths are measured with strnlen(..., MAX_VMNAME); if their sum
+ * exceeds MAX_VMNAME, or the allocation fails, a message is printed and
+ * NULL is returned.  The caller owns (and must free) the result.
+ *
+ * TODO: Harden this function and all of its callers since 'base_str' is a
+ * user provided string.
+ */
+static char *
+strcat_extension(const char *base_str, const char *ext)
+{
+	size_t blen = strnlen(base_str, MAX_VMNAME);
+	size_t elen = strnlen(ext, MAX_VMNAME);
+	size_t total = blen + elen;
+	char *out;
+
+	if (total > MAX_VMNAME) {
+		fprintf(stderr, "Filename exceeds maximum length.\n");
+		return (NULL);
+	}
+
+	out = malloc(total + 1);
+	if (out == NULL) {
+		perror("Failed to allocate memory.");
+		return (NULL);
+	}
+
+	memcpy(out, base_str, blen);
+	memcpy(out + blen, ext, elen);
+	out[total] = 0;
+
+	return (out);
+}
+
+/*
+ * Release everything a restore_state holds: the kernel-data mapping, the
+ * two file descriptors (only when > 0), and the UCL root object and
+ * parser.  The structure itself is not freed and its fields are left
+ * untouched.  A NULL argument is reported and otherwise ignored.
+ */
+void
+destroy_restore_state(struct restore_state *rstate)
+{
+	if (rstate == NULL) {
+		fprintf(stderr, "Attempting to destroy NULL restore struct.\n");
+		return;
+	}
+
+	/* Kernel-data mapping. */
+	if (rstate->kdata_map != MAP_FAILED) {
+		munmap(rstate->kdata_map, rstate->kdata_len);
+	}
+
+	/* Backing files. */
+	if (rstate->kdata_fd > 0) {
+		close(rstate->kdata_fd);
+	}
+	if (rstate->vmmem_fd > 0) {
+		close(rstate->vmmem_fd);
+	}
+
+	/* Parsed metadata. */
+	if (rstate->meta_root_obj != NULL) {
+		ucl_object_unref(rstate->meta_root_obj);
+	}
+	if (rstate->meta_parser != NULL) {
+		ucl_parser_free(rstate->meta_parser);
+	}
+}
+
+/*
+ * Open the guest RAM file read-only and record its descriptor and size in
+ * 'rstate' (vmmem_fd, vmmem_len).  An empty file is rejected.
+ *
+ * Returns 0 on success and -1 on failure.  On failure no descriptor is
+ * left behind in rstate->vmmem_fd, so a later destroy_restore_state()
+ * will not close it a second time.
+ */
+static int
+load_vmmem_file(const char *filename, struct restore_state *rstate)
+{
+	struct stat sb;
+	int err;
+
+	rstate->vmmem_fd = open(filename, O_RDONLY);
+	if (rstate->vmmem_fd < 0) {
+		perror("Failed to open restore file");
+		return (-1);
+	}
+
+	err = fstat(rstate->vmmem_fd, &sb);
+	if (err < 0) {
+		perror("Failed to stat restore file");
+		goto err_load_vmmem;
+	}
+
+	if (sb.st_size == 0) {
+		fprintf(stderr, "Restore file is empty.\n");
+		goto err_load_vmmem;
+	}
+
+	rstate->vmmem_len = sb.st_size;
+
+	return (0);
+
+err_load_vmmem:
+	/*
+	 * The descriptor is known to be open here.  Invalidate it after
+	 * closing: destroy_restore_state() closes any descriptor > 0.
+	 */
+	close(rstate->vmmem_fd);
+	rstate->vmmem_fd = -1;
+	return (-1);
+}
+
+/*
+ * Open the kernel/device state file read-only, map it (PROT_READ,
+ * MAP_SHARED) and record descriptor, length and mapping in 'rstate'
+ * (kdata_fd, kdata_len, kdata_map).  An empty file is rejected.
+ *
+ * Returns 0 on success and -1 on failure.  On failure no descriptor is
+ * left behind in rstate->kdata_fd, so a later destroy_restore_state()
+ * will not close it a second time.
+ */
+static int
+load_kdata_file(const char *filename, struct restore_state *rstate)
+{
+	struct stat sb;
+	int err;
+
+	rstate->kdata_fd = open(filename, O_RDONLY);
+	if (rstate->kdata_fd < 0) {
+		perror("Failed to open kernel data file");
+		return (-1);
+	}
+
+	err = fstat(rstate->kdata_fd, &sb);
+	if (err < 0) {
+		perror("Failed to stat kernel data file");
+		goto err_load_kdata;
+	}
+
+	if (sb.st_size == 0) {
+		fprintf(stderr, "Kernel data file is empty.\n");
+		goto err_load_kdata;
+	}
+
+	rstate->kdata_len = sb.st_size;
+	rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ,
+	    MAP_SHARED, rstate->kdata_fd, 0);
+	if (rstate->kdata_map == MAP_FAILED) {
+		perror("Failed to map restore file");
+		goto err_load_kdata;
+	}
+
+	return (0);
+
+err_load_kdata:
+	/*
+	 * The descriptor is known to be open here.  Invalidate it after
+	 * closing: destroy_restore_state() closes any descriptor > 0.
+	 */
+	close(rstate->kdata_fd);
+	rstate->kdata_fd = -1;
+	return (-1);
+}
+
+/*
+ * Parse the snapshot metadata file with libucl and store the parser and
+ * its root object in 'rstate' (meta_parser, meta_root_obj).
+ *
+ * Returns 0 on success.  On any failure the parser is freed, 'rstate' is
+ * left unmodified and -1 is returned.
+ */
+static int
+load_metadata_file(const char *filename, struct restore_state *rstate)
+{
+	const ucl_object_t *obj;
+	struct ucl_parser *parser;
+
+	parser = ucl_parser_new(UCL_PARSER_DEFAULT);
+	if (parser == NULL) {
+		/* Nothing to free yet; return a defined error value. */
+		fprintf(stderr, "Failed to initialize UCL parser.\n");
+		return (-1);
+	}
+
+	if (!ucl_parser_add_file(parser, filename)) {
+		fprintf(stderr, "Failed to parse metadata file: '%s'\n",
+		    filename);
+		goto err_load_metadata;
+	}
+
+	obj = ucl_parser_get_object(parser);
+	if (obj == NULL) {
+		fprintf(stderr, "Failed to parse object.\n");
+		goto err_load_metadata;
+	}
+
+	rstate->meta_parser = parser;
+	rstate->meta_root_obj = (ucl_object_t *)obj;
+
+	return (0);
+
+err_load_metadata:
+	ucl_parser_free(parser);
+	return (-1);
+}
+
+/*
+ * Load the three files that make up a snapshot into 'rstate': the guest
+ * RAM file 'filename', the state file 'filename.kern' and the metadata
+ * file 'filename.meta'.
+ *
+ * 'rstate' is reset before anything is loaded.  Returns 0 on success; on
+ * failure everything loaded so far is released with
+ * destroy_restore_state() and -1 is returned.  The temporary file names
+ * are freed on every path.
+ */
+int
+load_restore_file(const char *filename, struct restore_state *rstate)
+{
+	int err = -1;
+	char *kdata_filename = NULL, *meta_filename = NULL;
+
+	assert(filename != NULL);
+	assert(rstate != NULL);
+
+	memset(rstate, 0, sizeof(*rstate));
+	rstate->kdata_map = MAP_FAILED;
+
+	if (load_vmmem_file(filename, rstate) != 0) {
+		fprintf(stderr, "Failed to load guest RAM file.\n");
+		goto out;
+	}
+
+	kdata_filename = strcat_extension(filename, ".kern");
+	if (kdata_filename == NULL) {
+		fprintf(stderr, "Failed to construct kernel data filename.\n");
+		goto out;
+	}
+
+	if (load_kdata_file(kdata_filename, rstate) != 0) {
+		fprintf(stderr, "Failed to load guest kernel data file.\n");
+		goto out;
+	}
+
+	meta_filename = strcat_extension(filename, ".meta");
+	if (meta_filename == NULL) {
+		fprintf(stderr, "Failed to construct kernel metadata filename.\n");
+		goto out;
+	}
+
+	if (load_metadata_file(meta_filename, rstate) != 0) {
+		fprintf(stderr, "Failed to load guest metadata file.\n");
+		goto out;
+	}
+
+	err = 0;
+
+out:
+	if (err != 0)
+		destroy_restore_state(rstate);
+	/* The names are only needed while opening; free(NULL) is a no-op. */
+	free(kdata_filename);
+	free(meta_filename);
+	return (err);
+}
+
+/*
+ * Look up 'key' in UCL object 'obj' and convert it with
+ * ucl_object_toint_safe() into '*result_ptr'.
+ *
+ * NOTE: on a missing key or a failed conversion this macro prints a
+ * diagnostic and makes the ENCLOSING FUNCTION return 'ret'.
+ */
+#define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret)		\
+do {									\
+	const ucl_object_t *obj__;					\
+	obj__ = ucl_object_lookup(obj, key);				\
+	if (obj__ == NULL) {						\
+		fprintf(stderr, "Missing key: '%s'\n", key);		\
+		return (ret);						\
+	}								\
+	if (!ucl_object_toint_safe(obj__, result_ptr)) {		\
+		fprintf(stderr, "Cannot convert '%s' value to int.\n",	\
+		    key);						\
+		return (ret);						\
+	}								\
+} while(0)
+
+/*
+ * Look up 'key' in UCL object 'obj' and convert it with
+ * ucl_object_tostring_safe() into '*result_ptr'.
+ *
+ * NOTE: on a missing key or a failed conversion this macro prints a
+ * diagnostic and makes the ENCLOSING FUNCTION return 'ret'.
+ */
+#define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret)		\
+do {									\
+	const ucl_object_t *obj__;					\
+	obj__ = ucl_object_lookup(obj, key);				\
+	if (obj__ == NULL) {						\
+		fprintf(stderr, "Missing key: '%s'\n", key);		\
+		return (ret);						\
+	}								\
+	if (!ucl_object_tostring_safe(obj__, result_ptr)) {		\
+		fprintf(stderr, "Cannot convert '%s' value to string.\n", \
+		    key);						\
+		return (ret);						\
+	}								\
+} while(0)
+
+/*
+ * Find the saved data of kernel structure 'struct_id' in the snapshot.
+ *
+ * Walks the "structs" array of the metadata, and for the first entry whose
+ * "snapshot_req" equals 'struct_id' stores its size in '*struct_size' and
+ * returns a pointer into the mapped state file at the recorded offset.
+ *
+ * Returns NULL when the array is missing or malformed, when no entry
+ * matches, or when an entry's values are negative or do not fit inside
+ * the mapped file.  The metadata is read from a file, so these checks are
+ * done unconditionally rather than with assert().
+ */
+static void *
+lookup_struct(enum snapshot_req struct_id, struct restore_state *rstate,
+    size_t *struct_size)
+{
+	const ucl_object_t *structs = NULL, *obj = NULL;
+	ucl_object_iter_t it = NULL;
+	int64_t snapshot_req, size, file_offset;
+
+	structs = ucl_object_lookup(rstate->meta_root_obj, JSON_STRUCT_ARR_KEY);
+	if (structs == NULL) {
+		fprintf(stderr, "Failed to find '%s' object.\n",
+		    JSON_STRUCT_ARR_KEY);
+		return (NULL);
+	}
+
+	if (ucl_object_type((ucl_object_t *)structs) != UCL_ARRAY) {
+		fprintf(stderr, "Object '%s' is not an array.\n",
+		    JSON_STRUCT_ARR_KEY);
+		return (NULL);
+	}
+
+	while ((obj = ucl_object_iterate(structs, &it, true)) != NULL) {
+		snapshot_req = -1;
+		JSON_GET_INT_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj,
+		    &snapshot_req, NULL);
+		if (snapshot_req < 0) {
+			fprintf(stderr, "%s: invalid '%s' value.\n",
+			    __func__, JSON_SNAPSHOT_REQ_KEY);
+			return (NULL);
+		}
+
+		if ((enum snapshot_req) snapshot_req != struct_id)
+			continue;
+
+		JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj,
+		    &size, NULL);
+		JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj,
+		    &file_offset, NULL);
+
+		/*
+		 * Written as two comparisons so that offset + size cannot
+		 * overflow before it is checked.
+		 */
+		if (size < 0 || file_offset < 0 ||
+		    (uint64_t)file_offset > (uint64_t)rstate->kdata_len ||
+		    (uint64_t)size >
+		    (uint64_t)rstate->kdata_len - (uint64_t)file_offset) {
+			fprintf(stderr, "%s: entry lies outside the "
+			    "snapshot file.\n", __func__);
+			return (NULL);
+		}
+
+		*struct_size = (size_t)size;
+		return (rstate->kdata_map + file_offset);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Check whether metadata entry 'obj' describes device 'dev_name'.
+ *
+ * When the entry's "snapshot_req" string equals 'dev_name', stores the
+ * entry's size in '*data_size' and returns a pointer into the mapped
+ * state file at the recorded offset.
+ *
+ * Returns NULL when the entry is for another device, when a key is
+ * missing or of the wrong type, or when the size/offset are negative or
+ * do not fit inside the mapped file.  The metadata is read from a file,
+ * so these checks are done unconditionally rather than with assert().
+ */
+static void *
+lookup_check_dev(const char *dev_name, struct restore_state *rstate,
+    const ucl_object_t *obj, size_t *data_size)
+{
+	const char *snapshot_req;
+	int64_t size, file_offset;
+
+	snapshot_req = NULL;
+	JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj,
+	    &snapshot_req, NULL);
+	if (snapshot_req == NULL)
+		return (NULL);
+
+	if (strcmp(snapshot_req, dev_name) != 0)
+		return (NULL);
+
+	JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj,
+	    &size, NULL);
+	JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj,
+	    &file_offset, NULL);
+
+	/*
+	 * Written as two comparisons so that offset + size cannot overflow
+	 * before it is checked.
+	 */
+	if (size < 0 || file_offset < 0 ||
+	    (uint64_t)file_offset > (uint64_t)rstate->kdata_len ||
+	    (uint64_t)size >
+	    (uint64_t)rstate->kdata_len - (uint64_t)file_offset) {
+		fprintf(stderr, "%s: entry for '%s' lies outside the "
+		    "snapshot file.\n", __func__, dev_name);
+		return (NULL);
+	}
+
+	*data_size = (size_t)size;
+	return (rstate->kdata_map + file_offset);
+}
+
+/*
+ * Find the saved data of device 'dev_name' in the snapshot.
+ *
+ * Walks the "devices" array of the metadata and returns the first
+ * non-NULL result of lookup_check_dev(), which also fills '*data_size'.
+ * Returns NULL when the array is missing, is not an array, or holds no
+ * matching entry.
+ */
+static void*
+lookup_dev(const char *dev_name, struct restore_state *rstate,
+    size_t *data_size)
+{
+	const ucl_object_t *dev_arr, *entry;
+	ucl_object_iter_t iter = NULL;
+	void *data;
+
+	dev_arr = ucl_object_lookup(rstate->meta_root_obj, JSON_DEV_ARR_KEY);
+	if (dev_arr == NULL) {
+		fprintf(stderr, "Failed to find '%s' object.\n",
+		    JSON_DEV_ARR_KEY);
+		return (NULL);
+	}
+
+	if (ucl_object_type((ucl_object_t *)dev_arr) != UCL_ARRAY) {
+		fprintf(stderr, "Object '%s' is not an array.\n",
+		    JSON_DEV_ARR_KEY);
+		return (NULL);
+	}
+
+	for (;;) {
+		entry = ucl_object_iterate(dev_arr, &iter, true);
+		if (entry == NULL)
+			break;
+
+		data = lookup_check_dev(dev_name, rstate, entry, data_size);
+		if (data != NULL)
+			return (data);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Return the "basic metadata" object of the snapshot metadata, or NULL
+ * (after printing a message) when the key is missing or its value is not
+ * a UCL object.
+ */
+static const ucl_object_t *
+lookup_basic_metadata_object(struct restore_state *rstate)
+{
+	const ucl_object_t *meta;
+
+	meta = ucl_object_lookup(rstate->meta_root_obj,
+	    JSON_BASIC_METADATA_KEY);
+
+	if (meta == NULL) {
+		fprintf(stderr, "Failed to find '%s' object.\n",
+		    JSON_BASIC_METADATA_KEY);
+		return (NULL);
+	}
+
+	if (ucl_object_type((ucl_object_t *)meta) == UCL_OBJECT)
+		return (meta);
+
+	fprintf(stderr, "Object '%s' is not a JSON object.\n",
+	    JSON_BASIC_METADATA_KEY);
+	return (NULL);
+}
+
+/*
+ * Return the VM name recorded in the snapshot's basic metadata, or NULL
+ * when the metadata object or the key is unavailable.
+ */
+const char *
+lookup_vmname(struct restore_state *rstate)
+{
+	const ucl_object_t *basic_meta;
+	const char *name;
+
+	basic_meta = lookup_basic_metadata_object(rstate);
+	if (basic_meta != NULL) {
+		/* Returns NULL from this function on lookup failure. */
+		JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, basic_meta,
+		    &name, NULL);
+		return (name);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Return the memory flags recorded in the snapshot's basic metadata,
+ * truncated to int, or 0 when the metadata object or the key is
+ * unavailable.
+ */
+int
+lookup_memflags(struct restore_state *rstate)
+{
+	const ucl_object_t *basic_meta;
+	int64_t flags;
+
+	basic_meta = lookup_basic_metadata_object(rstate);
+	if (basic_meta != NULL) {
+		/* Returns 0 from this function on lookup failure. */
+		JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, basic_meta,
+		    &flags, 0);
+		return ((int)flags);
+	}
+
+	return (0);
+}
+
+/*
+ * Return the guest memory size recorded in the snapshot's basic metadata.
+ * Yields 0 when the metadata object or the key is unavailable, and also
+ * when the recorded value is negative.
+ */
+size_t
+lookup_memsize(struct restore_state *rstate)
+{
+	const ucl_object_t *basic_meta;
+	int64_t bytes;
+
+	basic_meta = lookup_basic_metadata_object(rstate);
+	if (basic_meta == NULL)
+		return (0);
+
+	/* Returns 0 from this function on lookup failure. */
+	JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, basic_meta, &bytes, 0);
+
+	return (bytes < 0 ? (size_t)0 : (size_t)bytes);
+}
+
+
+/*
+ * Return the number of guest CPUs recorded in the snapshot's basic
+ * metadata, truncated to int, or 0 when the metadata object or the key is
+ * unavailable.
+ */
+int
+lookup_guest_ncpus(struct restore_state *rstate)
+{
+	const ucl_object_t *basic_meta;
+	int64_t count;
+
+	basic_meta = lookup_basic_metadata_object(rstate);
+	if (basic_meta != NULL) {
+		/* Returns 0 from this function on lookup failure. */
+		JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, basic_meta,
+		    &count, 0);
+		return ((int)count);
+	}
+
+	return (0);
+}
+
+/*
+ * Signal handler (presumably installed for SIGWINCH - confirm at the
+ * registration site): refreshes the cached terminal size of stdout in
+ * 'winsize' where TIOCGWINSZ is available.  The signal number is unused.
+ */
+static void
+winch_handler(int signo)
+{
+#ifdef TIOCGWINSZ
+	ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize);
+#endif /* TIOCGWINSZ */
+}
+
+/*
+ * Draw a one-line progress indicator on stdout:
+ *
+ *	[<current> / <max>] |####/_____|
+ *
+ * 'crtval' is clamped to 'maxval'.  Values are shown in KiB, MiB or GiB
+ * depending on 'maxval'.  The bar is sized from the cached terminal width
+ * in 'winsize'; when the terminal is narrower than two columns (for
+ * example because stdout is not a terminal and the width is 0) only the
+ * counters are written.
+ *
+ * Returns 0 on success and -1 when the text does not fit the internal
+ * buffer.
+ */
+static int
+print_progress(size_t crtval, const size_t maxval)
+{
+	int rc;
+	double crtval_gb, maxval_gb;
+	size_t i, win_width, prog_start, prog_done, prog_end;
+	int mval_len;
+
+	static char prog_buf[PROG_BUF_SZ];
+	static const size_t len = sizeof(prog_buf);
+
+	static size_t div;
+	static char *div_str;
+
+	static char wip_bar[] = { '/', '-', '\\', '|' };
+	static int wip_idx = 0;
+
+	if (maxval == 0) {
+		printf("[0B / 0B]\r\n");
+		return (0);
+	}
+
+	if (crtval > maxval)
+		crtval = maxval;
+
+	if (maxval > 10 * GB) {
+		div = GB;
+		div_str = "GiB";
+	} else if (maxval > 10 * MB) {
+		div = MB;
+		div_str = "MiB";
+	} else {
+		div = KB;
+		div_str = "KiB";
+	}
+
+	crtval_gb = (double) crtval / div;
+	maxval_gb = (double) maxval / div;
+
+	/*
+	 * snprintf() returns the length it WANTED to write (or a negative
+	 * value), so truncation is "rc >= len", not "rc == len".
+	 */
+	rc = snprintf(prog_buf, len, "%.03lf", maxval_gb);
+	if (rc < 0 || (size_t)rc >= len) {
+		fprintf(stderr, "Maxval too big\n");
+		return (-1);
+	}
+	/* Width of the maximum, used to right-align the current value. */
+	mval_len = rc;
+
+	rc = snprintf(prog_buf, len, "\r[%*.03lf%s / %.03lf%s] |",
+	    mval_len, crtval_gb, div_str, maxval_gb, div_str);
+
+	if (rc < 0 || (size_t)rc >= len) {
+		fprintf(stderr, "Buffer too small to print progress\n");
+		return (-1);
+	}
+
+	win_width = min(winsize.ws_col, len);
+	prog_start = rc;
+
+	/*
+	 * With a width of 0 or 1 the "win_width - 2" arithmetic below would
+	 * wrap around and index far outside prog_buf.  Emit just the
+	 * counters in that case.
+	 */
+	if (win_width < 2) {
+		write(STDOUT_FILENO, prog_buf, prog_start);
+		return (0);
+	}
+
+	if (prog_start < (win_width - 2)) {
+		prog_end = win_width - prog_start - 2;
+		prog_done = prog_end * (crtval_gb / maxval_gb);
+
+		for (i = prog_start; i < prog_start + prog_done; i++)
+			prog_buf[i] = '#';
+
+		if (crtval != maxval) {
+			/* Rotating work-in-progress marker. */
+			prog_buf[i] = wip_bar[wip_idx];
+			wip_idx = (wip_idx + 1) % sizeof(wip_bar);
+			i++;
+		} else {
+			prog_buf[i++] = '#';
+		}
+
+		for (; i < win_width - 2; i++)
+			prog_buf[i] = '_';
+
+		prog_buf[win_width - 2] = '|';
+	}
+
+	prog_buf[win_width - 1] = '\0';
+	write(STDOUT_FILENO, prog_buf, win_width);
+
+	return (0);
+}
+
+/*
+ * Thread body that redraws the progress line roughly every 50 ms.
+ *
+ * arg points to a struct spinner_info.  On every iteration the value behind
+ * its crtval pointer is sampled and passed to print_progress() together
+ * with the 'total' field.  The loop ends once a sampled value has reached
+ * the 'maxval' field, or when print_progress() reports an error.
+ */
+static void *
+snapshot_spinner_cb(void *arg)
+{
+	const struct timespec delay = {
+		.tv_sec = 0,
+		.tv_nsec = 50 * 1000 * 1000,	/* 50 ms sleep time */
+	};
+	struct spinner_info *info = arg;
+	size_t current, limit;
+
+	if (info == NULL)
+		pthread_exit(NULL);
+
+	for (;;) {
+		current = *info->crtval;
+		limit = info->maxval;
+
+		if (print_progress(current, info->total) < 0) {
+			fprintf(stderr, "Failed to parse progress\n");
+			break;
+		}
+
+		nanosleep(&delay, NULL);
+
+		if (current >= limit)
+			break;
+	}
+
+	pthread_exit(NULL);
+	return NULL;
+}
+
+/*
+ * Transfer one contiguous region of guest memory to or from the snapshot
+ * file.
+ *
+ * snapfd   - snapshot file descriptor
+ * foff     - file offset at which the region starts
+ * src      - host address of the region (source when writing, destination
+ *            when reading)
+ * len      - size of the region in bytes
+ * totalmem - total number of bytes of the whole transfer, used only to
+ *            scale the progress display
+ * op_wr    - true to write memory to the file, false to read it back
+ *
+ * When standard input is a terminal and a window width is known, a helper
+ * thread draws a progress bar and the transfer is split into
+ * SNAPSHOT_CHUNK sized pieces so that the bar advances.
+ *
+ * Returns 0 on success and -1 on failure.  The helper thread is always
+ * joined before returning because it reads locals of this function.
+ */
+static int
+vm_snapshot_mem_part(const int snapfd, const size_t foff, void *src,
+    const size_t len, const size_t totalmem, const bool op_wr)
+{
+	int rc, error;
+	size_t part_done, todo, rem;
+	ssize_t done;
+	bool show_progress;
+	pthread_t spinner_th;
+	/* Function scope: must outlive the spinner thread. */
+	struct spinner_info si;
+	char *cursor;
+
+	if (lseek(snapfd, foff, SEEK_SET) < 0) {
+		perror("Failed to change file offset");
+		return (-1);
+	}
+
+	show_progress = false;
+	if (isatty(STDIN_FILENO) && (winsize.ws_col != 0))
+		show_progress = true;
+
+	error = 0;
+	part_done = foff;
+	rem = len;
+	cursor = src;
+
+	if (show_progress) {
+		si = (struct spinner_info) {
+			.crtval = &part_done,
+			.maxval = foff + len,
+			.total = totalmem
+		};
+
+		/* pthread functions return the error; errno is not set. */
+		rc = pthread_create(&spinner_th, 0, snapshot_spinner_cb, &si);
+		if (rc) {
+			fprintf(stderr, "Unable to create spinner thread: %s\n",
+			    strerror(rc));
+			show_progress = false;
+		}
+	}
+
+	while (rem > 0) {
+		if (show_progress)
+			todo = min(SNAPSHOT_CHUNK, rem);
+		else
+			todo = rem;
+
+		if (op_wr)
+			done = write(snapfd, cursor, todo);
+		else
+			done = read(snapfd, cursor, todo);
+		if (done < 0) {
+			perror(op_wr ? "Failed to write in file" :
+			    "Failed to read from file");
+			error = -1;
+			break;
+		}
+		if (done == 0) {
+			/* No forward progress: avoid spinning forever. */
+			fprintf(stderr, "%s snapshot file\n", op_wr ?
+			    "Could not write any more data to" :
+			    "Unexpected end of");
+			error = -1;
+			break;
+		}
+
+		cursor += done;
+		part_done += done;
+		rem -= done;
+	}
+
+	if (show_progress) {
+		/*
+		 * The spinner only exits once the counter reaches its limit.
+		 * On failure push the counter to the limit so the thread
+		 * terminates and can be joined before our locals go away.
+		 */
+		if (error != 0)
+			part_done = foff + len;
+
+		rc = pthread_join(spinner_th, NULL);
+		if (rc)
+			fprintf(stderr, "Unable to end spinner thread: %s\n",
+			    strerror(rc));
+	}
+
+	return (error);
+}
+
+/*
+ * Save (op_wr == true) or restore (op_wr == false) guest memory through
+ * snapfd.
+ *
+ * The low memory segment is stored at file offset 0 and the high memory
+ * segment, when present, directly after it; in the host mapping the high
+ * segment is addressed at baseaddr + 4GB.  On restore, memsz must equal the
+ * guest's current lowmem + highmem.
+ *
+ * A SIGWINCH handler is installed for the duration of the transfer so the
+ * progress display follows terminal resizes; the previous handler is put
+ * back before returning.
+ *
+ * Returns the total number of bytes of guest memory, or 0 on failure.
+ */
+static size_t
+vm_snapshot_mem(struct vmctx *ctx, int snapfd, size_t memsz, const bool op_wr)
+{
+	int ret;
+	size_t lowmem, highmem, totalmem;
+	char *baseaddr;
+
+	ret = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem);
+	if (ret) {
+		fprintf(stderr, "%s: unable to retrieve guest memory size\r\n",
+		    __func__);
+		return (0);
+	}
+	totalmem = lowmem + highmem;
+
+	if ((op_wr == false) && (totalmem != memsz)) {
+		/* Both operands are size_t, so %zu is the matching format. */
+		fprintf(stderr, "%s: mem size mismatch: %zu vs %zu\r\n",
+		    __func__, totalmem, memsz);
+		return (0);
+	}
+
+	/* Default width, overridden when the terminal reports its size. */
+	winsize.ws_col = 80;
+#ifdef TIOCGWINSZ
+	ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize);
+#endif /* TIOCGWINSZ */
+	old_winch_handler = signal(SIGWINCH, winch_handler);
+
+	ret = vm_snapshot_mem_part(snapfd, 0, baseaddr, lowmem,
+	    totalmem, op_wr);
+	if (ret) {
+		fprintf(stderr, "%s: Could not %s lowmem\r\n",
+		    __func__, op_wr ? "write" : "read");
+		totalmem = 0;
+		goto done;
+	}
+
+	if (highmem == 0)
+		goto done;
+
+	ret = vm_snapshot_mem_part(snapfd, lowmem, baseaddr + 4*GB,
+	    highmem, totalmem, op_wr);
+	if (ret) {
+		fprintf(stderr, "%s: Could not %s highmem\r\n",
+		    __func__, op_wr ? "write" : "read");
+		totalmem = 0;
+		goto done;
+	}
+
+done:
+	printf("\r\n");
+	signal(SIGWINCH, old_winch_handler);
+
+	return (totalmem);
+}
+
+/*
+ * Read guest memory back from the snapshot memory file described by
+ * rstate.  Returns 0 when vm_snapshot_mem() reports exactly
+ * rstate->vmmem_len bytes, -1 otherwise.
+ */
+int
+restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate)
+{
+	size_t nbytes;
+
+	nbytes = vm_snapshot_mem(ctx, rstate->vmmem_fd, rstate->vmmem_len,
+	    false);
+
+	return (nbytes == rstate->vmmem_len ? 0 : -1);
+}
+
+/*
+ * Restore one kernel structure.
+ *
+ * The saved bytes for info->req are located with lookup_struct() and
+ * wrapped in a vm_snapshot_meta set up for VM_SNAPSHOT_RESTORE, which is
+ * then submitted through vm_snapshot_req().
+ *
+ * Returns 0 on success, -1 when the data is missing or empty, and
+ * otherwise the non-zero value returned by vm_snapshot_req().
+ */
+static int
+vm_restore_kern_struct(struct vmctx *ctx, struct restore_state *rstate,
+    const struct vm_snapshot_kern_info *info)
+{
+	struct vm_snapshot_meta restore_meta;
+	size_t blob_len;
+	void *blob;
+	int error;
+
+	blob = lookup_struct(info->req, rstate, &blob_len);
+	if (blob == NULL) {
+		fprintf(stderr, "%s: Failed to lookup struct %s\r\n",
+		    __func__, info->struct_name);
+		return (-1);
+	}
+
+	if (blob_len == 0) {
+		fprintf(stderr, "%s: Kernel struct size was 0 for: %s\r\n",
+		    __func__, info->struct_name);
+		return (-1);
+	}
+
+	restore_meta = (struct vm_snapshot_meta) {
+		.ctx = ctx,
+		.dev_name = info->struct_name,
+		.dev_req = info->req,
+
+		.buffer.buf_start = blob,
+		.buffer.buf_size = blob_len,
+
+		.buffer.buf = blob,
+		.buffer.buf_rem = blob_len,
+
+		.op = VM_SNAPSHOT_RESTORE,
+	};
+
+	error = vm_snapshot_req(&restore_meta);
+	if (error != 0)
+		fprintf(stderr, "%s: Failed to restore struct: %s\r\n",
+		    __func__, info->struct_name);
+
+	return (error);
+}
+
+/*
+ * Restore every entry of snapshot_kern_structs in table order, stopping at
+ * the first failure.  Returns 0 when all entries were restored, otherwise
+ * the first non-zero result of vm_restore_kern_struct().
+ */
+int
+vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate)
+{
+	size_t idx;
+	int error;
+
+	error = 0;
+	for (idx = 0; error == 0 && idx < nitems(snapshot_kern_structs); idx++)
+		error = vm_restore_kern_struct(ctx, rstate,
+		    &snapshot_kern_structs[idx]);
+
+	return (error);
+}
+
+/*
+ * Restore one userspace device model.
+ *
+ * The saved bytes for info->dev_name are located with lookup_dev().  A
+ * device with no saved data, or with zero-length data, is reported on
+ * stderr and skipped with a success return.  Otherwise the data is wrapped
+ * in a vm_snapshot_meta set up for VM_SNAPSHOT_RESTORE and handed to the
+ * device's snapshot callback.
+ *
+ * Returns 0 on success or skip, -1 when the callback fails.
+ */
+int
+vm_restore_user_dev(struct vmctx *ctx, struct restore_state *rstate,
+    const struct vm_snapshot_dev_info *info)
+{
+	struct vm_snapshot_meta restore_meta;
+	size_t blob_len;
+	void *blob;
+
+	blob = lookup_dev(info->dev_name, rstate, &blob_len);
+	if (blob == NULL) {
+		fprintf(stderr, "Failed to lookup dev: %s\r\n", info->dev_name);
+		fprintf(stderr, "Continuing the restore/migration process\r\n");
+		return (0);
+	}
+
+	if (blob_len == 0) {
+		fprintf(stderr, "%s: Device size is 0. "
+		    "Assuming %s is not used\r\n",
+		    __func__, info->dev_name);
+		return (0);
+	}
+
+	restore_meta = (struct vm_snapshot_meta) {
+		.ctx = ctx,
+		.dev_name = info->dev_name,
+
+		.buffer.buf_start = blob,
+		.buffer.buf_size = blob_len,
+
+		.buffer.buf = blob,
+		.buffer.buf_rem = blob_len,
+
+		.op = VM_SNAPSHOT_RESTORE,
+	};
+
+	if ((*info->snapshot_cb)(&restore_meta) == 0)
+		return (0);
+
+	fprintf(stderr, "Failed to restore dev: %s\r\n", info->dev_name);
+	return (-1);
+}
+
+
+/*
+ * Restore every entry of snapshot_devs in table order, stopping at the
+ * first failure.  Returns 0 when all entries were processed, otherwise the
+ * first non-zero result of vm_restore_user_dev().
+ */
+int
+vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate)
+{
+	size_t idx;
+	int error;
+
+	error = 0;
+	for (idx = 0; error == 0 && idx < nitems(snapshot_devs); idx++)
+		error = vm_restore_user_dev(ctx, rstate, &snapshot_devs[idx]);
+
+	return (error);
+}
+
+/*
+ * Invoke the pause callback of every entry in snapshot_devs that provides
+ * one, in table order.  Stops at, and returns, the first non-zero callback
+ * result; returns 0 when every callback succeeded.
+ */
+int
+vm_pause_user_devs(struct vmctx *ctx)
+{
+	const struct vm_snapshot_dev_info *dev;
+	size_t idx;
+	int error;
+
+	for (idx = 0; idx < nitems(snapshot_devs); idx++) {
+		dev = &snapshot_devs[idx];
+		if (dev->pause_cb != NULL) {
+			error = dev->pause_cb(ctx, dev->dev_name);
+			if (error != 0)
+				return (error);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Invoke the resume callback of every entry in snapshot_devs that provides
+ * one, in table order.  Stops at, and returns, the first non-zero callback
+ * result; returns 0 when every callback succeeded.
+ */
+int
+vm_resume_user_devs(struct vmctx *ctx)
+{
+	const struct vm_snapshot_dev_info *dev;
+	size_t idx;
+	int error;
+
+	for (idx = 0; idx < nitems(snapshot_devs); idx++) {
+		dev = &snapshot_devs[idx];
+		if (dev->resume_cb != NULL) {
+			error = dev->resume_cb(ctx, dev->dev_name);
+			if (error != 0)
+				return (error);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Snapshot one kernel structure and record it in the metadata.
+ *
+ * meta describes the request and the scratch buffer.  After
+ * vm_snapshot_req() has filled the buffer, the used portion is appended to
+ * data_fd and one list instance (debug name, request id, size and file
+ * offset) is emitted on xop.  *offset is advanced by the number of bytes
+ * written.
+ *
+ * array_key names the list instance; the same key is used to open and to
+ * close it so the libxo open/close calls are always paired.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+vm_snapshot_kern_struct(int data_fd, xo_handle_t *xop, const char *array_key,
+    struct vm_snapshot_meta *meta, off_t *offset)
+{
+	int ret;
+	size_t data_size;
+	ssize_t write_cnt;
+
+	ret = vm_snapshot_req(meta);
+	if (ret != 0) {
+		fprintf(stderr, "%s: Failed to snapshot struct %s\r\n",
+		    __func__, meta->dev_name);
+		return (-1);
+	}
+
+	data_size = vm_get_snapshot_size(meta);
+
+	write_cnt = write(data_fd, meta->buffer.buf_start, data_size);
+	if (write_cnt < 0 || (size_t)write_cnt != data_size) {
+		perror("Failed to write all snapshotted data.");
+		return (-1);
+	}
+
+	/* Write metadata. */
+	xo_open_instance_h(xop, array_key);
+	xo_emit_h(xop, "{:debug_name/%s}\n", meta->dev_name);
+	xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%d}\n",
+	    meta->dev_req);
+	xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size);
+	xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset);
+	xo_close_instance_h(xop, array_key);
+
+	*offset += data_size;
+
+	return (0);
+}
+
+/*
+ * Snapshot every entry of snapshot_kern_structs.
+ *
+ * A single scratch buffer of SNAPSHOT_BUFFER_SIZE bytes is reused for each
+ * structure; it is cleared and rewound before every request.  The data is
+ * appended to data_fd and described in a JSON_STRUCT_ARR_KEY list on xop.
+ *
+ * Returns 0 on success, ENOMEM when the scratch buffer cannot be allocated
+ * and -1 when saving a structure fails.
+ */
+static int
+vm_snapshot_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop)
+{
+	int ret, i, error;
+	/* off_t to match the pointer type vm_snapshot_kern_struct() takes. */
+	off_t offset;
+	size_t buf_size;
+	char *buffer;
+	struct vm_snapshot_meta *meta;
+
+	error = 0;
+	offset = 0;
+	buf_size = SNAPSHOT_BUFFER_SIZE;
+
+	buffer = malloc(SNAPSHOT_BUFFER_SIZE * sizeof(char));
+	if (buffer == NULL) {
+		error = ENOMEM;
+		perror("Failed to allocate memory for snapshot buffer");
+		goto err_vm_snapshot_kern_data;
+	}
+
+	meta = &(struct vm_snapshot_meta) {
+		.ctx = ctx,
+
+		.buffer.buf_start = buffer,
+		.buffer.buf_size = buf_size,
+
+		.op = VM_SNAPSHOT_SAVE,
+	};
+
+	xo_open_list_h(xop, JSON_STRUCT_ARR_KEY);
+	for (i = 0; i < nitems(snapshot_kern_structs); i++) {
+		meta->dev_name = snapshot_kern_structs[i].struct_name;
+		meta->dev_req = snapshot_kern_structs[i].req;
+
+		/* Start every structure with a clean, rewound buffer. */
+		memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+		meta->buffer.buf = meta->buffer.buf_start;
+		meta->buffer.buf_rem = meta->buffer.buf_size;
+
+		/* Instances belong to the list opened above. */
+		ret = vm_snapshot_kern_struct(data_fd, xop,
+		    JSON_STRUCT_ARR_KEY, meta, &offset);
+		if (ret != 0) {
+			error = -1;
+			goto err_vm_snapshot_kern_data;
+		}
+	}
+	xo_close_list_h(xop, JSON_STRUCT_ARR_KEY);
+
+err_vm_snapshot_kern_data:
+	free(buffer);
+	return (error);
+}
+
+/*
+ * Emit the JSON_BASIC_METADATA_KEY container on xop: number of vCPUs, VM
+ * name, guest memory size (memsz) and memory flags.
+ *
+ * Returns 0 on success, or the non-zero result of vm_get_name() when the
+ * VM name cannot be obtained (nothing is emitted in that case).
+ */
+static int
+vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop, size_t memsz)
+{
+	char name[MAX_VMNAME] = { 0 };
+	int flags, rc;
+
+	rc = vm_get_name(ctx, name, MAX_VMNAME - 1);
+	if (rc != 0) {
+		perror("Failed to get VM name");
+		return (rc);
+	}
+
+	flags = vm_get_memflags(ctx);
+
+	xo_open_container_h(xop, JSON_BASIC_METADATA_KEY);
+	/* NOTE(review): "%ld" expects a long; confirm guest_ncpus' type. */
+	xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%ld}\n", guest_ncpus);
+	xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", name);
+	xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n", memsz);
+	xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", flags);
+	xo_close_container_h(xop, JSON_BASIC_METADATA_KEY);
+
+	return (0);
+}
+
+/*
+ * Append the used portion of meta's buffer to data_fd and describe it on
+ * xop as one instance named array_key (device name, size, file offset).
+ * *offset is advanced by the number of bytes written.
+ *
+ * Returns 0 on success, -1 when the data could not be written completely.
+ */
+static int
+vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop, const char *array_key,
+    struct vm_snapshot_meta *meta, off_t *offset)
+{
+	/* write() returns ssize_t; keep its full width. */
+	ssize_t write_cnt;
+	size_t data_size;
+
+	data_size = vm_get_snapshot_size(meta);
+
+	write_cnt = write(data_fd, meta->buffer.buf_start, data_size);
+	if (write_cnt < 0 || (size_t)write_cnt != data_size) {
+		perror("Failed to write all snapshotted data.");
+		return (-1);
+	}
+
+	/* Write metadata. */
+	xo_open_instance_h(xop, array_key);
+	xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name);
+	xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size);
+	xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset);
+	xo_close_instance_h(xop, array_key);
+
+	*offset += data_size;
+
+	return (0);
+}
+
+/*
+ * Snapshot one userspace device: run its snapshot callback on meta, then
+ * store the collected bytes and their metadata entry through
+ * vm_snapshot_dev_write_data().
+ *
+ * Returns 0 on success, otherwise the non-zero result of the failing step.
+ */
+static int
+vm_snapshot_user_dev(const struct vm_snapshot_dev_info *info,
+    int data_fd, xo_handle_t *xop,
+    struct vm_snapshot_meta *meta, off_t *offset)
+{
+	int error;
+
+	error = (*info->snapshot_cb)(meta);
+	if (error != 0) {
+		fprintf(stderr, "Failed to snapshot %s; ret=%d\r\n",
+		    meta->dev_name, error);
+		return (error);
+	}
+
+	return (vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY,
+	    meta, offset));
+}
+
+/*
+ * Snapshot every entry of snapshot_devs.
+ *
+ * Device data is appended to data_fd starting at its current file offset,
+ * and each device is described in a JSON_DEV_ARR_KEY list on xop.  A single
+ * scratch buffer of SNAPSHOT_BUFFER_SIZE bytes is reused for each device;
+ * it is cleared and rewound before every callback.
+ *
+ * Returns 0 on success, -1 when the file offset cannot be determined,
+ * ENOMEM when the scratch buffer cannot be allocated, and otherwise the
+ * non-zero result of the first device that failed.
+ */
+static int
+vm_snapshot_user_devs(struct vmctx *ctx, int data_fd, xo_handle_t *xop)
+{
+	int ret, i;
+	off_t offset;
+	void *buffer;
+	size_t buf_size;
+	struct vm_snapshot_meta *meta;
+
+	ret = 0;
+	buf_size = SNAPSHOT_BUFFER_SIZE;
+
+	offset = lseek(data_fd, 0, SEEK_CUR);
+	if (offset < 0) {
+		perror("Failed to get data file current offset.");
+		return (-1);
+	}
+
+	buffer = malloc(buf_size);
+	if (buffer == NULL) {
+		perror("Failed to allocate memory for snapshot buffer");
+		/* Out of memory, not out of disk space. */
+		ret = ENOMEM;
+		goto snapshot_err;
+	}
+
+	meta = &(struct vm_snapshot_meta) {
+		.ctx = ctx,
+
+		.buffer.buf_start = buffer,
+		.buffer.buf_size = buf_size,
+
+		.op = VM_SNAPSHOT_SAVE,
+	};
+
+	xo_open_list_h(xop, JSON_DEV_ARR_KEY);
+
+	/* Save the state of every device that supports this feature */
+	for (i = 0; i < nitems(snapshot_devs); i++) {
+		meta->dev_name = snapshot_devs[i].dev_name;
+
+		/* Start every device with a clean, rewound buffer. */
+		memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+		meta->buffer.buf = meta->buffer.buf_start;
+		meta->buffer.buf_rem = meta->buffer.buf_size;
+
+		ret = vm_snapshot_user_dev(&snapshot_devs[i], data_fd, xop,
+		    meta, &offset);
+		if (ret != 0)
+			goto snapshot_err;
+	}
+
+	xo_close_list_h(xop, JSON_DEV_ARR_KEY);
+
+snapshot_err:
+	free(buffer);
+	return (ret);
+}
+
+/*
+ * Record 'vcpu' in the vcpus_active set.  If a checkpoint is in progress
+ * at that moment, the vCPU is also marked in vcpus_suspended and the
+ * calling thread blocks on vcpus_can_run until checkpoint_active is
+ * cleared, after which the suspended mark is removed again.  All of this
+ * happens with vcpu_lock held (pthread_cond_wait releases it while
+ * waiting).
+ */
+void
+checkpoint_cpu_add(int vcpu)
+{
+
+ pthread_mutex_lock(&vcpu_lock);
+ CPU_SET(vcpu, &vcpus_active);
+
+ if (checkpoint_active) {
+ CPU_SET(vcpu, &vcpus_suspended);
+ /* Loop guards against wakeups while the checkpoint is still active. */
+ while (checkpoint_active)
+ pthread_cond_wait(&vcpus_can_run, &vcpu_lock);
+ CPU_CLR(vcpu, &vcpus_suspended);
+ }
+ pthread_mutex_unlock(&vcpu_lock);
+}
+
+/*
+ * When a vCPU is suspended for any reason, it calls
+ * checkpoint_cpu_suspend(). This records that the vCPU is idle.
+ * Before returning from suspension, checkpoint_cpu_resume() is
+ * called. In suspend we note that the vCPU is idle. In resume we
+ * pause the vCPU thread until the checkpoint is complete. The reason
+ * for the two-step process is that vCPUs might already be stopped in
+ * the debug server when a checkpoint is requested. This approach
+ * allows us to account for and handle those vCPUs.
+ */
+void
+checkpoint_cpu_suspend(int vcpu)
+{
+
+ pthread_mutex_lock(&vcpu_lock);
+ CPU_SET(vcpu, &vcpus_suspended);
+ /*
+ * If a checkpoint is pending and the suspended set now equals the
+ * active set, signal vcpus_idle (the condition vm_vcpu_pause() waits on).
+ */
+ if (checkpoint_active && CPU_CMP(&vcpus_active, &vcpus_suspended) == 0)
+ pthread_cond_signal(&vcpus_idle);
+ pthread_mutex_unlock(&vcpu_lock);
+}
+
+void
+checkpoint_cpu_resume(int vcpu)
+{
+
+ pthread_mutex_lock(&vcpu_lock);
+ /* Hold the vCPU thread here for as long as a checkpoint is active. */
+ while (checkpoint_active)
+ pthread_cond_wait(&vcpus_can_run, &vcpu_lock);
+ /* Only now is the vCPU no longer counted as suspended. */
+ CPU_CLR(vcpu, &vcpus_suspended);
+ pthread_mutex_unlock(&vcpu_lock);
+}
+
+/*
+ * Begin a checkpoint: set checkpoint_active, call vm_suspend_cpu() with a
+ * vCPU id of -1, and wait on vcpus_idle until the set of suspended vCPUs
+ * equals the set of active vCPUs.
+ */
+static void
+vm_vcpu_pause(struct vmctx *ctx)
+{
+
+ pthread_mutex_lock(&vcpu_lock);
+ checkpoint_active = true;
+ vm_suspend_cpu(ctx, -1);
+ /* Re-check the sets after every wakeup. */
+ while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0)
+ pthread_cond_wait(&vcpus_idle, &vcpu_lock);
+ pthread_mutex_unlock(&vcpu_lock);
+}
+
+/*
+ * End a checkpoint: clear checkpoint_active under vcpu_lock, call
+ * vm_resume_cpu() with a vCPU id of -1, and wake every thread waiting on
+ * vcpus_can_run.
+ */
+static void
+vm_vcpu_resume(struct vmctx *ctx)
+{
+
+ pthread_mutex_lock(&vcpu_lock);
+ checkpoint_active = false;
+ pthread_mutex_unlock(&vcpu_lock);
+ vm_resume_cpu(ctx, -1);
+ /* Broadcast is issued after the lock has been dropped. */
+ pthread_cond_broadcast(&vcpus_can_run);
+}
+
+/*
+ * Take a snapshot of the running guest.
+ *
+ * Three files are produced from the checkpoint_file name:
+ *   <checkpoint_file>        guest memory
+ *   <checkpoint_file>.kern   kernel structures followed by device state
+ *   <checkpoint_file>.meta   JSON metadata written through libxo
+ *
+ * The vCPUs and the device models are paused while the state is collected.
+ * When stop_vm is true and the snapshot succeeded, the VM is destroyed and
+ * the process exits; otherwise devices and vCPUs are resumed.
+ *
+ * Returns 0 on success and a non-zero value on failure.  Every failure
+ * path reports an error, and the resume steps only run once the guest has
+ * actually been paused.
+ */
+static int
+vm_checkpoint(struct vmctx *ctx, char *checkpoint_file, bool stop_vm)
+{
+	/* -1 marks "not open"; 0 is a valid descriptor. */
+	int fd_checkpoint = -1, kdata_fd = -1;
+	int ret = 0;
+	int error = 0;
+	size_t memsz;
+	xo_handle_t *xop = NULL;
+	char *meta_filename = NULL;
+	char *kdata_filename = NULL;
+	FILE *meta_file = NULL;
+
+	kdata_filename = strcat_extension(checkpoint_file, ".kern");
+	if (kdata_filename == NULL) {
+		fprintf(stderr, "Failed to construct kernel data filename.\n");
+		return (-1);
+	}
+
+	kdata_fd = open(kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700);
+	if (kdata_fd < 0) {
+		perror("Failed to open kernel data snapshot file.");
+		error = -1;
+		goto done;
+	}
+
+	fd_checkpoint = open(checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700);
+
+	if (fd_checkpoint < 0) {
+		perror("Failed to create checkpoint file");
+		error = -1;
+		goto done;
+	}
+
+	meta_filename = strcat_extension(checkpoint_file, ".meta");
+	if (meta_filename == NULL) {
+		fprintf(stderr, "Failed to construct vm metadata filename.\n");
+		error = -1;
+		goto done;
+	}
+
+	meta_file = fopen(meta_filename, "w");
+	if (meta_file == NULL) {
+		perror("Failed to open vm metadata snapshot file.");
+		error = -1;
+		goto done;
+	}
+
+	xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY);
+	if (xop == NULL) {
+		perror("Failed to get libxo handle on metadata file.");
+		error = -1;
+		goto done;
+	}
+
+	/* From here on the guest is paused; failures must go to 'resume'. */
+	vm_vcpu_pause(ctx);
+
+	ret = vm_pause_user_devs(ctx);
+	if (ret != 0) {
+		fprintf(stderr, "Could not pause devices\r\n");
+		error = ret;
+		goto resume;
+	}
+
+	memsz = vm_snapshot_mem(ctx, fd_checkpoint, 0, true);
+	if (memsz == 0) {
+		perror("Could not write guest memory to file");
+		error = -1;
+		goto resume;
+	}
+
+	ret = vm_snapshot_basic_metadata(ctx, xop, memsz);
+	if (ret != 0) {
+		fprintf(stderr, "Failed to snapshot vm basic metadata.\n");
+		error = -1;
+		goto resume;
+	}
+
+	ret = vm_snapshot_kern_structs(ctx, kdata_fd, xop);
+	if (ret != 0) {
+		fprintf(stderr, "Failed to snapshot vm kernel data.\n");
+		error = -1;
+		goto resume;
+	}
+
+	ret = vm_snapshot_user_devs(ctx, kdata_fd, xop);
+	if (ret != 0) {
+		fprintf(stderr, "Failed to snapshot device state.\n");
+		error = -1;
+		goto resume;
+	}
+
+	xo_finish_h(xop);
+
+	if (stop_vm) {
+		vm_destroy(ctx);
+		exit(0);
+	}
+
+resume:
+	ret = vm_resume_user_devs(ctx);
+	if (ret != 0)
+		fprintf(stderr, "Could not resume devices\r\n");
+	vm_vcpu_resume(ctx);
+
+done:
+	if (fd_checkpoint >= 0)
+		close(fd_checkpoint);
+	free(meta_filename);
+	free(kdata_filename);
+	if (xop != NULL)
+		xo_destroy(xop);
+	if (meta_file != NULL)
+		fclose(meta_file);
+	if (kdata_fd >= 0)
+		close(kdata_fd);
+	return (error);
+}
+
+/*
+ * Receive one struct checkpoint_op from conn_fd and act on it.
+ *
+ * START_CHECKPOINT takes a snapshot and lets the guest continue;
+ * START_SUSPEND takes a snapshot and stops the VM.  The message is
+ * received directly into a zeroed, properly aligned structure, and a
+ * message shorter than the structure is rejected instead of being
+ * interpreted.
+ *
+ * conn_fd is always closed before returning.  Returns 0 on success, -1 on
+ * a receive error, a short message or an unknown operation, and otherwise
+ * the result of vm_checkpoint().
+ */
+int
+get_checkpoint_msg(int conn_fd, struct vmctx *ctx)
+{
+	struct checkpoint_op checkpoint_op;
+	unsigned char *buf;
+	size_t len, total_recv;
+	ssize_t recv_len;
+	int err = 0;
+
+	memset(&checkpoint_op, 0, sizeof(checkpoint_op));
+	buf = (unsigned char *)&checkpoint_op;
+	len = sizeof(checkpoint_op);	/* expected length */
+	total_recv = 0;
+
+	while (total_recv < len) {
+		recv_len = recv(conn_fd, buf + total_recv, len - total_recv, 0);
+		if (recv_len < 0) {
+			if (errno == EINTR)
+				continue;
+			perror("Error while receiving data from bhyvectl");
+			err = -1;
+			goto done;
+		}
+		if (recv_len == 0)	/* peer closed the connection */
+			break;
+		total_recv += recv_len;
+	}
+
+	if (total_recv != len) {
+		fprintf(stderr, "Incomplete checkpoint message received.\n");
+		err = -1;
+		goto done;
+	}
+
+	switch (checkpoint_op.op) {
+	case START_CHECKPOINT:
+		err = vm_checkpoint(ctx, checkpoint_op.snapshot_filename, false);
+		break;
+	case START_SUSPEND:
+		err = vm_checkpoint(ctx, checkpoint_op.snapshot_filename, true);
+		break;
+	default:
+		fprintf(stderr, "Unrecognized checkpoint operation.\n");
+		err = -1;
+	}
+
+done:
+	close(conn_fd);
+	return (err);
+}
+
+/*
+ * Listen for commands from bhyvectl
+ *
+ * param points to a struct checkpoint_thread_info.  Connections are
+ * accepted on its socket_fd one at a time and handed to
+ * get_checkpoint_msg().  The loop ends when accept() fails, in which case
+ * the error is reported and the thread returns NULL.
+ */
+void *
+checkpoint_thread(void *param)
+{
+	struct checkpoint_thread_info *thread_info;
+	int conn_fd, ret;
+
+	pthread_set_name_np(pthread_self(), "checkpoint thread");
+	thread_info = (struct checkpoint_thread_info *)param;
+
+	while ((conn_fd = accept(thread_info->socket_fd, NULL, NULL)) > -1) {
+		ret = get_checkpoint_msg(conn_fd, thread_info->ctx);
+		if (ret != 0) {
+			fprintf(stderr, "Failed to read message on checkpoint "
+			    "socket. Retrying.\n");
+		}
+	}
+	/* accept() reports failure with -1, so test for any negative value. */
+	if (conn_fd < 0) {
+		perror("Failed to accept connection");
+	}
+
+	return (NULL);
+}
+
+/*
+ * Make sure the runtime directories used for IPC with bhyvectl (the UNIX
+ * sockets live there) exist: BHYVE_RUN_DIR first, then CHECKPOINT_RUN_DIR.
+ * A directory that already exists is not an error.  Returns 0 on success
+ * or the negative mkdir() result on failure.
+ */
+static int
+make_checkpoint_dir(void)
+{
+	const char *dirs[] = { BHYVE_RUN_DIR, CHECKPOINT_RUN_DIR };
+	size_t idx;
+	int rc;
+
+	for (idx = 0; idx < sizeof(dirs) / sizeof(dirs[0]); idx++) {
+		rc = mkdir(dirs[idx], 0755);
+		if (rc < 0 && errno != EEXIST)
+			return (rc);
+	}
+
+	return 0;
+}
+
+/*
+ * Create the listening socket for IPC with bhyvectl
+ *
+ * Initializes the vCPU bookkeeping lock and condition variables (exiting
+ * the process if that fails), binds a UNIX stream socket at
+ * CHECKPOINT_RUN_DIR/<vm name>, and starts checkpoint_thread() on it.
+ *
+ * Returns 0 on success and a non-zero value on failure.  On failure the
+ * socket is closed, its path removed and the thread argument freed.
+ */
+int
+init_checkpoint_thread(struct vmctx *ctx)
+{
+	struct checkpoint_thread_info *checkpoint_info = NULL;
+	struct sockaddr_un addr;
+	int socket_fd = -1;
+	pthread_t checkpoint_pthread;
+	char vmname_buf[MAX_VMNAME];
+	int ret, err = 0;
+
+	memset(&addr, 0, sizeof(addr));
+	memset(vmname_buf, 0, sizeof(vmname_buf));
+
+	err = pthread_mutex_init(&vcpu_lock, NULL);
+	if (err != 0)
+		errc(1, err, "checkpoint mutex init");
+	err = pthread_cond_init(&vcpus_idle, NULL);
+	if (err == 0)
+		err = pthread_cond_init(&vcpus_can_run, NULL);
+	if (err != 0)
+		errc(1, err, "checkpoint cv init");
+
+	socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (socket_fd < 0) {
+		perror("Socket creation failed (IPC with bhyvectl");
+		err = -1;
+		goto fail;
+	}
+
+	err = make_checkpoint_dir();
+	if (err < 0) {
+		perror("Failed to create checkpoint runtime directory");
+		goto fail;
+	}
+
+	addr.sun_family = AF_UNIX;
+
+	err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1);
+	if (err != 0) {
+		perror("Failed to get VM name");
+		goto fail;
+	}
+
+	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s",
+	    CHECKPOINT_RUN_DIR, vmname_buf);
+	addr.sun_len = SUN_LEN(&addr);
+	/* Remove a stale socket left behind by a previous run. */
+	unlink(addr.sun_path);
+
+	if (bind(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) {
+		perror("Failed to bind socket (IPC with bhyvectl)");
+		err = -1;
+		goto fail;
+	}
+
+	if (listen(socket_fd, 10) < 0) {
+		perror("Failed to listen on socket (IPC with bhyvectl)");
+		err = -1;
+		goto fail;
+	}
+
+	checkpoint_info = calloc(1, sizeof(*checkpoint_info));
+	if (checkpoint_info == NULL) {
+		perror("Failed to allocate checkpoint thread info");
+		err = -1;
+		goto fail;
+	}
+	checkpoint_info->ctx = ctx;
+	checkpoint_info->socket_fd = socket_fd;
+
+	/* pthread_create() returns a positive error number on failure. */
+	ret = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread,
+	    checkpoint_info);
+	if (ret != 0) {
+		err = ret;
+		goto fail;
+	}
+
+	return (0);
+fail:
+	free(checkpoint_info);
+	if (socket_fd >= 0)
+		close(socket_fd);
+	/* The path is empty if we failed before it was built. */
+	if (addr.sun_path[0] != '\0')
+		unlink(addr.sun_path);
+
+	return (err);
+}
+
+/*
+ * Report on stderr that saving or restoring the buffer called 'bufname'
+ * failed.  'op' selects the wording ("save", "restore" or "unknown").
+ */
+void
+vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op)
+{
+	const char *op_name;
+
+	switch (op) {
+	case VM_SNAPSHOT_SAVE:
+		op_name = "save";
+		break;
+	case VM_SNAPSHOT_RESTORE:
+		op_name = "restore";
+		break;
+	default:
+		op_name = "unknown";
+		break;
+	}
+
+	fprintf(stderr, "%s: snapshot-%s failed for %s\r\n",
+	    __func__, op_name, bufname);
+}
+
+/*
+ * Move data_size bytes between 'data' and the snapshot buffer in meta.
+ *
+ * On VM_SNAPSHOT_SAVE the bytes are copied from 'data' into the buffer, on
+ * VM_SNAPSHOT_RESTORE from the buffer into 'data'.  After a copy the buffer
+ * position advances and the remaining space shrinks by data_size.
+ *
+ * Returns 0 on success, E2BIG when fewer than data_size bytes remain in
+ * the buffer, and EINVAL for any other operation.
+ */
+int
+vm_snapshot_buf(volatile void *data, size_t data_size,
+    struct vm_snapshot_meta *meta)
+{
+	struct vm_snapshot_buffer *cursor = &meta->buffer;
+
+	if (data_size > cursor->buf_rem) {
+		fprintf(stderr, "%s: buffer too small\r\n", __func__);
+		return (E2BIG);
+	}
+
+	switch (meta->op) {
+	case VM_SNAPSHOT_SAVE:
+		memcpy(cursor->buf, (uint8_t *) data, data_size);
+		break;
+	case VM_SNAPSHOT_RESTORE:
+		memcpy((uint8_t *) data, cursor->buf, data_size);
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	cursor->buf += data_size;
+	cursor->buf_rem -= data_size;
+
+	return (0);
+}
+
+/*
+ * Return how many bytes of meta's buffer have been consumed, i.e.
+ * buf_size - buf_rem.  If buf_rem is larger than buf_size the buffer is
+ * reported as invalid on stderr and 0 is returned.
+ */
+size_t
+vm_get_snapshot_size(struct vm_snapshot_meta *meta)
+{
+	struct vm_snapshot_buffer *cursor = &meta->buffer;
+
+	if (cursor->buf_rem > cursor->buf_size) {
+		fprintf(stderr, "%s: Invalid buffer: size = %zu, rem = %zu\r\n",
+		    __func__, cursor->buf_size, cursor->buf_rem);
+		return (0);
+	}
+
+	return (cursor->buf_size - cursor->buf_rem);
+}
+
+/*
+ * Save or restore a host pointer that refers to guest memory by storing it
+ * in the snapshot as a guest physical address.
+ *
+ * addrp        - the host pointer to save, or the location to restore into
+ * len          - length handed to paddr_guest2host() on restore
+ * restore_null - when true, a NULL pointer on save, and the value
+ *                (vm_paddr_t)-1 on restore, are not treated as errors
+ * meta         - snapshot state; meta->op selects save or restore
+ *
+ * Returns EFAULT when the address cannot be translated and that is not
+ * permitted by restore_null, and EINVAL for an unknown operation.
+ * Otherwise 'ret' is whatever SNAPSHOT_VAR_OR_LEAVE stored in it
+ * (NOTE(review): presumably 0 on success and a jump to 'done' on failure;
+ * the macro is defined outside this file section).
+ */
+int
+vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null,
+ struct vm_snapshot_meta *meta)
+{
+ int ret;
+ vm_paddr_t gaddr;
+
+ if (meta->op == VM_SNAPSHOT_SAVE) {
+ gaddr = paddr_host2guest(meta->ctx, *addrp);
+ if (gaddr == (vm_paddr_t) -1) {
+ /* Untranslatable: tolerated only for NULL with restore_null set. */
+ if (!restore_null ||
+ (restore_null && (*addrp != NULL))) {
+ ret = EFAULT;
+ goto done;
+ }
+ }
+
+ SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done);
+ } else if (meta->op == VM_SNAPSHOT_RESTORE) {
+ SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done);
+ if (gaddr == (vm_paddr_t) -1) {
+ if (!restore_null) {
+ ret = EFAULT;
+ goto done;
+ }
+ }
+
+ /*
+ * Also reached with gaddr == -1 when restore_null is set; the
+ * result then depends on what paddr_guest2host() returns for it.
+ */
+ *addrp = paddr_guest2host(meta->ctx, gaddr, len);
+ } else {
+ ret = EINVAL;
+ }
+
+done:
+ return (ret);
+}
+
+/*
+ * Like vm_snapshot_buf(), except that on restore the saved bytes are
+ * compared with 'data' instead of being copied over it.
+ *
+ * On VM_SNAPSHOT_SAVE data_size bytes are copied from 'data' into the
+ * buffer and 0 is returned.  On VM_SNAPSHOT_RESTORE the memcmp() result of
+ * 'data' against the buffer is returned (0 when identical).  In both cases
+ * the buffer position advances by data_size.
+ *
+ * Returns E2BIG, without advancing, when fewer than data_size bytes remain
+ * and EINVAL, without advancing, for any other operation.
+ */
+int
+vm_snapshot_buf_cmp(volatile void *data, size_t data_size,
+    struct vm_snapshot_meta *meta)
+{
+	struct vm_snapshot_buffer *cursor = &meta->buffer;
+	int result;
+
+	if (data_size > cursor->buf_rem) {
+		fprintf(stderr, "%s: buffer too small\r\n", __func__);
+		return (E2BIG);
+	}
+
+	switch (meta->op) {
+	case VM_SNAPSHOT_SAVE:
+		result = 0;
+		memcpy(cursor->buf, (uint8_t *) data, data_size);
+		break;
+	case VM_SNAPSHOT_RESTORE:
+		result = memcmp((uint8_t *) data, cursor->buf, data_size);
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	cursor->buf += data_size;
+	cursor->buf_rem -= data_size;
+
+	return (result);
+}
diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h
new file mode 100644
index 000000000000..f9ea3d573089
--- /dev/null
+++ b/usr.sbin/bhyve/snapshot.h
@@ -0,0 +1,105 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Flavius Anton
+ * Copyright (c) 2016 Mihai Tiganus
+ * Copyright (c) 2016-2019 Mihai Carabas
+ * Copyright (c) 2017-2019 Darius Mihai
+ * Copyright (c) 2017-2019 Elena Mihailescu
+ * Copyright (c) 2018-2019 Sergiu Weisz
+ * All rights reserved.
+ * The bhyve-snapshot feature was developed under sponsorships
+ * from Matthew Grooms.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _BHYVE_SNAPSHOT_
+#define _BHYVE_SNAPSHOT_
+
+#include <machine/vmm_snapshot.h>
+#include <libxo/xo.h>
+#include <ucl.h>
+
+struct vmctx;
+
+/*
+ * State carried through a restore operation.
+ * NOTE(review): field roles below are inferred from names and from the
+ * users visible in snapshot.c; confirm against load_restore_file().
+ */
+struct restore_state {
+ int kdata_fd; /* presumably the descriptor of the ".kern" data file */
+ int vmmem_fd; /* descriptor guest memory is read back from */
+
+ void *kdata_map; /* presumably a mapping of the kernel/device data */
+ size_t kdata_len; /* presumably the length of kdata_map */
+
+ size_t vmmem_len; /* expected size of guest memory, in bytes */
+
+ struct ucl_parser *meta_parser; /* presumably the parser owning meta_root_obj */
+ ucl_object_t *meta_root_obj; /* root object of the parsed metadata */
+};
+
+/* Argument handed to checkpoint_thread(). */
+struct checkpoint_thread_info {
+ struct vmctx *ctx; /* VM context passed on to get_checkpoint_msg() */
+ int socket_fd; /* listening socket on which connections are accepted */
+};
+
+/*
+ * Device callback types.  The snapshot callback receives the snapshot
+ * state; the pause and resume callbacks receive the VM context and the
+ * device name.  A non-zero return value is treated as failure by the
+ * callers in snapshot.c.
+ */
+typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *);
+typedef int (*vm_pause_dev_cb) (struct vmctx *, const char *);
+typedef int (*vm_resume_dev_cb) (struct vmctx *, const char *);
+
+/*
+ * Describes one userspace device model that takes part in save/restore.
+ * pause_cb and resume_cb may be NULL, in which case the device is skipped
+ * when pausing or resuming.
+ */
+struct vm_snapshot_dev_info {
+ const char *dev_name; /* device name */
+ vm_snapshot_dev_cb snapshot_cb; /* callback for device snapshot */
+ vm_pause_dev_cb pause_cb; /* callback for device pause */
+ vm_resume_dev_cb resume_cb; /* callback for device resume */
+};
+
+/* Describes one kernel structure that is saved and restored. */
+struct vm_snapshot_kern_info {
+ const char *struct_name; /* kernel structure name*/
+ enum snapshot_req req; /* request type */
+};
+
+void destroy_restore_state(struct restore_state *rstate);
+
+const char *lookup_vmname(struct restore_state *rstate);
+int lookup_memflags(struct restore_state *rstate);
+size_t lookup_memsize(struct restore_state *rstate);
+int lookup_guest_ncpus(struct restore_state *rstate);
+
+void checkpoint_cpu_add(int vcpu);
+void checkpoint_cpu_resume(int vcpu);
+void checkpoint_cpu_suspend(int vcpu);
+
+int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate);
+int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate);
+
+int vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate);
+int vm_pause_user_devs(struct vmctx *ctx);
+int vm_resume_user_devs(struct vmctx *ctx);
+
+int get_checkpoint_msg(int conn_fd, struct vmctx *ctx);
+void *checkpoint_thread(void *param);
+int init_checkpoint_thread(struct vmctx *ctx);
+
+int load_restore_file(const char *filename, struct restore_state *rstate);
+
+#endif
diff --git a/usr.sbin/bhyve/uart_emul.c b/usr.sbin/bhyve/uart_emul.c
index 930344a52935..a89974590a1f 100644
--- a/usr.sbin/bhyve/uart_emul.c
+++ b/usr.sbin/bhyve/uart_emul.c
@@ -39,6 +39,8 @@ __FBSDID("$FreeBSD$");
#include <capsicum_helpers.h>
#endif
+#include <machine/vmm_snapshot.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
@@ -719,3 +721,35 @@ uart_set_backend(struct uart_softc *sc, const char *opts)
return (retval);
}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the UART register state and receive FIFO through meta.
+ * The order of the fields below defines the layout of the snapshot data.
+ * NOTE(review): the SNAPSHOT_*_OR_LEAVE macros are defined elsewhere;
+ * presumably they set 'ret' and jump to 'done' on failure.
+ */
+int
+uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->data, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->ier, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->lcr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->mcr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->lsr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->msr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->fcr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->scr, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->dll, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->dlh, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.rindex, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.windex, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.num, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.size, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(sc->rxfifo.buf, sizeof(sc->rxfifo.buf),
+ meta, ret, done);
+
+ /* Not part of the snapshot data: set once all fields were processed. */
+ sc->thre_int_pending = 1;
+
+done:
+ return (ret);
+}
+#endif
diff --git a/usr.sbin/bhyve/uart_emul.h b/usr.sbin/bhyve/uart_emul.h
index a87202df1f96..5a53294da89e 100644
--- a/usr.sbin/bhyve/uart_emul.h
+++ b/usr.sbin/bhyve/uart_emul.h
@@ -31,10 +31,10 @@
#ifndef _UART_EMUL_H_
#define _UART_EMUL_H_
-
#define UART_IO_BAR_SIZE 8
struct uart_softc;
+struct vm_snapshot_meta;
typedef void (*uart_intr_func_t)(void *arg);
struct uart_softc *uart_init(uart_intr_func_t intr_assert,
@@ -44,4 +44,7 @@ int uart_legacy_alloc(int unit, int *ioaddr, int *irq);
uint8_t uart_read(struct uart_softc *sc, int offset);
void uart_write(struct uart_softc *sc, int offset, uint8_t value);
int uart_set_backend(struct uart_softc *sc, const char *opt);
+#ifdef BHYVE_SNAPSHOT
+int uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta);
+#endif
#endif
diff --git a/usr.sbin/bhyve/usb_emul.h b/usr.sbin/bhyve/usb_emul.h
index c52411dd0650..d6e1e616cd82 100644
--- a/usr.sbin/bhyve/usb_emul.h
+++ b/usr.sbin/bhyve/usb_emul.h
@@ -41,10 +41,10 @@
#define USB_XFER_IN 1
-
struct usb_hci;
struct usb_device_request;
struct usb_data_xfer;
+struct vm_snapshot_meta;
/* Device emulation handlers */
struct usb_devemu {
@@ -62,6 +62,7 @@ struct usb_devemu {
int (*ue_reset)(void *sc);
int (*ue_remove)(void *sc);
int (*ue_stop)(void *sc);
+ int (*ue_snapshot)(void *scarg, struct vm_snapshot_meta *meta);
};
#define USB_EMUL_SET(x) DATA_SET(usb_emu_set, x);
@@ -148,7 +149,6 @@ enum USB_ERRCODE {
pthread_mutex_unlock(&((x)->mtx)); \
} while (0)
-
struct usb_devemu *usb_emu_finddev(char *name);
struct usb_data_xfer_block *usb_data_xfer_append(struct usb_data_xfer *xfer,
diff --git a/usr.sbin/bhyve/usb_mouse.c b/usr.sbin/bhyve/usb_mouse.c
index da3800c11fc2..5398da818c7f 100644
--- a/usr.sbin/bhyve/usb_mouse.c
+++ b/usr.sbin/bhyve/usb_mouse.c
@@ -31,6 +31,8 @@ __FBSDID("$FreeBSD$");
#include <sys/time.h>
+#include <machine/vmm_snapshot.h>
+
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
@@ -788,6 +790,29 @@ umouse_stop(void *scarg)
return (0);
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Snapshot handler for the USB tablet emulation.
+ *
+ * Hands each piece of softc state to SNAPSHOT_VAR_OR_LEAVE in a fixed
+ * order.  The macro receives the status variable and the exit label, so
+ * presumably a failure stores an error in 'error' and jumps to 'out'.
+ * Returns the status left behind by the last macro that ran.
+ */
+static int
+umouse_snapshot(void *scarg, struct vm_snapshot_meta *meta)
+{
+	struct umouse_softc *mouse = scarg;
+	int error;
+
+	/* Current report and HID class state. */
+	SNAPSHOT_VAR_OR_LEAVE(mouse->um_report, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(mouse->newdata, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(mouse->hid.idle, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(mouse->hid.protocol, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(mouse->hid.feature, meta, error, out);
+
+	/* Polling flag and the timestamp of the previous event. */
+	SNAPSHOT_VAR_OR_LEAVE(mouse->polling, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(mouse->prev_evt.tv_sec, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(mouse->prev_evt.tv_usec, meta, error, out);
+
+out:
+	return (error);
+}
+#endif
struct usb_devemu ue_mouse = {
.ue_emu = "tablet",
@@ -798,6 +823,9 @@ struct usb_devemu ue_mouse = {
.ue_data = umouse_data_handler,
.ue_reset = umouse_reset,
.ue_remove = umouse_remove,
- .ue_stop = umouse_stop
+ .ue_stop = umouse_stop,
+#ifdef BHYVE_SNAPSHOT
+ .ue_snapshot = umouse_snapshot,
+#endif
};
USB_EMUL_SET(ue_mouse);
diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c
index d899a5779570..f3deb72b081c 100644
--- a/usr.sbin/bhyve/virtio.c
+++ b/usr.sbin/bhyve/virtio.c
@@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
#include <sys/uio.h>
#include <machine/atomic.h>
+#include <machine/vmm_snapshot.h>
#include <stdio.h>
#include <stdint.h>
@@ -806,3 +807,150 @@ done:
if (vs->vs_mtx)
pthread_mutex_unlock(vs->vs_mtx);
}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Pause a virtio PCI device.
+ *
+ * Looks up the virtio softc stored in pi->pi_arg and invokes the device
+ * model's vc_pause callback on the device softc.  The callback must be
+ * provided (asserted).  'ctx' is not used.  Always returns 0.
+ */
+int
+vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi)
+{
+	struct virtio_softc *vs;
+	struct virtio_consts *vc;
+
+	vs = pi->pi_arg;
+	vc = vs->vs_vc;
+
+	assert(vc->vc_pause != NULL);
+	(*vc->vc_pause)(DEV_SOFTC(vs));
+
+	return (0);
+}
+
+/*
+ * Resume a previously paused virtio PCI device.
+ *
+ * Looks up the virtio softc stored in pi->pi_arg and invokes the device
+ * model's vc_resume callback on the device softc.  The callback must be
+ * provided (asserted).  'ctx' is not used.  Always returns 0.
+ */
+int
+vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi)
+{
+	struct virtio_softc *vs;
+	struct virtio_consts *vc;
+
+	vs = pi->pi_arg;
+	vc = vs->vs_vc;
+
+	assert(vc->vc_resume != NULL);
+	(*vc->vc_resume)(DEV_SOFTC(vs));
+
+	return (0);
+}
+
+/*
+ * Snapshot the generic virtio softc fields.
+ *
+ * Each field is handed to SNAPSHOT_VAR_OR_LEAVE in a fixed order; the
+ * macro is given the status variable and the exit label, so presumably
+ * a failure jumps straight to 'out'.  Returns the resulting status.
+ */
+static int
+vi_pci_snapshot_softc(struct virtio_softc *vs, struct vm_snapshot_meta *meta)
+{
+	int error;
+
+	SNAPSHOT_VAR_OR_LEAVE(vs->vs_flags, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(vs->vs_negotiated_caps, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(vs->vs_curq, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(vs->vs_status, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(vs->vs_isr, meta, error, out);
+	SNAPSHOT_VAR_OR_LEAVE(vs->vs_msix_cfg_idx, meta, error, out);
+
+out:
+	return (error);
+}
+
+/*
+ * Snapshot the per-device-model virtio constants.
+ *
+ * These go through SNAPSHOT_VAR_CMP_OR_LEAVE rather than the plain
+ * variant -- presumably so that a restore verifies the values against
+ * the running configuration instead of overwriting them (confirm
+ * against the macro definition).  Returns the resulting status.
+ */
+static int
+vi_pci_snapshot_consts(struct virtio_consts *vc, struct vm_snapshot_meta *meta)
+{
+	int error;
+
+	SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_nvq, meta, error, out);
+	SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_cfgsize, meta, error, out);
+	SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_hv_caps, meta, error, out);
+
+out:
+	return (error);
+}
+
+/*
+ * Snapshot the state of every virtqueue owned by a virtio device.
+ *
+ * For each of the vc_nvq queues this processes the queue geometry, the
+ * ring indices and flags, the guest page frame number, the three ring
+ * pointers (through SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE) and finally the
+ * ring memory itself.  The order of the macro invocations defines the
+ * layout of the saved data and must not be changed.
+ *
+ * Returns 0 on success (including for a device with no queues), or the
+ * status left by the snapshot macro that bailed out to 'done'.
+ */
+static int
+vi_pci_snapshot_queues(struct virtio_softc *vs, struct vm_snapshot_meta *meta)
+{
+	int i;
+	int ret;
+	struct virtio_consts *vc;
+	struct vqueue_info *vq;
+	uint64_t addr_size;
+
+	vc = vs->vs_vc;
+
+	/*
+	 * A device may have zero queues, in which case the loop body never
+	 * assigns 'ret'; start from success so the return value is defined.
+	 */
+	ret = 0;
+
+	/* Save virtio queue info */
+	for (i = 0; i < vc->vc_nvq; i++) {
+		vq = &vs->vs_queues[i];
+
+		SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_qsize, meta, ret, done);
+		SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_num, meta, ret, done);
+
+		SNAPSHOT_VAR_OR_LEAVE(vq->vq_flags, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vq->vq_last_avail, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vq->vq_next_used, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vq->vq_save_used, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vq->vq_msix_idx, meta, ret, done);
+
+		SNAPSHOT_VAR_OR_LEAVE(vq->vq_pfn, meta, ret, done);
+
+		/* Descriptor table: one struct virtio_desc per queue entry. */
+		addr_size = vq->vq_qsize * sizeof(struct virtio_desc);
+		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_desc, addr_size,
+		    false, meta, ret, done);
+
+		/* Available ring: two header words, the ring, one trailer. */
+		addr_size = (2 + vq->vq_qsize + 1) * sizeof(uint16_t);
+		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_avail, addr_size,
+		    false, meta, ret, done);
+
+		/*
+		 * NOTE(review): this counts two uint16_t per used-ring entry;
+		 * confirm that matches the size of a used element.
+		 */
+		addr_size = (2 + 2 * vq->vq_qsize + 1) * sizeof(uint16_t);
+		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_used, addr_size,
+		    false, meta, ret, done);
+
+		/* The ring contents, starting at the descriptor table. */
+		SNAPSHOT_BUF_OR_LEAVE(vq->vq_desc, vring_size(vq->vq_qsize),
+		    meta, ret, done);
+	}
+
+done:
+	return (ret);
+}
+
+/*
+ * Snapshot entry point for virtio PCI devices.
+ *
+ * Takes the PCI device instance from meta->dev_data and processes, in
+ * order: the generic virtio softc, the device-model constants, the
+ * virtqueues and, when the device model supplies a vc_snapshot
+ * callback, the device-specific softc.  Stops at the first step that
+ * reports a non-zero status and returns that status; returns 0 when
+ * every step succeeds.
+ */
+int
+vi_pci_snapshot(struct vm_snapshot_meta *meta)
+{
+	struct pci_devinst *pdi = meta->dev_data;
+	struct virtio_softc *softc = pdi->pi_arg;
+	struct virtio_consts *consts = softc->vs_vc;
+	int error;
+
+	/* Generic virtio softc. */
+	error = vi_pci_snapshot_softc(softc, meta);
+	if (error != 0)
+		return (error);
+
+	/* Device-model constants. */
+	error = vi_pci_snapshot_consts(consts, meta);
+	if (error != 0)
+		return (error);
+
+	/* Per-queue state. */
+	error = vi_pci_snapshot_queues(softc, meta);
+	if (error != 0)
+		return (error);
+
+	/* Device-specific softc, only when the model asks for it. */
+	if (consts->vc_snapshot == NULL)
+		return (0);
+
+	return ((*consts->vc_snapshot)(DEV_SOFTC(softc), meta));
+}
+#endif
diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h
index ab95d9d213e3..e9432e012b27 100644
--- a/usr.sbin/bhyve/virtio.h
+++ b/usr.sbin/bhyve/virtio.h
@@ -287,6 +287,7 @@ vring_size(u_int qsz)
struct vmctx;
struct pci_devinst;
struct vqueue_info;
+struct vm_snapshot_meta;
/*
* A virtual device, with some number (possibly 0) of virtual
@@ -361,6 +362,10 @@ struct virtio_consts {
void (*vc_apply_features)(void *, uint64_t);
/* called to apply negotiated features */
uint64_t vc_hv_caps; /* hypervisor-provided capabilities */
+ void (*vc_pause)(void *); /* called to pause device activity */
+ void (*vc_resume)(void *); /* called to resume device activity */
+ int (*vc_snapshot)(void *, struct vm_snapshot_meta *);
+ /* called to save / restore device state */
};
/*
@@ -491,4 +496,9 @@ uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size);
void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value);
+#ifdef BHYVE_SNAPSHOT
+int vi_pci_snapshot(struct vm_snapshot_meta *meta);
+int vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi);
+int vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi);
+#endif
#endif /* _VIRTIO_H_ */
diff --git a/usr.sbin/bhyvectl/Makefile b/usr.sbin/bhyvectl/Makefile
index 0ffca5675cb9..58eaf49dae3a 100644
--- a/usr.sbin/bhyvectl/Makefile
+++ b/usr.sbin/bhyvectl/Makefile
@@ -2,6 +2,8 @@
# $FreeBSD$
#
+.include <src.opts.mk>
+
PROG= bhyvectl
SRCS= bhyvectl.c
PACKAGE= bhyve
@@ -14,4 +16,8 @@ WARNS?= 3
CFLAGS+= -I${SRCTOP}/sys/amd64/vmm
+.if ${MK_BHYVE_SNAPSHOT} != "no"
+CFLAGS+= -DBHYVE_SNAPSHOT
+.endif
+
.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyvectl/bhyvectl.8 b/usr.sbin/bhyvectl/bhyvectl.8
index 035f9f6c7586..6adf87ca4537 100644
--- a/usr.sbin/bhyvectl/bhyvectl.8
+++ b/usr.sbin/bhyvectl/bhyvectl.8
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd November 13, 2016
+.Dd May 4, 2020
.Dt BHYVECTL 8
.Os
.Sh NAME
@@ -39,6 +39,8 @@
.Op Fl -inject-nmi
.Op Fl -force-reset
.Op Fl -force-poweroff
+.Op Fl -checkpoint= Ns Ar <filename>
+.Op Fl -suspend= Ns Ar <filename>
.Sh DESCRIPTION
The
.Nm
@@ -72,6 +74,17 @@ Inject a non-maskable interrupt (NMI) into the VM.
Force the VM to reset.
.It Fl -force-poweroff
Force the VM to power off.
+.It Fl -checkpoint= Ns Ar <filename>
+Save a snapshot of a virtual machine.
+The guest memory contents are saved in the file given in
+.Ar <filename> .
+The guest device and vCPU state are saved in the file
+.Ar <filename>.kern .
+.It Fl -suspend= Ns Ar <filename>
+Save a snapshot of a virtual machine similar to
+.Fl -checkpoint .
+The virtual machine will terminate after the snapshot has been
+saved.
.El
.Sh EXIT STATUS
.Ex -std
@@ -79,6 +92,10 @@ Force the VM to power off.
Destroy the VM called fbsd10:
.Pp
.Dl "bhyvectl --vm=fbsd10 --destroy"
+.Sh COMPATIBILITY
+The snapshot file format is not yet stable and is subject to future changes.
+Backwards compatibility support for the current snapshot file format is not
+guaranteed when future changes are made.
.Sh SEE ALSO
.Xr bhyve 8 ,
.Xr bhyveload 8
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
index 8274e6eafccb..d2c4a1488fe8 100644
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -57,6 +57,9 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm_dev.h>
#include <vmmapi.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
#include "amd/vmcb.h"
#include "intel/vmcs.h"
@@ -67,6 +70,9 @@ __FBSDID("$FreeBSD$");
#define NO_ARG no_argument
#define OPT_ARG optional_argument
+#define CHECKPOINT_RUN_DIR "/var/run/bhyve/checkpoint"
+#define MAX_VMNAME 100
+
static const char *progname;
static void
@@ -78,6 +84,10 @@ usage(bool cpu_intel)
" [--cpu=<vcpu_number>]\n"
" [--create]\n"
" [--destroy]\n"
+#ifdef BHYVE_SNAPSHOT
+ " [--checkpoint=<filename>]\n"
+ " [--suspend=<filename>]\n"
+#endif
" [--get-all]\n"
" [--get-stats]\n"
" [--set-desc-ds]\n"
@@ -287,6 +297,10 @@ enum x2apic_state x2apic_state;
static int unassign_pptdev, bus, slot, func;
static int run;
static int get_cpu_topology;
+#ifdef BHYVE_SNAPSHOT
+static int vm_checkpoint_opt;
+static int vm_suspend_opt;
+#endif
/*
* VMCB specific.
@@ -591,6 +605,10 @@ enum {
SET_RTC_TIME,
SET_RTC_NVRAM,
RTC_NVRAM_OFFSET,
+#ifdef BHYVE_SNAPSHOT
+ SET_CHECKPOINT_FILE,
+ SET_SUSPEND_FILE,
+#endif
};
static void
@@ -1461,6 +1479,10 @@ setup_options(bool cpu_intel)
{ "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 },
{ "get-intinfo", NO_ARG, &get_intinfo, 1 },
{ "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 },
+#ifdef BHYVE_SNAPSHOT
+ { "checkpoint", REQ_ARG, 0, SET_CHECKPOINT_FILE},
+ { "suspend", REQ_ARG, 0, SET_SUSPEND_FILE},
+#endif
};
const struct option intel_opts[] = {
@@ -1678,6 +1700,82 @@ show_memseg(struct vmctx *ctx)
}
}
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Deliver a checkpoint request to the bhyve process running the VM.
+ *
+ * Connects to the UNIX domain stream socket named
+ * CHECKPOINT_RUN_DIR/<vmname> and writes the whole 'op' structure to it.
+ *
+ * Returns 0 on success.  Returns -1 if the socket cannot be created,
+ * the socket path does not fit in sun_path, the connection fails or the
+ * request cannot be sent; if the VM name lookup fails, the error from
+ * vm_get_name() is returned instead.  A diagnostic is printed on every
+ * failure path.
+ */
+static int
+send_checkpoint_op_req(struct vmctx *ctx, struct checkpoint_op *op)
+{
+	struct sockaddr_un addr;
+	int socket_fd, len, len_sent, total_sent, path_len;
+	int err = 0;
+	char vmname_buf[MAX_VMNAME];
+
+	socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (socket_fd < 0) {
+		perror("Error creating bhyvectl socket");
+		err = -1;
+		goto done;
+	}
+
+	memset(&addr, 0, sizeof(struct sockaddr_un));
+	addr.sun_family = AF_UNIX;
+
+	err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1);
+	if (err != 0) {
+		perror("Failed to get VM name");
+		goto done;
+	}
+
+	/*
+	 * A truncated path would name a different (or nonexistent) socket,
+	 * so refuse to continue rather than connect to the wrong place.
+	 */
+	path_len = snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s",
+	    CHECKPOINT_RUN_DIR, vmname_buf);
+	if (path_len < 0 || (size_t)path_len >= sizeof(addr.sun_path)) {
+		fprintf(stderr, "VM socket path is too long\n");
+		err = -1;
+		goto done;
+	}
+
+	if (connect(socket_fd, (struct sockaddr *)&addr,
+	    sizeof(struct sockaddr_un)) != 0) {
+		perror("Connect to VM socket failed");
+		err = -1;
+		goto done;
+	}
+
+	/* send() may accept fewer bytes than requested; keep going. */
+	len = sizeof(*op);
+	total_sent = 0;
+	while (total_sent < len) {
+		len_sent = send(socket_fd, (char *)op + total_sent,
+		    len - total_sent, 0);
+		if (len_sent < 0) {
+			perror("Failed to send checkpoint operation request");
+			err = -1;
+			break;
+		}
+		if (len_sent == 0)
+			break;
+		total_sent += len_sent;
+	}
+
+done:
+	/* Descriptor 0 is a valid socket, so test against -1, not 0. */
+	if (socket_fd >= 0)
+		close(socket_fd);
+	return (err);
+}
+
+/*
+ * Ask the running VM to save a snapshot to 'checkpoint_file'.
+ *
+ * Builds a START_CHECKPOINT request and hands it to
+ * send_checkpoint_op_req().  Returns that function's result, or -1 if
+ * the filename does not fit in the request.
+ */
+static int
+send_start_checkpoint(struct vmctx *ctx, const char *checkpoint_file)
+{
+	struct checkpoint_op op;
+
+	/*
+	 * Silently truncating the name would make the VM write the
+	 * snapshot somewhere other than where the user asked for it.
+	 */
+	if (strlen(checkpoint_file) >= MAX_SNAPSHOT_VMNAME) {
+		fprintf(stderr, "Snapshot filename is too long: %s\n",
+		    checkpoint_file);
+		return (-1);
+	}
+
+	/* The whole structure goes over the socket; leave no stale bytes. */
+	memset(&op, 0, sizeof(op));
+	op.op = START_CHECKPOINT;
+	strncpy(op.snapshot_filename, checkpoint_file, MAX_SNAPSHOT_VMNAME);
+	op.snapshot_filename[MAX_SNAPSHOT_VMNAME - 1] = 0;
+
+	return (send_checkpoint_op_req(ctx, &op));
+}
+
+/*
+ * Ask the running VM to save a snapshot to 'suspend_file' as a suspend
+ * request.
+ *
+ * Builds a START_SUSPEND request and hands it to
+ * send_checkpoint_op_req().  Returns that function's result, or -1 if
+ * the filename does not fit in the request.
+ */
+static int
+send_start_suspend(struct vmctx *ctx, const char *suspend_file)
+{
+	struct checkpoint_op op;
+
+	/*
+	 * Silently truncating the name would make the VM write the
+	 * snapshot somewhere other than where the user asked for it.
+	 */
+	if (strlen(suspend_file) >= MAX_SNAPSHOT_VMNAME) {
+		fprintf(stderr, "Snapshot filename is too long: %s\n",
+		    suspend_file);
+		return (-1);
+	}
+
+	/* The whole structure goes over the socket; leave no stale bytes. */
+	memset(&op, 0, sizeof(op));
+	op.op = START_SUSPEND;
+	strncpy(op.snapshot_filename, suspend_file, MAX_SNAPSHOT_VMNAME);
+	op.snapshot_filename[MAX_SNAPSHOT_VMNAME - 1] = 0;
+
+	return (send_checkpoint_op_req(ctx, &op));
+}
+#endif
+
int
main(int argc, char *argv[])
{
@@ -1694,6 +1792,9 @@ main(int argc, char *argv[])
uint64_t cs, ds, es, fs, gs, ss, tr, ldtr;
struct tm tm;
struct option *opts;
+#ifdef BHYVE_SNAPSHOT
+ char *checkpoint_file, *suspend_file;
+#endif
cpu_intel = cpu_vendor_intel();
opts = setup_options(cpu_intel);
@@ -1860,6 +1961,16 @@ main(int argc, char *argv[])
case ASSERT_LAPIC_LVT:
assert_lapic_lvt = atoi(optarg);
break;
+#ifdef BHYVE_SNAPSHOT
+ case SET_CHECKPOINT_FILE:
+ vm_checkpoint_opt = 1;
+ checkpoint_file = optarg;
+ break;
+ case SET_SUSPEND_FILE:
+ vm_suspend_opt = 1;
+ suspend_file = optarg;
+ break;
+#endif
default:
usage(cpu_intel);
}
@@ -2345,6 +2456,14 @@ main(int argc, char *argv[])
if (!error && destroy)
vm_destroy(ctx);
+#ifdef BHYVE_SNAPSHOT
+ if (!error && vm_checkpoint_opt)
+ error = send_start_checkpoint(ctx, checkpoint_file);
+
+ if (!error && vm_suspend_opt)
+ error = send_start_suspend(ctx, suspend_file);
+#endif
+
free (opts);
exit(error);
}