Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/acpica/acpi_wakeup.c     |   4
-rw-r--r--  sys/amd64/amd64/apic_vector.S      |  11
-rw-r--r--  sys/amd64/amd64/elf_machdep.c      |  14
-rw-r--r--  sys/amd64/amd64/genassym.c         |  12
-rw-r--r--  sys/amd64/amd64/kexec_support.c    | 300
-rw-r--r--  sys/amd64/amd64/kexec_tramp.S      |  91
-rw-r--r--  sys/amd64/amd64/machdep.c          |  33
-rw-r--r--  sys/amd64/amd64/mp_machdep.c       |  13
-rw-r--r--  sys/amd64/amd64/support.S          |  16
-rw-r--r--  sys/amd64/amd64/trap.c             |   4
-rw-r--r--  sys/amd64/conf/GENERIC             |   6
-rw-r--r--  sys/amd64/conf/MINIMAL             |   1
-rw-r--r--  sys/amd64/include/cpufunc.h        |  11
-rw-r--r--  sys/amd64/include/kexec.h          |  41
-rw-r--r--  sys/amd64/include/md_var.h         |   4
-rw-r--r--  sys/amd64/include/param.h          |  11
-rw-r--r--  sys/amd64/include/smp.h            |   1
-rw-r--r--  sys/amd64/include/vmm.h            | 103
-rw-r--r--  sys/amd64/linux/linux_sysvec.c     |  12
-rw-r--r--  sys/amd64/linux32/linux32_sysvec.c |  12
-rw-r--r--  sys/amd64/pt/pt.c                  | 221
-rw-r--r--  sys/amd64/sgx/sgx_linux.c          |  11
-rw-r--r--  sys/amd64/vmm/intel/vmx_support.S  |   8
-rw-r--r--  sys/amd64/vmm/vmm.c                | 227
-rw-r--r--  sys/amd64/vmm/vmm_dev_machdep.c    | 256
25 files changed, 1081 insertions, 342 deletions
diff --git a/sys/amd64/acpica/acpi_wakeup.c b/sys/amd64/acpica/acpi_wakeup.c
index 99565fbb69ca..8cada2f4f911 100644
--- a/sys/amd64/acpica/acpi_wakeup.c
+++ b/sys/amd64/acpica/acpi_wakeup.c
@@ -74,7 +74,7 @@ extern int acpi_susp_bounce;
extern struct susppcb **susppcbs;
static cpuset_t suspcpus;
-static void acpi_stop_beep(void *);
+static void acpi_stop_beep(void *, enum power_stype);
static int acpi_wakeup_ap(struct acpi_softc *, int);
static void acpi_wakeup_cpus(struct acpi_softc *);
@@ -88,7 +88,7 @@ static void acpi_wakeup_cpus(struct acpi_softc *);
} while (0)
static void
-acpi_stop_beep(void *arg)
+acpi_stop_beep(void *arg, enum power_stype stype)
{
if (acpi_resume_beep != 0)
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index e98bae9eb6c5..8691387a5a8e 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -204,6 +204,17 @@ IDTVEC(spuriousint)
jmp doreti
/*
+ * Executed by a CPU when it receives an IPI_OFF from another CPU.
+ * Should never return.
+ */
+ INTR_HANDLER cpuoff
+ KMSAN_ENTER
+ call cpuoff_handler
+ call as_lapic_eoi
+ KMSAN_LEAVE
+ jmp doreti
+
+/*
* Executed by a CPU when it receives an IPI_SWI.
*/
INTR_HANDLER ipi_swi
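
[cpuoff_handler itself is MI code and is not part of this file; a hedged sketch of what such a handler must do to satisfy the "never returns" contract above — the body here is an assumption, only the name comes from the vector:]

    void
    cpuoff_handler(void)
    {
            /* Publish that this CPU is gone, then park it forever. */
            CPU_CLR_ATOMIC(PCPU_GET(cpuid), &all_cpus); /* assumed bookkeeping */
            disable_intr();
            for (;;)
                    halt();
    }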
diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c
index 6cc2d58bbbcc..933f1ac0051f 100644
--- a/sys/amd64/amd64/elf_machdep.c
+++ b/sys/amd64/amd64/elf_machdep.c
@@ -179,7 +179,7 @@ freebsd_brand_info_la57_img_compat(const struct image_params *imgp,
return (!prefer_uva_la48);
}
-static Elf64_Brandinfo freebsd_brand_info_la48 = {
+static const Elf64_Brandinfo freebsd_brand_info_la48 = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
.compat_3_brand = "FreeBSD",
@@ -190,7 +190,7 @@ static Elf64_Brandinfo freebsd_brand_info_la48 = {
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE,
};
-static Elf64_Brandinfo freebsd_brand_info_la57 = {
+static const Elf64_Brandinfo freebsd_brand_info_la57 = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
.compat_3_brand = "FreeBSD",
@@ -216,7 +216,7 @@ sysinit_register_elf64_brand_entries(void *arg __unused)
SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST,
sysinit_register_elf64_brand_entries, NULL);
-static Elf64_Brandinfo freebsd_brand_oinfo = {
+static const Elf64_Brandinfo freebsd_brand_oinfo = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
.compat_3_brand = "FreeBSD",
@@ -226,11 +226,10 @@ static Elf64_Brandinfo freebsd_brand_oinfo = {
.brand_note = &elf64_freebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
-
-SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY,
+C_SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY,
(sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_oinfo);
-static Elf64_Brandinfo kfreebsd_brand_info = {
+static const Elf64_Brandinfo kfreebsd_brand_info = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
.compat_3_brand = "FreeBSD",
@@ -240,8 +239,7 @@ static Elf64_Brandinfo kfreebsd_brand_info = {
.brand_note = &elf64_kfreebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY
};
-
-SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY,
+C_SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY,
(sysinit_cfunc_t)elf64_insert_brand_entry, &kfreebsd_brand_info);
void
diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c
index eb1b746f5893..2716784ee871 100644
--- a/sys/amd64/amd64/genassym.c
+++ b/sys/amd64/amd64/genassym.c
@@ -57,6 +57,7 @@
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
+#include <sys/kexec.h>
#include <sys/proc.h>
#include <x86/apicreg.h>
#include <machine/cpu.h>
@@ -65,6 +66,7 @@
#include <machine/proc.h>
#include <machine/segments.h>
#include <machine/efi.h>
+#include <machine/kexec.h>
ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
@@ -295,3 +297,13 @@ ASSYM(EC_R13, offsetof(struct efirt_callinfo, ec_r13));
ASSYM(EC_R14, offsetof(struct efirt_callinfo, ec_r14));
ASSYM(EC_R15, offsetof(struct efirt_callinfo, ec_r15));
ASSYM(EC_RFLAGS, offsetof(struct efirt_callinfo, ec_rflags));
+
+/* Kexec */
+ASSYM(KEXEC_ENTRY, offsetof(struct kexec_image, entry));
+ASSYM(KEXEC_SEGMENTS, offsetof(struct kexec_image, segments));
+ASSYM(KEXEC_SEGMENT_MAX, KEXEC_SEGMENT_MAX);
+ASSYM(KEXEC_IMAGE_SIZE, sizeof(struct kexec_image));
+ASSYM(KEXEC_STAGED_SEGMENT_SIZE, sizeof(struct kexec_segment_stage));
+ASSYM(KEXEC_SEGMENT_SIZE, offsetof(struct kexec_segment_stage, size));
+ASSYM(KEXEC_SEGMENT_MAP, offsetof(struct kexec_segment_stage, map_buf));
+ASSYM(KEXEC_SEGMENT_TARGET, offsetof(struct kexec_segment_stage, target));
diff --git a/sys/amd64/amd64/kexec_support.c b/sys/amd64/amd64/kexec_support.c
new file mode 100644
index 000000000000..8189a48e9ae9
--- /dev/null
+++ b/sys/amd64/amd64/kexec_support.c
@@ -0,0 +1,300 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/kexec.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_phys.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_radix.h>
+
+#include <machine/intr_machdep.h>
+#include <machine/kexec.h>
+#include <machine/md_var.h>
+#include <machine/pmap.h>
+#include <x86/apicvar.h>
+
+/*
+ * Idea behind this:
+ *
+ * kexec_load_md():
+ * - Update boot page tables (identity map) to include all pages needed before
+ * disabling MMU.
+ *
+ * kexec_reboot_md():
+ * - Copy pages into target(s)
+ * - Do "other stuff"
+ * - Does not return
+ */
+
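
[For orientation, a hedged sketch of how the MI kexec code is expected to drive the two hooks described above; the MI sequencing is illustrative, only the functions named below exist in this change:]

    /* At kexec load time (MI side): */
    error = kexec_load_md(image);   /* extend identity map, stage page tables */

    /* At reboot time (MI side): */
    cpu_mp_stop();                  /* offline the APs via IPI_OFF */
    kexec_reboot_md(image);         /* copy segments and jump; never returns */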
+/*
+ * do_pte: Create PTE entries (4k pages). If false, create 2MB superpages.
+ * identity: This is for an identity map, treat `start` as a physical address.
+ * Only valid here if do_pte is false.
+ */
+static void
+kexec_generate_page_tables(pml4_entry_t *root, vm_offset_t start,
+ vm_size_t size, bool do_pte, bool identity, struct pctrie_iter *pages)
+{
+ vm_paddr_t mpa;
+ vm_offset_t pg;
+ vm_size_t stride = do_pte ? PAGE_SIZE : NBPDR;
+ vm_page_t m;
+ vm_pindex_t i, j, k, l;
+
+ pg = start & ~(stride - 1);
+ i = pmap_pml4e_index(pg);
+ j = pmap_pdpe_index(pg);
+ k = pmap_pde_index(pg);
+ l = pmap_pte_index(pg);
+ for (; pg < start + size; i++, j = 0, k = 0, l = 0) {
+ /*
+ * Walk linearly, as above, but in one fell swoop, one page at
+ * a time.
+ */
+ if (root[i] == 0) {
+ m = vm_radix_iter_next(pages);
+ mpa = VM_PAGE_TO_PHYS(m);
+ root[i] = mpa | PG_RW | PG_V;
+ }
+ pdp_entry_t *pdp =
+ (pdp_entry_t *)(PHYS_TO_DMAP(root[i] & PG_FRAME));
+ for (; j < NPDPEPG && pg < start + size; j++, k = 0, l = 0) {
+ if (pdp[j] == 0) {
+ m = vm_radix_iter_next(pages);
+ mpa = VM_PAGE_TO_PHYS(m);
+ pdp[j] = mpa | PG_RW | PG_V;
+ }
+ pd_entry_t *pde =
+ (pd_entry_t *)(PHYS_TO_DMAP(pdp[j] & PG_FRAME));
+ for (; k < NPDEPG && pg < start + size; k++, l = 0) {
+ if (pde[k] == 0) {
+ if (!do_pte) {
+ pde[k] =
+ (identity ? pg : pmap_kextract(pg)) |
+ PG_RW | PG_PS | PG_V;
+ pg += NBPDR;
+ continue;
+ }
+ m = vm_radix_iter_next(pages);
+ mpa = VM_PAGE_TO_PHYS(m);
+ pde[k] = mpa | PG_V | PG_RW;
+ } else if ((pde[k] & PG_PS) != 0) {
+ pg += NBPDR;
+ continue;
+ }
+ /* Populate the PTEs. */
+ for (; l < NPTEPG && pg < start + size;
+ l++, pg += PAGE_SIZE) {
+ pt_entry_t *pte =
+ (pt_entry_t *)PHYS_TO_DMAP(pde[pmap_pde_index(pg)] & PG_FRAME);
+ pte[pmap_pte_index(pg)] =
+ pmap_kextract(pg) | PG_RW | PG_V;
+ }
+ }
+ }
+ }
+}
+
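[For reference, the four iterators i/j/k/l above follow the standard amd64 4-level address decomposition: 9 bits and 512 entries per level. A sketch of what the pmap_*_index() helpers compute:]

    #define IDX_PML4(va)    (((va) >> 39) & 0x1ff)  /* 512 GiB per entry */
    #define IDX_PDPE(va)    (((va) >> 30) & 0x1ff)  /*   1 GiB per entry */
    #define IDX_PDE(va)     (((va) >> 21) & 0x1ff)  /*   2 MiB per entry */
    #define IDX_PTE(va)     (((va) >> 12) & 0x1ff)  /*   4 KiB per entry */
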
+void
+kexec_reboot_md(struct kexec_image *image)
+{
+ void (*kexec_do_tramp)(void) = image->md_image;
+
+ intr_disable_all();
+ lapic_disable();
+ kexec_do_reboot_trampoline(VM_PAGE_TO_PHYS(image->first_md_page),
+ kexec_do_tramp);
+
+ for (;;)
+ ;
+}
+
+int
+kexec_load_md(struct kexec_image *image)
+{
+ struct pctrie_iter pct_iter;
+ pml4_entry_t *PT4;
+ pdp_entry_t *PDP_l;
+ pd_entry_t *PD_l0;
+ vm_offset_t va;
+ int i;
+
+ /*
+ * Start building the page table.
+ * First part of the page table is standard for all.
+ */
+ vm_offset_t pa_pdp_l, pa_pd_l0, pa_pd_l1, pa_pd_l2, pa_pd_l3;
+ vm_page_t m;
+
+ if (la57)
+ return (EINVAL);
+
+ vm_radix_iter_init(&pct_iter, &image->map_obj->rtree);
+ /* Working in linear space in the mapped space, `va` is our tracker. */
+ m = vm_radix_iter_lookup(&pct_iter, image->first_md_page->pindex);
+ va = (vm_offset_t)image->map_addr + ptoa(m->pindex);
+ /* We'll find a place for these later */
+ PT4 = (void *)va;
+ va += PAGE_SIZE;
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pdp_l = VM_PAGE_TO_PHYS(m);
+ PDP_l = (void *)va;
+ va += PAGE_SIZE;
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l0 = VM_PAGE_TO_PHYS(m);
+ PD_l0 = (void *)va;
+ va += PAGE_SIZE;
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l1 = VM_PAGE_TO_PHYS(m);
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l2 = VM_PAGE_TO_PHYS(m);
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l3 = VM_PAGE_TO_PHYS(m);
+ m = vm_radix_iter_next(&pct_iter);
+
+ /* 1:1 mapping of lower 4G */
+ PT4[0] = (pml4_entry_t)pa_pdp_l | PG_V | PG_RW;
+ PDP_l[0] = (pdp_entry_t)pa_pd_l0 | PG_V | PG_RW;
+ PDP_l[1] = (pdp_entry_t)pa_pd_l1 | PG_V | PG_RW;
+ PDP_l[2] = (pdp_entry_t)pa_pd_l2 | PG_V | PG_RW;
+ PDP_l[3] = (pdp_entry_t)pa_pd_l3 | PG_V | PG_RW;
+ for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PD_l0 into _l1, etc */
+ PD_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
+ PG_RW | PG_PS;
+ }
+
+ /* Map the target(s) in 2MB chunks. */
+ for (i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+ struct kexec_segment_stage *s = &image->segments[i];
+
+ if (s->size == 0)
+ break;
+ kexec_generate_page_tables(PT4, s->target, s->size, false,
+ true, &pct_iter);
+ }
+ /* Now create the source page tables */
+ kexec_generate_page_tables(PT4, image->map_addr, image->map_size, true,
+ false, &pct_iter);
+ kexec_generate_page_tables(PT4,
+ trunc_page((vm_offset_t)kexec_do_reboot_trampoline),
+ PAGE_SIZE, true, false, &pct_iter);
+ KASSERT(m != NULL, ("kexec_load_md: Missing trampoline page!\n"));
+
+ /* MD control pages start at this next page. */
+ image->md_image = (void *)(image->map_addr + ptoa(m->pindex));
+ bcopy(kexec_do_reboot, image->md_image, kexec_do_reboot_size);
+
+ /* Save the image into the MD page(s) right after the trampoline */
+ bcopy(image, (void *)((vm_offset_t)image->md_image +
+ (vm_offset_t)&kexec_saved_image - (vm_offset_t)&kexec_do_reboot),
+ sizeof(*image));
+
+ return (0);
+}
+
+/*
+ * Required pages:
+ * - L4 (1) (root)
+ * - L3 (PDPE) - 2 (bottom 512GB, bottom 4 used, top range for kernel map)
+ * - L2 (PDP) - 5 (2MB superpage mappings, 1GB each, for bottom 4GB, top 1)
+ * - L1 (PDR) - 1 (kexec trampoline page, first MD page)
+ * - kexec_do_reboot trampoline - 1
+ * - Slop pages for staging (in case it's not aligned nicely) - 3 (worst case)
+ *
+ * Minimum 9 pages for the direct map.
+ */
+int
+kexec_md_pages(struct kexec_segment *seg_in)
+{
+ struct kexec_segment *segs = seg_in;
+ vm_size_t pages = 13; /* Minimum number of starting pages */
+ vm_paddr_t cur_addr = (1UL << 32) - 1; /* Bottom 4G will be identity mapped in full */
+ vm_size_t source_total = 0;
+
+ for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+ vm_offset_t start, end;
+ if (segs[i].memsz == 0)
+ break;
+
+ end = round_2mpage((vm_offset_t)segs[i].mem + segs[i].memsz);
+ start = trunc_2mpage((vm_offset_t)segs[i].mem);
+ start = max(start, cur_addr + 1);
+ /*
+ * Round to cover the full range of page table pages for each
+ * segment.
+ */
+ source_total += round_2mpage(end - start);
+
+ /*
+ * The bottom 4GB are already identity mapped and counted, so
+ * skip any segments that end below that boundary.
+ */
+ if (end <= cur_addr + 1)
+ continue;
+
+ if (pmap_pml4e_index(end) != pmap_pml4e_index(cur_addr)) {
+ /* Need a new 512GB mapping page */
+ pages++;
+ pages += howmany(end - (start & ~PML4MASK), NBPML4);
+ pages += howmany(end - (start & ~PDPMASK), NBPDP);
+ pages += howmany(end - (start & ~PDRMASK), NBPDR);
+
+ } else if (pmap_pdpe_index(end) != pmap_pdpe_index(cur_addr)) {
+ pages++;
+ pages += howmany(end - (start & ~PDPMASK), NBPDP) - 1;
+ pages += howmany(end - (start & ~PDRMASK), NBPDR);
+ }
+
+ }
+ /*
+ * Be pessimistic when totaling up source pages. We likely can't
+ * use superpages, so we need to map each page individually.
+ */
+ pages += howmany(source_total, NBPDR);
+ pages += howmany(source_total, NBPDP);
+ pages += howmany(source_total, NBPML4);
+
+ /*
+ * Be intentionally sloppy adding in the extra page table pages. It's
+ * better to go over than under.
+ */
+ pages += howmany(pages * PAGE_SIZE, NBPDR);
+ pages += howmany(pages * PAGE_SIZE, NBPDP);
+ pages += howmany(pages * PAGE_SIZE, NBPML4);
+
+ /* Add in the trampoline pages */
+ pages += howmany(kexec_do_reboot_size, PAGE_SIZE);
+
+ return (pages);
+}
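
[Worked example of the counting above, with illustrative numbers: a single 10 MiB segment targeted at exactly 4 GiB already starts and ends 2 MiB-aligned, so source_total grows by 10 MiB. Its PML4 index matches cur_addr's, but its PDPE index crosses from 3 to 4, so the else-if branch charges 1 page, plus howmany(10M, NBPDP) - 1 = 0, plus howmany(10M, NBPDR) = 5 — 6 page-table pages on top of the 13-page baseline. The pessimistic source terms then add howmany(10M, NBPDR) + howmany(10M, NBPDP) + howmany(10M, NBPML4) = 7 more, before the slop and trampoline pages.]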
diff --git a/sys/amd64/amd64/kexec_tramp.S b/sys/amd64/amd64/kexec_tramp.S
new file mode 100644
index 000000000000..6a2de676bc35
--- /dev/null
+++ b/sys/amd64/amd64/kexec_tramp.S
@@ -0,0 +1,91 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asmacros.h>
+#include <machine/specialreg.h>
+#include "assym.inc"
+
+/*
+ * Take a pointer to the image, copy each segment, and jump to the trampoline.
+ *
+ * Assumptions:
+ * - image is in safe memory
+ * - We're already running out of the new "identity" map.
+ * - All registers are free game, so go nuts
+ * - Interrupts are disabled
+ * - All APs are disabled
+ */
+ENTRY(kexec_do_reboot)
+ /*
+ * r9: image pointer
+ * r10: segment pointer
+ * r11: segment counter
+ */
+ leaq kexec_stack(%rip), %rsp
+ /* Get the saved kexec_image. */
+ leaq kexec_saved_image(%rip), %r9
+ leaq KEXEC_SEGMENTS(%r9), %r10
+ movq $KEXEC_SEGMENT_MAX, %r11
+copy_segment:
+ movq KEXEC_SEGMENT_SIZE(%r10), %rcx
+ cmpq $0, %rcx
+ je done
+ shrq $3, %rcx
+ movq KEXEC_SEGMENT_TARGET(%r10), %rdi
+ movq KEXEC_SEGMENT_MAP(%r10), %rsi
+ rep
+ movsq
+ addq $KEXEC_STAGED_SEGMENT_SIZE, %r10
+ decq %r11
+ jg copy_segment
+
+done:
+ pushq KEXEC_ENTRY(%r9)
+ ret
+fail:
+ jmp fail
+END(kexec_do_reboot)
+ENTRY(kexec_do_reboot_trampoline)
+ /* Set new page table, clears most of TLB. */
+ movq %rdi, %cr3
+
+ /* Now flush the rest of the TLB, including global pages. */
+ movq %cr4, %rax
+ andq $~CR4_PGE, %rax
+ movq %rax, %cr4
+ jmp *%rsi
+END(kexec_do_reboot_trampoline)
+CNAME(kexec_saved_image):
+ .globl kexec_saved_image
+ .space KEXEC_IMAGE_SIZE
+ .quad 0
+ /* We don't need more than a quad, so just fill out the page. */
+ .p2align PAGE_SHIFT
+ kexec_stack:
+CNAME(kexec_do_reboot_size):
+ .globl kexec_do_reboot_size
+ .quad . - kexec_do_reboot
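
[The rep movsq loop above is equivalent to this C sketch; sizes are copied in 8-byte units, as the shrq $3 implies, and the field names come from the genassym.c offsets:]

    for (i = 0; i < KEXEC_SEGMENT_MAX; i++) {
            struct kexec_segment_stage *s = &image->segments[i];

            if (s->size == 0)
                    break;
            memcpy((void *)s->target, s->map_buf, s->size);
    }
    /* pushq KEXEC_ENTRY + ret == tail-jump into the new kernel */
    ((void (*)(void))image->entry)();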
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 9ff60439d1ec..2fce1a7e64b6 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1822,6 +1822,39 @@ clear_pcb_flags(struct pcb *pcb, const u_int flags)
: "cc", "memory");
}
+extern const char wrmsr_early_safe_gp_handler[];
+static struct region_descriptor wrmsr_early_safe_orig_efi_idt;
+
+void
+wrmsr_early_safe_start(void)
+{
+ struct region_descriptor efi_idt;
+ struct gate_descriptor *gpf_descr;
+
+ sidt(&wrmsr_early_safe_orig_efi_idt);
+ efi_idt.rd_limit = 32 * sizeof(idt0[0]);
+ efi_idt.rd_base = (uintptr_t)idt0;
+ lidt(&efi_idt);
+
+ gpf_descr = &idt0[IDT_GP];
+ gpf_descr->gd_looffset = (uintptr_t)wrmsr_early_safe_gp_handler;
+ gpf_descr->gd_hioffset = (uintptr_t)wrmsr_early_safe_gp_handler >> 16;
+ gpf_descr->gd_selector = rcs();
+ gpf_descr->gd_type = SDT_SYSTGT;
+ gpf_descr->gd_p = 1;
+}
+
+void
+wrmsr_early_safe_end(void)
+{
+ struct gate_descriptor *gpf_descr;
+
+ lidt(&wrmsr_early_safe_orig_efi_idt);
+
+ gpf_descr = &idt0[IDT_GP];
+ memset(gpf_descr, 0, sizeof(*gpf_descr));
+}
+
#ifdef KDB
/*
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 00e99f9df192..96ed0a2cc3ba 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -140,6 +140,10 @@ cpu_mp_start(void)
setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop),
SDT_SYSIGT, SEL_KPL, 0);
+ /* Install an inter-CPU IPI for CPU offline */
+ setidt(IPI_OFF, pti ? IDTVEC(cpuoff_pti) : IDTVEC(cpuoff),
+ SDT_SYSIGT, SEL_KPL, 0);
+
/* Install an inter-CPU IPI for CPU suspend/resume */
setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend),
SDT_SYSIGT, SEL_KPL, 0);
@@ -176,6 +180,15 @@ cpu_mp_start(void)
#endif
}
+void
+cpu_mp_stop(void)
+{
+ cpuset_t other_cpus = all_cpus;
+
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ offline_cpus(other_cpus);
+}
+
/*
* AP CPU's call this to initialize themselves.
*/
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index 870cd255abb7..27694a95653c 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -1565,6 +1565,22 @@ msr_onfault:
POP_FRAME_POINTER
ret
+ENTRY(wrmsr_early_safe)
+ movl %edi,%ecx
+ movl %esi,%eax
+ sarq $32,%rsi
+ movl %esi,%edx
+ wrmsr
+ xorl %eax,%eax
+wrmsr_early_faulted:
+ ret
+
+ENTRY(wrmsr_early_safe_gp_handler)
+ addq $8,%rsp
+ movl $EFAULT,%eax
+ movq $wrmsr_early_faulted,(%rsp)
+ iretq
+
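[Together with the wrmsr_early_safe_start()/_end() hooks added in machdep.c, the intended calling pattern looks like this sketch — MSR_EXAMPLE and the written value are illustrative, not a real MSR:]

    int error;

    wrmsr_early_safe_start();               /* point #GP at the stub above */
    error = wrmsr_early_safe(MSR_EXAMPLE, 0x1);
    wrmsr_early_safe_end();                 /* restore the previous IDT */
    if (error == EFAULT)
            printf("MSR write faulted; feature not present\n");
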
/*
* void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3);
* Invalidates address space addressed by ucr3, then returns to kcr3.
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index f3469ed5e2bc..84305ca918df 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -435,9 +435,9 @@ trap(struct trapframe *frame)
if ((print_efirt_faults == 1 && cnt == 0) ||
print_efirt_faults == 2) {
- trap_diag(frame, 0);
printf("EFI RT fault %s\n",
traptype_to_msg(type));
+ trap_diag(frame, 0);
}
frame->tf_rip = (long)curpcb->pcb_onfault;
return;
@@ -870,8 +870,8 @@ after_vmfault:
if ((print_efirt_faults == 1 && cnt == 0) ||
print_efirt_faults == 2) {
- trap_diag(frame, eva);
printf("EFI RT page fault\n");
+ trap_diag(frame, eva);
}
}
frame->tf_rip = (long)curpcb->pcb_onfault;
diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC
index 81427b5b18b6..fb8473505128 100644
--- a/sys/amd64/conf/GENERIC
+++ b/sys/amd64/conf/GENERIC
@@ -26,7 +26,7 @@ makeoptions WITH_CTF=1 # Run ctfconvert(1) for DTrace support
options SCHED_ULE # ULE scheduler
options NUMA # Non-Uniform Memory Architecture support
options PREEMPTION # Enable kernel thread preemption
-options BLOAT_KERNEL_WITH_EXTERR
+options EXTERR_STRINGS
options VIMAGE # Subsystem virtualization, e.g. VNET
options INET # InterNETworking
options INET6 # IPv6 communications protocols
@@ -287,9 +287,9 @@ device wlan # 802.11 support
options IEEE80211_DEBUG # enable debug msgs
options IEEE80211_SUPPORT_MESH # enable 802.11s draft support
device wlan_wep # 802.11 WEP support
+device wlan_tkip # 802.11 TKIP support
device wlan_ccmp # 802.11 CCMP support
device wlan_gcmp # 802.11 GCMP support
-device wlan_tkip # 802.11 TKIP support
device wlan_amrr # AMRR transmit rate control algorithm
device ath # Atheros CardBus/PCI NICs
device ath_hal # Atheros CardBus/PCI chip support
@@ -309,7 +309,6 @@ device wpi # Intel 3945ABG wireless NICs.
device crypto # core crypto support
device aesni # AES-NI OpenCrypto module
device loop # Network loopback
-device padlock_rng # VIA Padlock RNG
device rdrand_rng # Intel Bull Mountain RNG
device ether # Ethernet support
device vlan # 802.1Q VLAN support
@@ -386,7 +385,6 @@ options HID_DEBUG # enable debug msgs
device hid # Generic HID support
device hidbus # Generic HID Bus
options IICHID_SAMPLING # Workaround missing GPIO INTR support
-options U2F_MAKE_UHID_ALIAS # install /dev/uhid alias for /dev/u2f/
# EFI devices
device efidev # EFI pseudo-device
diff --git a/sys/amd64/conf/MINIMAL b/sys/amd64/conf/MINIMAL
index 0baf6d6431de..61c713c609a4 100644
--- a/sys/amd64/conf/MINIMAL
+++ b/sys/amd64/conf/MINIMAL
@@ -113,7 +113,6 @@ device uart # Generic UART driver
# Pseudo devices.
device loop # Network loopback
-device padlock_rng # VIA Padlock RNG
device rdrand_rng # Intel Bull Mountain RNG
device ether # Ethernet support
diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h
index d180f5c76afb..9a4c82275a99 100644
--- a/sys/amd64/include/cpufunc.h
+++ b/sys/amd64/include/cpufunc.h
@@ -76,7 +76,7 @@ static __inline void
clflushopt(u_long addr)
{
- __asm __volatile(".byte 0x66;clflush %0" : : "m" (*(char *)addr));
+ __asm __volatile("clflushopt %0" : : "m" (*(char *)addr));
}
static __inline void
@@ -572,6 +572,15 @@ rss(void)
return (sel);
}
+static __inline u_short
+rcs(void)
+{
+ u_short sel;
+
+ __asm __volatile("movw %%cs,%0" : "=rm" (sel));
+ return (sel);
+}
+
static __inline void
load_ds(u_short sel)
{
diff --git a/sys/amd64/include/kexec.h b/sys/amd64/include/kexec.h
new file mode 100644
index 000000000000..70bc2991be3f
--- /dev/null
+++ b/sys/amd64/include/kexec.h
@@ -0,0 +1,41 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _AMD64_KEXEC_H_
+#define _AMD64_KEXEC_H_
+
+struct kexec_segment;
+struct kexec_image;
+int kexec_md_pages(struct kexec_segment *);
+extern void kexec_do_reboot(void);
+extern long kexec_do_reboot_size;
+extern void *kexec_saved_image;
+extern void kexec_do_reboot_trampoline(unsigned long, void (*)(void));
+#define KEXEC_MD_PAGES(x) kexec_md_pages(x)
+
+
+#endif /* _AMD64_KEXEC_H_ */
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
index b6ddc6eaaebe..b6d8c469cdf6 100644
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -99,6 +99,10 @@ void get_fpcontext(struct thread *td, struct __mcontext *mcp,
int set_fpcontext(struct thread *td, struct __mcontext *mcp,
char *xfpustate, size_t xfpustate_len);
+void wrmsr_early_safe_start(void);
+void wrmsr_early_safe_end(void);
+int wrmsr_early_safe(u_int msr, uint64_t data);
+
#endif /* !_MACHINE_MD_VAR_H_ */
#endif /* __i386__ */
diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h
index 5a9c3162e14c..0654bb9de790 100644
--- a/sys/amd64/include/param.h
+++ b/sys/amd64/include/param.h
@@ -150,6 +150,15 @@
(((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high) || \
((va) >= kva_layout.km_low && (va) < kva_layout.km_high))
-#define SC_TABLESIZE 1024 /* Must be power of 2. */
+/*
+ * Must be power of 2.
+ *
+ * Perhaps should be autosized on boot based on found ncpus.
+ */
+#if MAXCPU > 256
+#define SC_TABLESIZE 2048
+#else
+#define SC_TABLESIZE 1024
+#endif
#endif /* !_AMD64_INCLUDE_PARAM_H_ */
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index bff92570ff82..28c372a2e556 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -30,6 +30,7 @@ inthand_t
IDTVEC(ipi_intr_bitmap_handler_pti),
IDTVEC(ipi_swi_pti),
IDTVEC(cpustop_pti),
+ IDTVEC(cpuoff_pti),
IDTVEC(cpususpend_pti),
IDTVEC(rendezvous_pti);
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 0b3daed4f69e..ad67510fecf3 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -46,6 +46,7 @@ enum vm_suspend_how {
VM_SUSPEND_POWEROFF,
VM_SUSPEND_HALT,
VM_SUSPEND_TRIPLEFAULT,
+ VM_SUSPEND_DESTROY,
VM_SUSPEND_LAST
};
@@ -169,55 +170,63 @@ struct vm_eventinfo {
int *iptr; /* reqidle cookie */
};
-typedef int (*vmm_init_func_t)(int ipinum);
-typedef int (*vmm_cleanup_func_t)(void);
-typedef void (*vmm_suspend_func_t)(void);
-typedef void (*vmm_resume_func_t)(void);
-typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
-typedef int (*vmi_run_func_t)(void *vcpui, register_t rip,
- struct pmap *pmap, struct vm_eventinfo *info);
-typedef void (*vmi_cleanup_func_t)(void *vmi);
-typedef void * (*vmi_vcpu_init_func_t)(void *vmi, struct vcpu *vcpu,
- int vcpu_id);
-typedef void (*vmi_vcpu_cleanup_func_t)(void *vcpui);
-typedef int (*vmi_get_register_t)(void *vcpui, int num, uint64_t *retval);
-typedef int (*vmi_set_register_t)(void *vcpui, int num, uint64_t val);
-typedef int (*vmi_get_desc_t)(void *vcpui, int num, struct seg_desc *desc);
-typedef int (*vmi_set_desc_t)(void *vcpui, int num, struct seg_desc *desc);
-typedef int (*vmi_get_cap_t)(void *vcpui, int num, int *retval);
-typedef int (*vmi_set_cap_t)(void *vcpui, int num, int val);
-typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
-typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
-typedef struct vlapic * (*vmi_vlapic_init)(void *vcpui);
-typedef void (*vmi_vlapic_cleanup)(struct vlapic *vlapic);
-typedef int (*vmi_snapshot_vcpu_t)(void *vcpui, struct vm_snapshot_meta *meta);
-typedef int (*vmi_restore_tsc_t)(void *vcpui, uint64_t now);
+#define DECLARE_VMMOPS_FUNC(ret_type, opname, args) \
+ typedef ret_type (*vmmops_##opname##_t) args; \
+ ret_type vmmops_##opname args
+
+DECLARE_VMMOPS_FUNC(int, modinit, (int ipinum));
+DECLARE_VMMOPS_FUNC(int, modcleanup, (void));
+DECLARE_VMMOPS_FUNC(void, modresume, (void));
+DECLARE_VMMOPS_FUNC(void, modsuspend, (void));
+DECLARE_VMMOPS_FUNC(void *, init, (struct vm *vm, struct pmap *pmap));
+DECLARE_VMMOPS_FUNC(int, run, (void *vcpui, register_t pc,
+ struct pmap *pmap, struct vm_eventinfo *info));
+DECLARE_VMMOPS_FUNC(void, cleanup, (void *vmi));
+DECLARE_VMMOPS_FUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu,
+ int vcpu_id));
+DECLARE_VMMOPS_FUNC(void, vcpu_cleanup, (void *vcpui));
+DECLARE_VMMOPS_FUNC(int, getreg, (void *vcpui, int num, uint64_t *retval));
+DECLARE_VMMOPS_FUNC(int, setreg, (void *vcpui, int num, uint64_t val));
+DECLARE_VMMOPS_FUNC(int, getdesc, (void *vcpui, int num,
+ struct seg_desc *desc));
+DECLARE_VMMOPS_FUNC(int, setdesc, (void *vcpui, int num,
+ struct seg_desc *desc));
+DECLARE_VMMOPS_FUNC(int, getcap, (void *vcpui, int num, int *retval));
+DECLARE_VMMOPS_FUNC(int, setcap, (void *vcpui, int num, int val));
+DECLARE_VMMOPS_FUNC(struct vmspace *, vmspace_alloc,
+ (vm_offset_t min, vm_offset_t max));
+DECLARE_VMMOPS_FUNC(void, vmspace_free, (struct vmspace *vmspace));
+DECLARE_VMMOPS_FUNC(struct vlapic *, vlapic_init, (void *vcpui));
+DECLARE_VMMOPS_FUNC(void, vlapic_cleanup, (struct vlapic *vlapic));
+DECLARE_VMMOPS_FUNC(int, vcpu_snapshot, (void *vcpui,
+ struct vm_snapshot_meta *meta));
+DECLARE_VMMOPS_FUNC(int, restore_tsc, (void *vcpui, uint64_t now));
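
[Each DECLARE_VMMOPS_FUNC line emits a function-pointer typedef plus a prototype for the ifunc-resolved wrapper; for example, the getreg declaration above expands to:]

    typedef int (*vmmops_getreg_t)(void *vcpui, int num, uint64_t *retval);
    int vmmops_getreg(void *vcpui, int num, uint64_t *retval);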
struct vmm_ops {
- vmm_init_func_t modinit; /* module wide initialization */
- vmm_cleanup_func_t modcleanup;
- vmm_resume_func_t modsuspend;
- vmm_resume_func_t modresume;
-
- vmi_init_func_t init; /* vm-specific initialization */
- vmi_run_func_t run;
- vmi_cleanup_func_t cleanup;
- vmi_vcpu_init_func_t vcpu_init;
- vmi_vcpu_cleanup_func_t vcpu_cleanup;
- vmi_get_register_t getreg;
- vmi_set_register_t setreg;
- vmi_get_desc_t getdesc;
- vmi_set_desc_t setdesc;
- vmi_get_cap_t getcap;
- vmi_set_cap_t setcap;
- vmi_vmspace_alloc vmspace_alloc;
- vmi_vmspace_free vmspace_free;
- vmi_vlapic_init vlapic_init;
- vmi_vlapic_cleanup vlapic_cleanup;
+ vmmops_modinit_t modinit; /* module wide initialization */
+ vmmops_modcleanup_t modcleanup;
+ vmmops_modresume_t modsuspend;
+ vmmops_modresume_t modresume;
+
+ vmmops_init_t init; /* vm-specific initialization */
+ vmmops_run_t run;
+ vmmops_cleanup_t cleanup;
+ vmmops_vcpu_init_t vcpu_init;
+ vmmops_vcpu_cleanup_t vcpu_cleanup;
+ vmmops_getreg_t getreg;
+ vmmops_setreg_t setreg;
+ vmmops_getdesc_t getdesc;
+ vmmops_setdesc_t setdesc;
+ vmmops_getcap_t getcap;
+ vmmops_setcap_t setcap;
+ vmmops_vmspace_alloc_t vmspace_alloc;
+ vmmops_vmspace_free_t vmspace_free;
+ vmmops_vlapic_init_t vlapic_init;
+ vmmops_vlapic_cleanup_t vlapic_cleanup;
/* checkpoint operations */
- vmi_snapshot_vcpu_t vcpu_snapshot;
- vmi_restore_tsc_t restore_tsc;
+ vmmops_vcpu_snapshot_t vcpu_snapshot;
+ vmmops_restore_tsc_t restore_tsc;
};
extern const struct vmm_ops vmm_ops_intel;
@@ -228,7 +237,7 @@ extern u_int vm_maxcpu; /* maximum virtual cpus */
int vm_create(const char *name, struct vm **retvm);
struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid);
void vm_disable_vcpu_creation(struct vm *vm);
-void vm_slock_vcpus(struct vm *vm);
+void vm_lock_vcpus(struct vm *vm);
void vm_unlock_vcpus(struct vm *vm);
void vm_destroy(struct vm *vm);
int vm_reinit(struct vm *vm);
@@ -353,6 +362,7 @@ enum vcpu_state {
};
int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle);
+int vcpu_set_state_all(struct vm *vm, enum vcpu_state state);
enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu);
static int __inline
@@ -374,7 +384,6 @@ vcpu_should_yield(struct vcpu *vcpu)
void *vcpu_stats(struct vcpu *vcpu);
void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr);
-struct vmspace *vm_vmspace(struct vm *vm);
struct vm_mem *vm_mem(struct vm *vm);
struct vatpic *vm_atpic(struct vm *vm);
struct vatpit *vm_atpit(struct vm *vm);
diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c
index c8579c5da4ad..890cf01c46a0 100644
--- a/sys/amd64/linux/linux_sysvec.c
+++ b/sys/amd64/linux/linux_sysvec.c
@@ -857,7 +857,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset)
}
}
-static Elf_Brandnote linux64_brandnote = {
+static const Elf_Brandnote linux64_brandnote = {
.hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
.hdr.n_descsz = 16,
.hdr.n_type = 1,
@@ -866,7 +866,7 @@ static Elf_Brandnote linux64_brandnote = {
.trans_osrel = linux_trans_osrel
};
-static Elf64_Brandinfo linux_glibc2brand = {
+static const Elf64_Brandinfo linux_glibc2brand = {
.brand = ELFOSABI_LINUX,
.machine = EM_X86_64,
.compat_3_brand = "Linux",
@@ -877,7 +877,7 @@ static Elf64_Brandinfo linux_glibc2brand = {
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
-static Elf64_Brandinfo linux_glibc2brandshort = {
+static const Elf64_Brandinfo linux_glibc2brandshort = {
.brand = ELFOSABI_LINUX,
.machine = EM_X86_64,
.compat_3_brand = "Linux",
@@ -888,7 +888,7 @@ static Elf64_Brandinfo linux_glibc2brandshort = {
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
-static Elf64_Brandinfo linux_muslbrand = {
+static const Elf64_Brandinfo linux_muslbrand = {
.brand = ELFOSABI_LINUX,
.machine = EM_X86_64,
.compat_3_brand = "Linux",
@@ -900,7 +900,7 @@ static Elf64_Brandinfo linux_muslbrand = {
LINUX_BI_FUTEX_REQUEUE
};
-static Elf64_Brandinfo *linux_brandlist[] = {
+static const Elf64_Brandinfo *linux_brandlist[] = {
&linux_glibc2brand,
&linux_glibc2brandshort,
&linux_muslbrand,
@@ -910,7 +910,7 @@ static Elf64_Brandinfo *linux_brandlist[] = {
static int
linux64_elf_modevent(module_t mod, int type, void *data)
{
- Elf64_Brandinfo **brandinfo;
+ const Elf64_Brandinfo **brandinfo;
int error;
struct linux_ioctl_handler **lihp;
diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c
index 8fac626f9053..735ebb151017 100644
--- a/sys/amd64/linux32/linux32_sysvec.c
+++ b/sys/amd64/linux32/linux32_sysvec.c
@@ -954,7 +954,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset)
}
}
-static Elf_Brandnote linux32_brandnote = {
+static const Elf_Brandnote linux32_brandnote = {
.hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
.hdr.n_descsz = 16, /* XXX at least 16 */
.hdr.n_type = 1,
@@ -963,7 +963,7 @@ static Elf_Brandnote linux32_brandnote = {
.trans_osrel = linux_trans_osrel
};
-static Elf32_Brandinfo linux_brand = {
+static const Elf32_Brandinfo linux_brand = {
.brand = ELFOSABI_LINUX,
.machine = EM_386,
.compat_3_brand = "Linux",
@@ -974,7 +974,7 @@ static Elf32_Brandinfo linux_brand = {
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
-static Elf32_Brandinfo linux_glibc2brand = {
+static const Elf32_Brandinfo linux_glibc2brand = {
.brand = ELFOSABI_LINUX,
.machine = EM_386,
.compat_3_brand = "Linux",
@@ -985,7 +985,7 @@ static Elf32_Brandinfo linux_glibc2brand = {
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
-static Elf32_Brandinfo linux_muslbrand = {
+static const Elf32_Brandinfo linux_muslbrand = {
.brand = ELFOSABI_LINUX,
.machine = EM_386,
.compat_3_brand = "Linux",
@@ -997,7 +997,7 @@ static Elf32_Brandinfo linux_muslbrand = {
LINUX_BI_FUTEX_REQUEUE
};
-static Elf32_Brandinfo *linux_brandlist[] = {
+static const Elf32_Brandinfo *linux_brandlist[] = {
&linux_brand,
&linux_glibc2brand,
&linux_muslbrand,
@@ -1007,7 +1007,7 @@ static Elf32_Brandinfo *linux_brandlist[] = {
static int
linux_elf_modevent(module_t mod, int type, void *data)
{
- Elf32_Brandinfo **brandinfo;
+ const Elf32_Brandinfo **brandinfo;
int error;
struct linux_ioctl_handler **lihp;
diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c
index c7b75767680a..6b2296de049c 100644
--- a/sys/amd64/pt/pt.c
+++ b/sys/amd64/pt/pt.c
@@ -42,15 +42,15 @@
*/
#include <sys/systm.h>
+#include <sys/bus.h>
#include <sys/hwt.h>
+#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
-#include <sys/sdt.h>
#include <sys/smp.h>
-#include <sys/taskqueue.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
@@ -94,12 +94,7 @@
MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");
-SDT_PROVIDER_DEFINE(pt);
-SDT_PROBE_DEFINE(pt, , , topa__intr);
-
-TASKQUEUE_FAST_DEFINE_THREAD(pt);
-
-static void pt_send_buffer_record(void *arg, int pending __unused);
+static void pt_send_buffer_record(void *arg);
static int pt_topa_intr(struct trapframe *tf);
/*
@@ -122,29 +117,24 @@ struct pt_buffer {
size_t size;
struct mtx lock; /* Lock for fields below. */
vm_offset_t offset;
- uint64_t wrap_count;
- int curpage;
};
struct pt_ctx {
int id;
struct pt_buffer buf; /* ToPA buffer metadata */
- struct task task; /* ToPA buffer notification task */
struct hwt_context *hwt_ctx;
uint8_t *save_area; /* PT XSAVE area */
};
/* PT tracing contexts used for CPU mode. */
static struct pt_ctx *pt_pcpu_ctx;
-enum pt_cpu_state {
- PT_DISABLED = 0,
- PT_STOPPED,
- PT_ACTIVE
-};
+enum pt_cpu_state { PT_INACTIVE = 0, PT_ACTIVE };
static struct pt_cpu {
struct pt_ctx *ctx; /* active PT tracing context */
enum pt_cpu_state state; /* used as part of trace stop protocol */
+ void *swi_cookie; /* Software interrupt handler context */
+ int in_pcint_handler;
} *pt_pcpu;
/*
@@ -199,31 +189,28 @@ static __inline void
pt_update_buffer(struct pt_buffer *buf)
{
uint64_t reg;
- int curpage;
+ uint64_t offset;
/* Update buffer offset. */
reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
- curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
- mtx_lock_spin(&buf->lock);
- /* Check if the output wrapped. */
- if (buf->curpage > curpage)
- buf->wrap_count++;
- buf->curpage = curpage;
- buf->offset = reg >> 32;
- mtx_unlock_spin(&buf->lock);
-
- dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
- buf->wrap_count, buf->curpage, buf->offset);
+ offset = ((reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT) * PAGE_SIZE;
+ offset += (reg >> 32);
+
+ atomic_store_rel_64(&buf->offset, offset);
}
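
[The two helpers above now share a single 64-bit cursor instead of the old (curpage, wrap_count) pair. A sketch of the round trip, using the same macros the code uses — ToPA page index in the MSR's low word, byte offset in its high word:]

    /* producer (PMI path): */
    reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
    cursor = ((reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT) * PAGE_SIZE +
        (reg >> 32);
    atomic_store_rel_64(&buf->offset, cursor);

    /* consumer (record/read path): */
    cursor = atomic_load_acq_64(&buf->offset);
    curpage = cursor / PAGE_SIZE;
    curpage_offset = cursor & PAGE_MASK;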
static __inline void
pt_fill_buffer_record(int id, struct pt_buffer *buf,
struct hwt_record_entry *rec)
{
+ vm_offset_t offset;
+
+ offset = atomic_load_acq_64(&buf->offset);
+
rec->record_type = HWT_RECORD_BUFFER;
rec->buf_id = id;
- rec->curpage = buf->curpage;
- rec->offset = buf->offset + (buf->wrap_count * buf->size);
+ rec->curpage = offset / PAGE_SIZE;
+ rec->offset = offset & PAGE_MASK;
}
/*
@@ -273,9 +260,9 @@ pt_cpu_start(void *dummy)
MPASS(cpu->ctx != NULL);
dprintf("%s: curcpu %d\n", __func__, curcpu);
+ pt_cpu_set_state(curcpu, PT_ACTIVE);
load_cr4(rcr4() | CR4_XSAVE);
wrmsr(MSR_IA32_RTIT_STATUS, 0);
- pt_cpu_set_state(curcpu, PT_ACTIVE);
pt_cpu_toggle_local(cpu->ctx->save_area, true);
}
@@ -291,16 +278,16 @@ pt_cpu_stop(void *dummy)
struct pt_cpu *cpu;
struct pt_ctx *ctx;
- /* Shutdown may occur before PT gets properly configured. */
- if (pt_cpu_get_state(curcpu) == PT_DISABLED)
- return;
-
cpu = &pt_pcpu[curcpu];
ctx = cpu->ctx;
- MPASS(ctx != NULL);
- dprintf("%s: curcpu %d\n", __func__, curcpu);
- pt_cpu_set_state(curcpu, PT_STOPPED);
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+ /* Shutdown may occur before PT gets properly configured. */
+ if (ctx == NULL) {
+ dprintf("%s: missing context on cpu %d; bailing\n", __func__,
+ curcpu);
+ return;
+ }
pt_cpu_toggle_local(cpu->ctx->save_area, false);
pt_update_buffer(&ctx->buf);
}
@@ -406,13 +393,11 @@ pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
return (ENOMEM);
dprintf("%s: preparing ToPA buffer\n", __func__);
if (pt_topa_prepare(pt_ctx, vm) != 0) {
- dprintf("%s: failed to prepare ToPA buffer\n", __func__);
free(pt_ctx->save_area, M_PT);
return (ENOMEM);
}
pt_ctx->id = ctx_id;
- TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);
return (0);
}
@@ -426,7 +411,6 @@ pt_deinit_ctx(struct pt_ctx *pt_ctx)
if (pt_ctx->save_area != NULL)
free(pt_ctx->save_area, M_PT);
memset(pt_ctx, 0, sizeof(*pt_ctx));
- pt_ctx->buf.topa_hw = NULL;
}
/*
@@ -519,7 +503,6 @@ pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
XSTATE_XCOMP_BV_COMPACT;
pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
pt_pcpu[cpu_id].ctx = pt_ctx;
- pt_cpu_set_state(cpu_id, PT_STOPPED);
return (0);
}
@@ -549,12 +532,19 @@ pt_backend_disable(struct hwt_context *ctx, int cpu_id)
if (ctx->mode == HWT_MODE_CPU)
return;
-
KASSERT(curcpu == cpu_id,
("%s: attempting to disable PT on another cpu", __func__));
+
+ cpu = &pt_pcpu[cpu_id];
+
+ dprintf("%s: waiting for cpu %d to exit interrupt handler\n", __func__,
+ cpu_id);
+ pt_cpu_set_state(cpu_id, PT_INACTIVE);
+ while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0))
+ ;
+
pt_cpu_stop(NULL);
CPU_CLR(cpu_id, &ctx->cpu_map);
- cpu = &pt_pcpu[cpu_id];
cpu->ctx = NULL;
}
@@ -564,14 +554,14 @@ pt_backend_disable(struct hwt_context *ctx, int cpu_id)
static int
pt_backend_enable_smp(struct hwt_context *ctx)
{
-
dprintf("%s\n", __func__);
+
+ KASSERT(ctx->mode == HWT_MODE_CPU,
+ ("%s: should only be used for CPU mode", __func__));
if (ctx->mode == HWT_MODE_CPU &&
atomic_swap_32(&cpu_mode_ctr, 1) != 0)
return (-1);
- KASSERT(ctx->mode == HWT_MODE_CPU,
- ("%s: should only be used for CPU mode", __func__));
smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);
return (0);
@@ -583,6 +573,7 @@ pt_backend_enable_smp(struct hwt_context *ctx)
static int
pt_backend_disable_smp(struct hwt_context *ctx)
{
+ struct pt_cpu *cpu;
dprintf("%s\n", __func__);
if (ctx->mode == HWT_MODE_CPU &&
@@ -593,6 +584,14 @@ pt_backend_disable_smp(struct hwt_context *ctx)
dprintf("%s: empty cpu map\n", __func__);
return (-1);
}
+ CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
+ cpu = &pt_pcpu[cpu_id];
+ dprintf("%s: waiting for cpu %d to exit interrupt handler\n",
+ __func__, cpu_id);
+ pt_cpu_set_state(cpu_id, PT_INACTIVE);
+ while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0))
+ ;
+ }
smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);
return (0);
@@ -611,13 +610,13 @@ pt_backend_init(struct hwt_context *ctx)
int error;
dprintf("%s\n", __func__);
- if (ctx->mode == HWT_MODE_CPU) {
- TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
- error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
- hwt_cpu->vm, hwt_cpu->cpu_id);
- if (error)
- return (error);
- }
+ if (ctx->mode != HWT_MODE_CPU)
+ return (0);
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], hwt_cpu->vm,
+ hwt_cpu->cpu_id);
+ if (error)
+ return (error);
}
return (0);
@@ -647,20 +646,16 @@ pt_backend_deinit(struct hwt_context *ctx)
pt_deinit_ctx(pt_ctx);
}
} else {
- CPU_FOREACH(cpu_id) {
- if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
+ CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
+ if (pt_pcpu[cpu_id].ctx == NULL)
continue;
- if (pt_pcpu[cpu_id].ctx != NULL) {
- KASSERT(pt_pcpu[cpu_id].ctx ==
- &pt_pcpu_ctx[cpu_id],
- ("%s: CPU mode tracing with non-cpu mode PT"
- "context active",
- __func__));
- pt_pcpu[cpu_id].ctx = NULL;
- }
- pt_ctx = &pt_pcpu_ctx[cpu_id];
- pt_deinit_ctx(pt_ctx);
- memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
+ KASSERT(pt_pcpu[cpu_id].ctx == &pt_pcpu_ctx[cpu_id],
+ ("%s: CPU mode tracing with non-cpu mode PT "
+ "context active",
+ __func__));
+ pt_deinit_ctx(pt_pcpu[cpu_id].ctx);
+ pt_pcpu[cpu_id].ctx = NULL;
+ atomic_set_int(&pt_pcpu[cpu_id].in_pcint_handler, 0);
}
}
@@ -675,15 +670,15 @@ pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
uint64_t *data)
{
struct pt_buffer *buf;
+ uint64_t offset;
if (vm->ctx->mode == HWT_MODE_THREAD)
buf = &((struct pt_ctx *)vm->thr->private)->buf;
else
buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
- mtx_lock_spin(&buf->lock);
- *curpage = buf->curpage;
- *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
- mtx_unlock_spin(&buf->lock);
+ offset = atomic_load_acq_64(&buf->offset);
+ *curpage = offset / PAGE_SIZE;
+ *curpage_offset = offset & PAGE_MASK;
return (0);
}
@@ -762,15 +757,13 @@ static struct hwt_backend backend = {
* Used as a taskqueue routine from the ToPA interrupt handler.
*/
static void
-pt_send_buffer_record(void *arg, int pending __unused)
+pt_send_buffer_record(void *arg)
{
+ struct pt_cpu *cpu = (struct pt_cpu *)arg;
struct hwt_record_entry record;
- struct pt_ctx *ctx = (struct pt_ctx *)arg;
- /* Prepare buffer record. */
- mtx_lock_spin(&ctx->buf.lock);
+ struct pt_ctx *ctx = cpu->ctx;
pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
- mtx_unlock_spin(&ctx->buf.lock);
hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
}
static void
@@ -795,36 +788,40 @@ static int
pt_topa_intr(struct trapframe *tf)
{
struct pt_buffer *buf;
+ struct pt_cpu *cpu;
struct pt_ctx *ctx;
uint64_t reg;
- SDT_PROBE0(pt, , , topa__intr);
-
- if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
- return (0);
- }
+ cpu = &pt_pcpu[curcpu];
reg = rdmsr(MSR_IA_GLOBAL_STATUS);
if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
- /* ACK spurious or leftover interrupt. */
pt_topa_status_clear();
+ return (0);
+ }
+
+ if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
return (1);
}
+ atomic_set_int(&cpu->in_pcint_handler, 1);
- ctx = pt_pcpu[curcpu].ctx;
+ ctx = cpu->ctx;
+ KASSERT(ctx != NULL,
+ ("%s: cpu %d: ToPA PMI interrupt without an active context",
+ __func__, curcpu));
buf = &ctx->buf;
KASSERT(buf->topa_hw != NULL,
- ("%s: ToPA PMI interrupt with invalid buffer", __func__));
-
+ ("%s: cpu %d: ToPA PMI interrupt with invalid buffer", __func__,
+ curcpu));
pt_cpu_toggle_local(ctx->save_area, false);
pt_update_buffer(buf);
pt_topa_status_clear();
- taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
- TASKQUEUE_FAIL_IF_PENDING);
if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
+ swi_sched(cpu->swi_cookie, SWI_FROMNMI);
pt_cpu_toggle_local(ctx->save_area, true);
lapic_reenable_pcint();
}
+ atomic_set_int(&cpu->in_pcint_handler, 0);
return (1);
}
@@ -839,7 +836,7 @@ static int
pt_init(void)
{
u_int cp[4];
- int error;
+ int error, i;
dprintf("pt: Enumerating part 1\n");
cpuid_count(CPUID_PT_LEAF, 0, cp);
@@ -869,20 +866,38 @@ pt_init(void)
pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
M_ZERO | M_WAITOK);
+ for (i = 0; i < mp_ncpus; i++) {
+ error = swi_add(&clk_intr_event, "pt", pt_send_buffer_record,
+ &pt_pcpu[i], SWI_CLOCK, INTR_MPSAFE,
+ &pt_pcpu[i].swi_cookie);
+ if (error != 0) {
+ dprintf(
+ "%s: failed to add swi handler for cpu %d: %d\n",
+ __func__, i, error);
+ goto err;
+ }
+ }
+
nmi_register_handler(pt_topa_intr);
- if (!lapic_enable_pcint()) {
- nmi_remove_handler(pt_topa_intr);
- hwt_backend_unregister(&backend);
- free(pt_pcpu, M_PT);
- free(pt_pcpu_ctx, M_PT);
- pt_pcpu = NULL;
- pt_pcpu_ctx = NULL;
+ if (lapic_enable_pcint()) {
+ initialized = true;
+ return (0);
+ } else
printf("pt: failed to setup interrupt line\n");
- return (error);
+err:
+ nmi_remove_handler(pt_topa_intr);
+ hwt_backend_unregister(&backend);
+
+ for (i = 0; i < mp_ncpus; i++) {
+ if (pt_pcpu[i].swi_cookie != 0)
+ swi_remove(pt_pcpu[i].swi_cookie);
}
- initialized = true;
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+ pt_pcpu = NULL;
+ pt_pcpu_ctx = NULL;
- return (0);
+ return (error);
}
/*
@@ -941,14 +956,24 @@ pt_supported(void)
static void
pt_deinit(void)
{
+ int i;
+ struct pt_cpu *cpu;
+
if (!initialized)
return;
nmi_remove_handler(pt_topa_intr);
lapic_disable_pcint();
hwt_backend_unregister(&backend);
+
+ for (i = 0; i < mp_ncpus; i++) {
+ cpu = &pt_pcpu[i];
+ swi_remove(cpu->swi_cookie);
+ }
+
free(pt_pcpu, M_PT);
free(pt_pcpu_ctx, M_PT);
pt_pcpu = NULL;
+ pt_pcpu_ctx = NULL;
initialized = false;
}
diff --git a/sys/amd64/sgx/sgx_linux.c b/sys/amd64/sgx/sgx_linux.c
index 6ecef9207a38..d389edc1b2b0 100644
--- a/sys/amd64/sgx/sgx_linux.c
+++ b/sys/amd64/sgx/sgx_linux.c
@@ -92,16 +92,7 @@ out:
return (error);
}
-static struct linux_ioctl_handler sgx_linux_handler = {
- sgx_linux_ioctl,
- SGX_LINUX_IOCTL_MIN,
- SGX_LINUX_IOCTL_MAX,
-};
-
-SYSINIT(sgx_linux_register, SI_SUB_KLD, SI_ORDER_MIDDLE,
- linux_ioctl_register_handler, &sgx_linux_handler);
-SYSUNINIT(sgx_linux_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE,
- linux_ioctl_unregister_handler, &sgx_linux_handler);
+LINUX_IOCTL_SET(sgx, SGX_LINUX_IOCTL_MIN, SGX_LINUX_IOCTL_MAX);
static int
sgx_linux_modevent(module_t mod, int type, void *data)
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
index 130130b64541..877e377f892d 100644
--- a/sys/amd64/vmm/intel/vmx_support.S
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -171,13 +171,11 @@ do_launch:
*/
movq %rsp, %rdi /* point %rdi back to 'vmxctx' */
movl $VMX_VMLAUNCH_ERROR, %eax
- jmp decode_inst_error
-
+ /* FALLTHROUGH */
decode_inst_error:
movl $VM_FAIL_VALID, %r11d
- jz inst_error
- movl $VM_FAIL_INVALID, %r11d
-inst_error:
+ movl $VM_FAIL_INVALID, %esi
+ cmovnzl %esi, %r11d
movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi)
/*
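
[The cmovnzl rewrite is branch-free but computes the same value as the old jz/jmp pair; in C terms — the VMX ISA sets ZF on a VMfailValid exit and leaves it clear on VMfailInvalid, and the field behind VMXCTX_INST_FAIL_STATUS is sketched here:]

    uint32_t status;

    status = VM_FAIL_VALID;         /* movl $VM_FAIL_VALID, %r11d */
    if (!zero_flag)                 /* cmovnzl %esi, %r11d */
            status = VM_FAIL_INVALID;
    vmxctx->inst_fail_status = status;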
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index c42da02d0bf6..473887240b9b 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -163,7 +163,6 @@ struct vm {
void *rendezvous_arg; /* (x) [r] rendezvous func/arg */
vm_rendezvous_func_t rendezvous_func;
struct mtx rendezvous_mtx; /* (o) rendezvous lock */
- struct vmspace *vmspace; /* (o) guest's address space */
struct vm_mem mem; /* (i) [m+v] guest memory */
char name[VM_MAX_NAMELEN+1]; /* (o) virtual machine name */
struct vcpu **vcpu; /* (o) guest vcpus */
@@ -201,7 +200,7 @@ vmmops_panic(void)
}
#define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \
- DEFINE_IFUNC(static, ret_type, vmmops_##opname, args) \
+ DEFINE_IFUNC(, ret_type, vmmops_##opname, args) \
{ \
if (vmm_is_intel()) \
return (vmm_ops_intel.opname); \
@@ -499,7 +498,7 @@ MODULE_VERSION(vmm, 1);
static void
vm_init(struct vm *vm, bool create)
{
- vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
+ vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm)));
vm->iommu = NULL;
vm->vioapic = vioapic_init(vm);
vm->vhpet = vhpet_init(vm);
@@ -563,9 +562,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid)
}
void
-vm_slock_vcpus(struct vm *vm)
+vm_lock_vcpus(struct vm *vm)
{
- sx_slock(&vm->vcpus_init_lock);
+ sx_xlock(&vm->vcpus_init_lock);
}
void
@@ -584,7 +583,7 @@ int
vm_create(const char *name, struct vm **retvm)
{
struct vm *vm;
- struct vmspace *vmspace;
+ int error;
/*
* If vmm.ko could not be successfully initialized then don't attempt
@@ -597,14 +596,13 @@ vm_create(const char *name, struct vm **retvm)
VM_MAX_NAMELEN + 1)
return (EINVAL);
- vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48);
- if (vmspace == NULL)
- return (ENOMEM);
-
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
+ error = vm_mem_init(&vm->mem, 0, VM_MAXUSER_ADDRESS_LA48);
+ if (error != 0) {
+ free(vm, M_VM);
+ return (error);
+ }
strcpy(vm->name, name);
- vm->vmspace = vmspace;
- vm_mem_init(&vm->mem);
mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
sx_init(&vm->vcpus_init_lock, "vm vcpus");
vm->vcpu = malloc(sizeof(*vm->vcpu) * vm_maxcpu, M_VM, M_WAITOK |
@@ -685,9 +683,6 @@ vm_cleanup(struct vm *vm, bool destroy)
if (destroy) {
vm_mem_destroy(vm);
- vmmops_vmspace_free(vm->vmspace);
- vm->vmspace = NULL;
-
free(vm->vcpu, M_VM);
sx_destroy(&vm->vcpus_init_lock);
mtx_destroy(&vm->rendezvous_mtx);
@@ -731,7 +726,7 @@ vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
vm_object_t obj;
- if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
+ if ((obj = vmm_mmio_alloc(vm_vmspace(vm), gpa, len, hpa)) == NULL)
return (ENOMEM);
else
return (0);
@@ -741,19 +736,21 @@ int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
- vmm_mmio_free(vm->vmspace, gpa, len);
+ vmm_mmio_free(vm_vmspace(vm), gpa, len);
return (0);
}
static int
vm_iommu_map(struct vm *vm)
{
+ pmap_t pmap;
vm_paddr_t gpa, hpa;
struct vm_mem_map *mm;
int error, i;
sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED);
+ pmap = vmspace_pmap(vm_vmspace(vm));
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
if (!vm_memseg_sysmem(vm, i))
continue;
@@ -767,7 +764,7 @@ vm_iommu_map(struct vm *vm)
mm->flags |= VM_MEMMAP_F_IOMMU;
for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) {
- hpa = pmap_extract(vmspace_pmap(vm->vmspace), gpa);
+ hpa = pmap_extract(pmap, gpa);
/*
* All mappings in the vmm vmspace must be
@@ -816,7 +813,7 @@ vm_iommu_unmap(struct vm *vm)
for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) {
KASSERT(vm_page_wired(PHYS_TO_VM_PAGE(pmap_extract(
- vmspace_pmap(vm->vmspace), gpa))),
+ vmspace_pmap(vm_vmspace(vm)), gpa))),
("vm_iommu_unmap: vm %p gpa %jx not wired",
vm, (uintmax_t)gpa));
iommu_remove_mapping(vm->iommu, gpa, PAGE_SIZE);
@@ -873,7 +870,7 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
{
-
+ /* Negative values represent VM control structure fields. */
if (reg >= VM_REG_LAST)
return (EINVAL);
@@ -885,6 +882,7 @@ vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
int error;
+ /* Negative values represent VM control structure fields. */
if (reg >= VM_REG_LAST)
return (EINVAL);
@@ -993,6 +991,54 @@ save_guest_fpustate(struct vcpu *vcpu)
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
+/*
+ * Invoke the rendezvous function on the specified vcpu if applicable. Return
+ * true if the rendezvous is finished, false otherwise.
+ */
+static bool
+vm_rendezvous(struct vcpu *vcpu)
+{
+ struct vm *vm = vcpu->vm;
+ int vcpuid;
+
+ mtx_assert(&vcpu->vm->rendezvous_mtx, MA_OWNED);
+ KASSERT(vcpu->vm->rendezvous_func != NULL,
+ ("vm_rendezvous: no rendezvous pending"));
+
+ /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
+ CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus,
+ &vm->active_cpus);
+
+ vcpuid = vcpu->vcpuid;
+ if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
+ !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
+ VMM_CTR0(vcpu, "Calling rendezvous func");
+ (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg);
+ CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
+ }
+ if (CPU_CMP(&vm->rendezvous_req_cpus,
+ &vm->rendezvous_done_cpus) == 0) {
+ VMM_CTR0(vcpu, "Rendezvous completed");
+ CPU_ZERO(&vm->rendezvous_req_cpus);
+ vm->rendezvous_func = NULL;
+ wakeup(&vm->rendezvous_func);
+ return (true);
+ }
+ return (false);
+}
+
+static void
+vcpu_wait_idle(struct vcpu *vcpu)
+{
+ KASSERT(vcpu->state != VCPU_IDLE, ("vcpu already idle"));
+
+ vcpu->reqidle = 1;
+ vcpu_notify_event_locked(vcpu, false);
+ VMM_CTR1(vcpu, "vcpu state change from %s to "
+ "idle requested", vcpu_state2str(vcpu->state));
+ msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
+}
+
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
bool from_idle)
@@ -1007,13 +1053,8 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
* ioctl() operating on a vcpu at any point.
*/
if (from_idle) {
- while (vcpu->state != VCPU_IDLE) {
- vcpu->reqidle = 1;
- vcpu_notify_event_locked(vcpu, false);
- VMM_CTR1(vcpu, "vcpu state change from %s to "
- "idle requested", vcpu_state2str(vcpu->state));
- msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
- }
+ while (vcpu->state != VCPU_IDLE)
+ vcpu_wait_idle(vcpu);
} else {
KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
"vcpu idle state"));
@@ -1065,6 +1106,95 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
return (0);
}
+/*
+ * Try to lock all of the vCPUs in the VM while taking care to avoid deadlocks
+ * with vm_smp_rendezvous().
+ *
+ * The complexity here suggests that the rendezvous mechanism needs a rethink.
+ */
+int
+vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate)
+{
+ cpuset_t locked;
+ struct vcpu *vcpu;
+ int error, i;
+ uint16_t maxcpus;
+
+ KASSERT(newstate != VCPU_IDLE,
+ ("vcpu_set_state_all: invalid target state %d", newstate));
+
+ error = 0;
+ CPU_ZERO(&locked);
+ maxcpus = vm->maxcpus;
+
+ mtx_lock(&vm->rendezvous_mtx);
+restart:
+ if (vm->rendezvous_func != NULL) {
+ /*
+ * If we have a pending rendezvous, then the initiator may be
+ * blocked waiting for other vCPUs to execute the callback. The
+ * current thread may be a vCPU thread so we must not block
+ * waiting for the initiator, otherwise we get a deadlock.
+ * Thus, execute the callback on behalf of any idle vCPUs.
+ */
+ for (i = 0; i < maxcpus; i++) {
+ vcpu = vm_vcpu(vm, i);
+ if (vcpu == NULL)
+ continue;
+ vcpu_lock(vcpu);
+ if (vcpu->state == VCPU_IDLE) {
+ (void)vcpu_set_state_locked(vcpu, VCPU_FROZEN,
+ true);
+ CPU_SET(i, &locked);
+ }
+ if (CPU_ISSET(i, &locked)) {
+ /*
+ * We can safely execute the callback on this
+ * vCPU's behalf.
+ */
+ vcpu_unlock(vcpu);
+ (void)vm_rendezvous(vcpu);
+ vcpu_lock(vcpu);
+ }
+ vcpu_unlock(vcpu);
+ }
+ }
+
+ /*
+ * Now wait for remaining vCPUs to become idle. This may include the
+ * initiator of a rendezvous that is currently blocked on the rendezvous
+ * mutex.
+ */
+ CPU_FOREACH_ISCLR(i, &locked) {
+ if (i >= maxcpus)
+ break;
+ vcpu = vm_vcpu(vm, i);
+ if (vcpu == NULL)
+ continue;
+ vcpu_lock(vcpu);
+ while (vcpu->state != VCPU_IDLE) {
+ mtx_unlock(&vm->rendezvous_mtx);
+ vcpu_wait_idle(vcpu);
+ vcpu_unlock(vcpu);
+ mtx_lock(&vm->rendezvous_mtx);
+ if (vm->rendezvous_func != NULL)
+ goto restart;
+ vcpu_lock(vcpu);
+ }
+ error = vcpu_set_state_locked(vcpu, newstate, true);
+ vcpu_unlock(vcpu);
+		if (error != 0) {
+			/* Roll back state changes on the vCPUs locked so far. */
+			CPU_FOREACH_ISSET(i, &locked)
+				(void)vcpu_set_state(vm_vcpu(vm, i),
+				    VCPU_IDLE, false);
+			break;
+		}
+ CPU_SET(i, &locked);
+ }
+ mtx_unlock(&vm->rendezvous_mtx);
+ return (error);
+}
+
static void
vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
{
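
The restart dance in vcpu_set_state_all() above follows a classic pattern:
never sleep waiting for another thread while holding a lock that thread needs,
and after every reacquisition re-check whether new work appeared. A
stripped-down sketch of that shape using POSIX threads; every name here is a
placeholder, not vmm code:

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static bool work_pending;	/* stands in for vm->rendezvous_func */

	static void
	wait_for_peer_idle(void)
	{
		/* Hypothetical: block (e.g. on a condvar) without 'lock'. */
	}

	static void
	drain_all(void)
	{
		pthread_mutex_lock(&lock);
	restart:
		if (work_pending) {
			/* Help finish the pending work, as the vCPU loop does. */
			work_pending = false;
		}
		/*
		 * Sleeping with 'lock' held could deadlock: the peers being
		 * waited on may need it to make progress.  Drop it, wait,
		 * retake it, and restart if new work arrived in the meantime.
		 */
		pthread_mutex_unlock(&lock);
		wait_for_peer_idle();
		pthread_mutex_lock(&lock);
		if (work_pending)
			goto restart;
		pthread_mutex_unlock(&lock);
	}
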
@@ -1086,36 +1216,23 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
static int
vm_handle_rendezvous(struct vcpu *vcpu)
{
- struct vm *vm = vcpu->vm;
+ struct vm *vm;
struct thread *td;
- int error, vcpuid;
- error = 0;
- vcpuid = vcpu->vcpuid;
td = curthread;
+ vm = vcpu->vm;
+
mtx_lock(&vm->rendezvous_mtx);
while (vm->rendezvous_func != NULL) {
- /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
- CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus);
-
- if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
- !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
- VMM_CTR0(vcpu, "Calling rendezvous func");
- (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg);
- CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
- }
- if (CPU_CMP(&vm->rendezvous_req_cpus,
- &vm->rendezvous_done_cpus) == 0) {
- VMM_CTR0(vcpu, "Rendezvous completed");
- CPU_ZERO(&vm->rendezvous_req_cpus);
- vm->rendezvous_func = NULL;
- wakeup(&vm->rendezvous_func);
+ if (vm_rendezvous(vcpu))
break;
- }
+
VMM_CTR0(vcpu, "Wait for rendezvous completion");
mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
"vmrndv", hz);
if (td_ast_pending(td, TDA_SUSPEND)) {
+ int error;
+
mtx_unlock(&vm->rendezvous_mtx);
error = thread_check_susp(td, true);
if (error != 0)
@@ -1249,7 +1366,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu)
("vm_handle_paging: invalid fault_type %d", ftype));
if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
- rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
+ rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm_vmspace(vm)),
vme->u.paging.gpa, ftype);
if (rv == 0) {
VMM_CTR2(vcpu, "%s bit emulation for gpa %#lx",
@@ -1259,7 +1376,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu)
}
}
- map = &vm->vmspace->vm_map;
+ map = &vm_vmspace(vm)->vm_map;
rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL);
VMM_CTR3(vcpu, "vm_handle_paging rv = %d, gpa = %#lx, "
@@ -1560,7 +1677,7 @@ vm_run(struct vcpu *vcpu)
if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
return (EINVAL);
- pmap = vmspace_pmap(vm->vmspace);
+ pmap = vmspace_pmap(vm_vmspace(vm));
vme = &vcpu->exitinfo;
evinfo.rptr = &vm->rendezvous_req_cpus;
evinfo.sptr = &vm->suspend;
@@ -2302,12 +2419,6 @@ vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr)
vcpu_unlock(vcpu);
}
-struct vmspace *
-vm_vmspace(struct vm *vm)
-{
- return (vm->vmspace);
-}
-
struct vm_mem *
vm_mem(struct vm *vm)
{
@@ -2519,7 +2630,7 @@ vm_get_rescnt(struct vcpu *vcpu, struct vmm_stat_type *stat)
if (vcpu->vcpuid == 0) {
vmm_stat_set(vcpu, VMM_MEM_RESIDENT, PAGE_SIZE *
- vmspace_resident_count(vcpu->vm->vmspace));
+ vmspace_resident_count(vm_vmspace(vcpu->vm)));
}
}
@@ -2529,7 +2640,7 @@ vm_get_wiredcnt(struct vcpu *vcpu, struct vmm_stat_type *stat)
if (vcpu->vcpuid == 0) {
vmm_stat_set(vcpu, VMM_MEM_WIRED, PAGE_SIZE *
- pmap_wired_count(vmspace_pmap(vcpu->vm->vmspace)));
+ pmap_wired_count(vmspace_pmap(vm_vmspace(vcpu->vm))));
}
}
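
Taken together, the vmm.c hunks above replace every direct vm->vmspace
dereference with the vm_vmspace() accessor and delete the accessor's
machine-dependent definition, presumably so it can live with the
machine-independent code (note the <dev/vmm/vmm_mem.h> include added in the
next file). The transformation in miniature:

	/* before: vmm.c reaches into the structure directly */
	pmap = vmspace_pmap(vm->vmspace);

	/*
	 * after: same behavior through the accessor, so the call site no
	 * longer depends on struct vm's layout
	 */
	pmap = vmspace_pmap(vm_vmspace(vm));
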
diff --git a/sys/amd64/vmm/vmm_dev_machdep.c b/sys/amd64/vmm/vmm_dev_machdep.c
index d8d2b460404c..b84be809ea24 100644
--- a/sys/amd64/vmm/vmm_dev_machdep.c
+++ b/sys/amd64/vmm/vmm_dev_machdep.c
@@ -48,6 +48,7 @@
#include <x86/apicreg.h>
#include <dev/vmm/vmm_dev.h>
+#include <dev/vmm/vmm_mem.h>
#include <dev/vmm/vmm_stat.h>
#include "vmm_lapic.h"
@@ -123,12 +124,16 @@ const struct vmmdev_ioctl vmmdev_machdep_ioctls[] = {
VMMDEV_IOCTL(VM_SET_KERNEMU_DEV, VMMDEV_IOCTL_LOCK_ONE_VCPU),
VMMDEV_IOCTL(VM_BIND_PPTDEV,
- VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
+ VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS |
+ VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
VMMDEV_IOCTL(VM_UNBIND_PPTDEV,
- VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
+ VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS |
+ VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
- VMMDEV_IOCTL(VM_MAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS),
- VMMDEV_IOCTL(VM_UNMAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS),
+ VMMDEV_IOCTL(VM_MAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS |
+ VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
+ VMMDEV_IOCTL(VM_UNMAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS |
+ VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
#ifdef BHYVE_SNAPSHOT
#ifdef COMPAT_FREEBSD13
VMMDEV_IOCTL(VM_SNAPSHOT_REQ_13, VMMDEV_IOCTL_LOCK_ALL_VCPUS),
@@ -146,9 +151,9 @@ const struct vmmdev_ioctl vmmdev_machdep_ioctls[] = {
VMMDEV_IOCTL(VM_LAPIC_LOCAL_IRQ, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
- VMMDEV_IOCTL(VM_PPTDEV_MSI, 0),
- VMMDEV_IOCTL(VM_PPTDEV_MSIX, 0),
- VMMDEV_IOCTL(VM_PPTDEV_DISABLE_MSIX, 0),
+ VMMDEV_IOCTL(VM_PPTDEV_MSI, VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
+ VMMDEV_IOCTL(VM_PPTDEV_MSIX, VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
+ VMMDEV_IOCTL(VM_PPTDEV_DISABLE_MSIX, VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
VMMDEV_IOCTL(VM_LAPIC_MSI, 0),
VMMDEV_IOCTL(VM_IOAPIC_ASSERT_IRQ, 0),
VMMDEV_IOCTL(VM_IOAPIC_DEASSERT_IRQ, 0),
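
VMMDEV_IOCTL_PRIV_CHECK_DRIVER evidently makes the common dispatcher require
the PRIV_DRIVER privilege before these passthrough ioctls reach their
handlers. The dispatcher itself is not part of this diff; a plausible sketch
of how such a flag would be honored, assuming struct vmmdev_ioctl carries a
flags field as the VMMDEV_IOCTL() entries suggest:

	#include <sys/priv.h>

	/* Hypothetical shape of the check in the generic ioctl dispatcher. */
	static int
	vmmdev_priv_check(const struct vmmdev_ioctl *ioctl, struct thread *td)
	{
		if ((ioctl->flags & VMMDEV_IOCTL_PRIV_CHECK_DRIVER) != 0)
			return (priv_check(td, PRIV_DRIVER));
		return (0);
	}
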
@@ -171,40 +176,13 @@ int
vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data,
int fflag, struct thread *td)
{
- struct vm_seg_desc *vmsegdesc;
- struct vm_run *vmrun;
-#ifdef COMPAT_FREEBSD13
- struct vm_run_13 *vmrun_13;
-#endif
- struct vm_exception *vmexc;
- struct vm_lapic_irq *vmirq;
- struct vm_lapic_msi *vmmsi;
- struct vm_ioapic_irq *ioapic_irq;
- struct vm_isa_irq *isa_irq;
- struct vm_isa_irq_trigger *isa_irq_trigger;
- struct vm_pptdev *pptdev;
- struct vm_pptdev_mmio *pptmmio;
- struct vm_pptdev_msi *pptmsi;
- struct vm_pptdev_msix *pptmsix;
- struct vm_x2apic *x2apic;
- struct vm_gpa_pte *gpapte;
- struct vm_gla2gpa *gg;
- struct vm_intinfo *vmii;
- struct vm_rtc_time *rtctime;
- struct vm_rtc_data *rtcdata;
- struct vm_readwrite_kernemu_device *kernemu;
-#ifdef BHYVE_SNAPSHOT
- struct vm_snapshot_meta *snapshot_meta;
-#ifdef COMPAT_FREEBSD13
- struct vm_snapshot_meta_13 *snapshot_13;
-#endif
-#endif
int error;
error = 0;
switch (cmd) {
case VM_RUN: {
struct vm_exit *vme;
+ struct vm_run *vmrun;
vmrun = (struct vm_run *)data;
vme = vm_exitinfo(vcpu);
@@ -242,6 +220,7 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data,
case VM_RUN_13: {
struct vm_exit *vme;
struct vm_exit_13 *vme_13;
+ struct vm_run_13 *vmrun_13;
vmrun_13 = (struct vm_run_13 *)data;
vme_13 = &vmrun_13->vm_exit;
@@ -280,85 +259,123 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data,
break;
}
#endif
- case VM_PPTDEV_MSI:
+ case VM_PPTDEV_MSI: {
+ struct vm_pptdev_msi *pptmsi;
+
pptmsi = (struct vm_pptdev_msi *)data;
- error = ppt_setup_msi(vm,
- pptmsi->bus, pptmsi->slot, pptmsi->func,
- pptmsi->addr, pptmsi->msg,
- pptmsi->numvec);
+ error = ppt_setup_msi(vm, pptmsi->bus, pptmsi->slot,
+ pptmsi->func, pptmsi->addr, pptmsi->msg, pptmsi->numvec);
break;
- case VM_PPTDEV_MSIX:
+ }
+ case VM_PPTDEV_MSIX: {
+ struct vm_pptdev_msix *pptmsix;
+
pptmsix = (struct vm_pptdev_msix *)data;
- error = ppt_setup_msix(vm,
- pptmsix->bus, pptmsix->slot,
- pptmsix->func, pptmsix->idx,
- pptmsix->addr, pptmsix->msg,
- pptmsix->vector_control);
+ error = ppt_setup_msix(vm, pptmsix->bus, pptmsix->slot,
+ pptmsix->func, pptmsix->idx, pptmsix->addr, pptmsix->msg,
+ pptmsix->vector_control);
break;
- case VM_PPTDEV_DISABLE_MSIX:
+ }
+ case VM_PPTDEV_DISABLE_MSIX: {
+ struct vm_pptdev *pptdev;
+
pptdev = (struct vm_pptdev *)data;
error = ppt_disable_msix(vm, pptdev->bus, pptdev->slot,
- pptdev->func);
+ pptdev->func);
break;
- case VM_MAP_PPTDEV_MMIO:
+ }
+ case VM_MAP_PPTDEV_MMIO: {
+ struct vm_pptdev_mmio *pptmmio;
+
pptmmio = (struct vm_pptdev_mmio *)data;
error = ppt_map_mmio(vm, pptmmio->bus, pptmmio->slot,
- pptmmio->func, pptmmio->gpa, pptmmio->len,
- pptmmio->hpa);
+ pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa);
break;
- case VM_UNMAP_PPTDEV_MMIO:
+ }
+ case VM_UNMAP_PPTDEV_MMIO: {
+ struct vm_pptdev_mmio *pptmmio;
+
pptmmio = (struct vm_pptdev_mmio *)data;
error = ppt_unmap_mmio(vm, pptmmio->bus, pptmmio->slot,
- pptmmio->func, pptmmio->gpa, pptmmio->len);
+ pptmmio->func, pptmmio->gpa, pptmmio->len);
break;
- case VM_BIND_PPTDEV:
+ }
+ case VM_BIND_PPTDEV: {
+ struct vm_pptdev *pptdev;
+
pptdev = (struct vm_pptdev *)data;
error = vm_assign_pptdev(vm, pptdev->bus, pptdev->slot,
- pptdev->func);
+ pptdev->func);
break;
- case VM_UNBIND_PPTDEV:
+ }
+ case VM_UNBIND_PPTDEV: {
+ struct vm_pptdev *pptdev;
+
pptdev = (struct vm_pptdev *)data;
error = vm_unassign_pptdev(vm, pptdev->bus, pptdev->slot,
- pptdev->func);
+ pptdev->func);
break;
- case VM_INJECT_EXCEPTION:
+ }
+ case VM_INJECT_EXCEPTION: {
+ struct vm_exception *vmexc;
+
vmexc = (struct vm_exception *)data;
error = vm_inject_exception(vcpu,
vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
vmexc->restart_instruction);
break;
+ }
case VM_INJECT_NMI:
error = vm_inject_nmi(vcpu);
break;
- case VM_LAPIC_IRQ:
+ case VM_LAPIC_IRQ: {
+ struct vm_lapic_irq *vmirq;
+
vmirq = (struct vm_lapic_irq *)data;
error = lapic_intr_edge(vcpu, vmirq->vector);
break;
- case VM_LAPIC_LOCAL_IRQ:
+ }
+ case VM_LAPIC_LOCAL_IRQ: {
+ struct vm_lapic_irq *vmirq;
+
vmirq = (struct vm_lapic_irq *)data;
error = lapic_set_local_intr(vm, vcpu, vmirq->vector);
break;
- case VM_LAPIC_MSI:
+ }
+ case VM_LAPIC_MSI: {
+ struct vm_lapic_msi *vmmsi;
+
vmmsi = (struct vm_lapic_msi *)data;
error = lapic_intr_msi(vm, vmmsi->addr, vmmsi->msg);
break;
- case VM_IOAPIC_ASSERT_IRQ:
+ }
+ case VM_IOAPIC_ASSERT_IRQ: {
+ struct vm_ioapic_irq *ioapic_irq;
+
ioapic_irq = (struct vm_ioapic_irq *)data;
error = vioapic_assert_irq(vm, ioapic_irq->irq);
break;
- case VM_IOAPIC_DEASSERT_IRQ:
+ }
+ case VM_IOAPIC_DEASSERT_IRQ: {
+ struct vm_ioapic_irq *ioapic_irq;
+
ioapic_irq = (struct vm_ioapic_irq *)data;
error = vioapic_deassert_irq(vm, ioapic_irq->irq);
break;
- case VM_IOAPIC_PULSE_IRQ:
+ }
+ case VM_IOAPIC_PULSE_IRQ: {
+ struct vm_ioapic_irq *ioapic_irq;
+
ioapic_irq = (struct vm_ioapic_irq *)data;
error = vioapic_pulse_irq(vm, ioapic_irq->irq);
break;
+ }
case VM_IOAPIC_PINCOUNT:
*(int *)data = vioapic_pincount(vm);
break;
case VM_SET_KERNEMU_DEV:
case VM_GET_KERNEMU_DEV: {
+ struct vm_readwrite_kernemu_device *kernemu;
mem_region_write_t mwrite;
mem_region_read_t mread;
int size;
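
The recurring change in this hunk, wrapping each case body in braces so its
scratch pointer can be declared next to its only use, is forced by C's
grammar: a label must be followed by a statement, and a declaration is not
one, so a compound statement has to open the scope. In miniature, with
struct foo and use() as placeholders:

	switch (cmd) {
	case 1: {			/* braces introduce a scope... */
		struct foo *p;		/* ...so this declaration is legal */

		p = (struct foo *)data;
		use(p);
		break;
	}
	default:
		break;
	}
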
@@ -395,60 +412,86 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data,
error = mread(vcpu, kernemu->gpa,
&kernemu->value, size, &arg);
break;
- }
- case VM_ISA_ASSERT_IRQ:
+ }
+ case VM_ISA_ASSERT_IRQ: {
+ struct vm_isa_irq *isa_irq;
+
isa_irq = (struct vm_isa_irq *)data;
error = vatpic_assert_irq(vm, isa_irq->atpic_irq);
if (error == 0 && isa_irq->ioapic_irq != -1)
error = vioapic_assert_irq(vm, isa_irq->ioapic_irq);
break;
- case VM_ISA_DEASSERT_IRQ:
+ }
+ case VM_ISA_DEASSERT_IRQ: {
+ struct vm_isa_irq *isa_irq;
+
isa_irq = (struct vm_isa_irq *)data;
error = vatpic_deassert_irq(vm, isa_irq->atpic_irq);
if (error == 0 && isa_irq->ioapic_irq != -1)
error = vioapic_deassert_irq(vm, isa_irq->ioapic_irq);
break;
- case VM_ISA_PULSE_IRQ:
+ }
+ case VM_ISA_PULSE_IRQ: {
+ struct vm_isa_irq *isa_irq;
+
isa_irq = (struct vm_isa_irq *)data;
error = vatpic_pulse_irq(vm, isa_irq->atpic_irq);
if (error == 0 && isa_irq->ioapic_irq != -1)
error = vioapic_pulse_irq(vm, isa_irq->ioapic_irq);
break;
- case VM_ISA_SET_IRQ_TRIGGER:
+ }
+ case VM_ISA_SET_IRQ_TRIGGER: {
+ struct vm_isa_irq_trigger *isa_irq_trigger;
+
isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
error = vatpic_set_irq_trigger(vm,
isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
break;
- case VM_SET_SEGMENT_DESCRIPTOR:
+ }
+ case VM_SET_SEGMENT_DESCRIPTOR: {
+ struct vm_seg_desc *vmsegdesc;
+
vmsegdesc = (struct vm_seg_desc *)data;
- error = vm_set_seg_desc(vcpu,
- vmsegdesc->regnum,
- &vmsegdesc->desc);
+ error = vm_set_seg_desc(vcpu, vmsegdesc->regnum,
+ &vmsegdesc->desc);
break;
- case VM_GET_SEGMENT_DESCRIPTOR:
+ }
+ case VM_GET_SEGMENT_DESCRIPTOR: {
+ struct vm_seg_desc *vmsegdesc;
+
vmsegdesc = (struct vm_seg_desc *)data;
- error = vm_get_seg_desc(vcpu,
- vmsegdesc->regnum,
- &vmsegdesc->desc);
+ error = vm_get_seg_desc(vcpu, vmsegdesc->regnum,
+ &vmsegdesc->desc);
break;
- case VM_SET_X2APIC_STATE:
+ }
+ case VM_SET_X2APIC_STATE: {
+ struct vm_x2apic *x2apic;
+
x2apic = (struct vm_x2apic *)data;
error = vm_set_x2apic_state(vcpu, x2apic->state);
break;
- case VM_GET_X2APIC_STATE:
+ }
+ case VM_GET_X2APIC_STATE: {
+ struct vm_x2apic *x2apic;
+
x2apic = (struct vm_x2apic *)data;
error = vm_get_x2apic_state(vcpu, &x2apic->state);
break;
- case VM_GET_GPA_PMAP:
+ }
+ case VM_GET_GPA_PMAP: {
+ struct vm_gpa_pte *gpapte;
+
gpapte = (struct vm_gpa_pte *)data;
- pmap_get_mapping(vmspace_pmap(vm_vmspace(vm)),
- gpapte->gpa, gpapte->pte, &gpapte->ptenum);
- error = 0;
+ pmap_get_mapping(vmspace_pmap(vm_vmspace(vm)), gpapte->gpa,
+ gpapte->pte, &gpapte->ptenum);
break;
+ }
case VM_GET_HPET_CAPABILITIES:
error = vhpet_getcap((struct vm_hpet_cap *)data);
break;
case VM_GLA2GPA: {
+ struct vm_gla2gpa *gg;
+
CTASSERT(PROT_READ == VM_PROT_READ);
CTASSERT(PROT_WRITE == VM_PROT_WRITE);
CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
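
The CTASSERTs above encode the ABI assumption that lets gg->prot pass through
unchanged: userland's PROT_* bits must equal the kernel's VM_PROT_* bits.
CTASSERT() is FreeBSD's compile-time assertion; in standard C the first check
would read:

	_Static_assert(PROT_READ == VM_PROT_READ,
	    "userland and kernel protection bits must stay in sync");
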
@@ -459,50 +502,76 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data,
("%s: vm_gla2gpa unknown error %d", __func__, error));
break;
}
- case VM_GLA2GPA_NOFAULT:
+ case VM_GLA2GPA_NOFAULT: {
+ struct vm_gla2gpa *gg;
+
gg = (struct vm_gla2gpa *)data;
error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla,
gg->prot, &gg->gpa, &gg->fault);
KASSERT(error == 0 || error == EFAULT,
("%s: vm_gla2gpa unknown error %d", __func__, error));
break;
- case VM_SET_INTINFO:
+ }
+ case VM_SET_INTINFO: {
+ struct vm_intinfo *vmii;
+
vmii = (struct vm_intinfo *)data;
error = vm_exit_intinfo(vcpu, vmii->info1);
break;
- case VM_GET_INTINFO:
+ }
+ case VM_GET_INTINFO: {
+ struct vm_intinfo *vmii;
+
vmii = (struct vm_intinfo *)data;
error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2);
break;
- case VM_RTC_WRITE:
+ }
+ case VM_RTC_WRITE: {
+ struct vm_rtc_data *rtcdata;
+
rtcdata = (struct vm_rtc_data *)data;
error = vrtc_nvram_write(vm, rtcdata->offset,
rtcdata->value);
break;
- case VM_RTC_READ:
+ }
+ case VM_RTC_READ: {
+ struct vm_rtc_data *rtcdata;
+
rtcdata = (struct vm_rtc_data *)data;
error = vrtc_nvram_read(vm, rtcdata->offset,
&rtcdata->value);
break;
- case VM_RTC_SETTIME:
+ }
+ case VM_RTC_SETTIME: {
+ struct vm_rtc_time *rtctime;
+
rtctime = (struct vm_rtc_time *)data;
error = vrtc_set_time(vm, rtctime->secs);
break;
- case VM_RTC_GETTIME:
- error = 0;
+ }
+ case VM_RTC_GETTIME: {
+ struct vm_rtc_time *rtctime;
+
rtctime = (struct vm_rtc_time *)data;
rtctime->secs = vrtc_get_time(vm);
break;
+ }
case VM_RESTART_INSTRUCTION:
error = vm_restart_instruction(vcpu);
break;
#ifdef BHYVE_SNAPSHOT
- case VM_SNAPSHOT_REQ:
+ case VM_SNAPSHOT_REQ: {
+ struct vm_snapshot_meta *snapshot_meta;
+
snapshot_meta = (struct vm_snapshot_meta *)data;
error = vm_snapshot_req(vm, snapshot_meta);
break;
+ }
#ifdef COMPAT_FREEBSD13
- case VM_SNAPSHOT_REQ_13:
+ case VM_SNAPSHOT_REQ_13: {
+ struct vm_snapshot_meta *snapshot_meta;
+ struct vm_snapshot_meta_13 *snapshot_13;
+
/*
* The old structure just has an additional pointer at
* the start that is ignored.
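
The cast just below works only because the two structures share a layout after
the leading pointer. A hypothetical sketch of what the comment describes; the
types and member names here are assumptions, except dev_data, which appears in
the code below:

	struct vm_snapshot_meta_13 {
		void *ignored;		/* legacy pointer, skipped over */
		void *dev_data;		/* first shared member; its address
					   is cast to the new struct below */
		/* remaining members identical to struct vm_snapshot_meta */
	};
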
@@ -512,6 +581,7 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data,
(struct vm_snapshot_meta *)&snapshot_13->dev_data;
error = vm_snapshot_req(vm, snapshot_meta);
break;
+ }
#endif
case VM_RESTORE_TIME:
error = vm_restore_time(vm);