| field | value | date |
|---|---|---|
| author | Andrew Turner <andrew@FreeBSD.org> | 2024-01-09 15:22:27 +0000 |
| committer | Andrew Turner <andrew@FreeBSD.org> | 2024-02-21 18:55:32 +0000 |
| commit | 47e073941f4e7ca6e9bde3fa65abbfcfed6bfa2b (patch) | |
| tree | 12952882180198ce097077e4b86efa2440229635 | |
| parent | 0f4071978e3dae6637d4988212661164115f6be8 (diff) | |
Import the kernel parts of bhyve/arm64
To support virtual machines on arm64, add the vmm code. This is based on
earlier work by Mihai Carabas and Alexandru Elisei at University
Politehnica of Bucharest, with further work by Mark Johnston and myself.
All AArch64 CPUs should work; however, only the GICv3 interrupt
controller is supported. There is initial groundwork to allow GICv2
support to be added in the future. Only pure Armv8.0 virtualisation is
supported; the Virtualization Host Extensions are not currently used.
With a separate userspace patch and U-Boot port, FreeBSD guests are able
to boot to multi-user mode, and the hypervisor can be tested with the
kvm-unit-tests suite. Linux partially boots, but hangs before entering
userspace. Other operating systems are untested.
Sponsored by: Arm Ltd
Sponsored by: Innovate UK
Sponsored by: The FreeBSD Foundation
Sponsored by: University Politehnica of Bucharest
Differential Revision: https://reviews.freebsd.org/D37428
34 files changed, 11066 insertions, 13 deletions
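
The bulk of the import is kernel headers plus the vgic/vtimer device model; the userspace-facing surface is the ioctl interface declared in sys/arm64/include/vmm_dev.h. As a rough sketch of how the separate userspace patch is expected to drive it, the example below sets a vCPU's program counter and runs it until the first exit, using only structures and ioctl numbers defined in this diff. The /dev/vmm/<name> device path follows the existing x86 bhyve convention and is an assumption here, as is including the headers directly rather than going through a libvmmapi-style wrapper.

```c
/*
 * Hedged sketch: exercise the new arm64 vmm ioctls from userspace.
 * The "/dev/vmm/test" path and the direct header includes are assumptions
 * based on how bhyve works on x86; the structures and ioctl numbers
 * (VM_SET_REGISTER, VM_RUN, VM_REG_GUEST_PC, VM_GUEST_BASE_IPA, ...) are
 * taken from the headers added by this commit.
 */
#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/ioctl.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	struct vm_register reg;
	struct vm_run run;
	struct vm_exit vmexit;
	cpuset_t cpuset;
	int fd;

	/* Assumes the VM already exists and its memory segments are set up. */
	fd = open("/dev/vmm/test", O_RDWR);
	if (fd < 0)
		err(1, "open");

	/* Start vCPU 0 at the guest kernel load address (VM_GUEST_BASE_IPA). */
	reg.cpuid = 0;
	reg.regnum = VM_REG_GUEST_PC;
	reg.regval = VM_GUEST_BASE_IPA;
	if (ioctl(fd, VM_SET_REGISTER, &reg) != 0)
		err(1, "VM_SET_REGISTER");

	/* Run until the first VM exit and report why the guest stopped. */
	run.cpuid = 0;
	run.cpuset = &cpuset;
	run.cpusetsize = sizeof(cpuset);
	run.vm_exit = &vmexit;
	if (ioctl(fd, VM_RUN, &run) != 0)
		err(1, "VM_RUN");

	printf("vm exit code %d at pc %#lx\n", vmexit.exitcode, vmexit.pc);
	return (0);
}
```

In practice the userspace side would more likely go through a libvmmapi-style wrapper, and it would need to create the VM and map guest memory (VM_ALLOC_MEMSEG/VM_MMAP_MEMSEG) before running it; the raw ioctls are shown only to make the shape of the new interface concrete.
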
diff --git a/sys/arm64/include/vmm.h b/sys/arm64/include/vmm.h new file mode 100644 index 000000000000..8e2c9c868635 --- /dev/null +++ b/sys/arm64/include/vmm.h @@ -0,0 +1,362 @@ +/* + * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#include <sys/param.h> +#include <sys/cpuset.h> +#include <vm/vm.h> +#include <vm/pmap.h> + +#include "pte.h" +#include "pmap.h" + +struct vcpu; + +enum vm_suspend_how { + VM_SUSPEND_NONE, + VM_SUSPEND_RESET, + VM_SUSPEND_POWEROFF, + VM_SUSPEND_HALT, + VM_SUSPEND_LAST +}; + +/* + * Identifiers for architecturally defined registers. 
+ */ +enum vm_reg_name { + VM_REG_GUEST_X0 = 0, + VM_REG_GUEST_X1, + VM_REG_GUEST_X2, + VM_REG_GUEST_X3, + VM_REG_GUEST_X4, + VM_REG_GUEST_X5, + VM_REG_GUEST_X6, + VM_REG_GUEST_X7, + VM_REG_GUEST_X8, + VM_REG_GUEST_X9, + VM_REG_GUEST_X10, + VM_REG_GUEST_X11, + VM_REG_GUEST_X12, + VM_REG_GUEST_X13, + VM_REG_GUEST_X14, + VM_REG_GUEST_X15, + VM_REG_GUEST_X16, + VM_REG_GUEST_X17, + VM_REG_GUEST_X18, + VM_REG_GUEST_X19, + VM_REG_GUEST_X20, + VM_REG_GUEST_X21, + VM_REG_GUEST_X22, + VM_REG_GUEST_X23, + VM_REG_GUEST_X24, + VM_REG_GUEST_X25, + VM_REG_GUEST_X26, + VM_REG_GUEST_X27, + VM_REG_GUEST_X28, + VM_REG_GUEST_X29, + VM_REG_GUEST_LR, + VM_REG_GUEST_SP, + VM_REG_GUEST_PC, + VM_REG_GUEST_CPSR, + + VM_REG_GUEST_SCTLR_EL1, + VM_REG_GUEST_TTBR0_EL1, + VM_REG_GUEST_TTBR1_EL1, + VM_REG_GUEST_TCR_EL1, + VM_REG_GUEST_TCR2_EL1, + VM_REG_LAST +}; + +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 +#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + +#define VM_MAX_SUFFIXLEN 15 + +#define VM_GUEST_BASE_IPA 0x80000000UL /* Guest kernel start ipa */ + +#ifdef _KERNEL + +#define VM_MAX_NAMELEN 32 + +struct vm; +struct vm_exception; +struct vm_exit; +struct vm_run; +struct vm_object; +struct vm_guest_paging; +struct vm_vgic_descr; +struct pmap; + +struct vm_eventinfo { + void *rptr; /* rendezvous cookie */ + int *sptr; /* suspend cookie */ + int *iptr; /* reqidle cookie */ +}; + +int vm_create(const char *name, struct vm **retvm); +struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); +void vm_slock_vcpus(struct vm *vm); +void vm_unlock_vcpus(struct vm *vm); +void vm_destroy(struct vm *vm); +int vm_reinit(struct vm *vm); +const char *vm_name(struct vm *vm); + +/* + * APIs that modify the guest memory map require all vcpus to be frozen. + */ +void vm_slock_memsegs(struct vm *vm); +void vm_xlock_memsegs(struct vm *vm); +void vm_unlock_memsegs(struct vm *vm); +int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot, int flags); +int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len); +int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); +void vm_free_memseg(struct vm *vm, int ident); + +/* + * APIs that inspect the guest memory map require only a *single* vcpu to + * be frozen. This acts like a read lock on the guest memory map since any + * modification requires *all* vcpus to be frozen. 
+ */ +int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + struct vm_object **objptr); +vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); +void *vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, + int prot, void **cookie); +void *vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, + int prot, void **cookie); +void vm_gpa_release(void *cookie); +bool vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa); + +int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); + +uint16_t vm_get_maxcpus(struct vm *vm); +void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus); +int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus); +int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); +int vm_run(struct vcpu *vcpu); +int vm_suspend(struct vm *vm, enum vm_suspend_how how); +void* vm_get_cookie(struct vm *vm); +int vcpu_vcpuid(struct vcpu *vcpu); +void *vcpu_get_cookie(struct vcpu *vcpu); +struct vm *vcpu_vm(struct vcpu *vcpu); +struct vcpu *vm_vcpu(struct vm *vm, int cpu); +int vm_get_capability(struct vcpu *vcpu, int type, int *val); +int vm_set_capability(struct vcpu *vcpu, int type, int val); +int vm_activate_cpu(struct vcpu *vcpu); +int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu); +int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu); +int vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far); +int vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr); +int vm_assert_irq(struct vm *vm, uint32_t irq); +int vm_deassert_irq(struct vm *vm, uint32_t irq); +int vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, + int func); +struct vm_exit *vm_exitinfo(struct vcpu *vcpu); +void vm_exit_suspended(struct vcpu *vcpu, uint64_t pc); +void vm_exit_debug(struct vcpu *vcpu, uint64_t pc); +void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t pc); +void vm_exit_astpending(struct vcpu *vcpu, uint64_t pc); + +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_debug_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); + +static __inline bool +virt_enabled(void) +{ + + return (has_hyp()); +} + +static __inline int +vcpu_rendezvous_pending(struct vm_eventinfo *info) +{ + + return (*((uintptr_t *)(info->rptr)) != 0); +} + +static __inline int +vcpu_suspended(struct vm_eventinfo *info) +{ + + return (*info->sptr); +} + +int vcpu_debugged(struct vcpu *vcpu); + +enum vcpu_state { + VCPU_IDLE, + VCPU_FROZEN, + VCPU_RUNNING, + VCPU_SLEEPING, +}; + +int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); +enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); + +static int __inline +vcpu_is_running(struct vcpu *vcpu, int *hostcpu) +{ + return (vcpu_get_state(vcpu, hostcpu) == VCPU_RUNNING); +} + +#ifdef _SYS_PROC_H_ +static int __inline +vcpu_should_yield(struct vcpu *vcpu) +{ + struct thread *td; + + td = curthread; + return (td->td_ast != 0 || td->td_owepreempt != 0); +} +#endif + +void *vcpu_stats(struct vcpu *vcpu); +void vcpu_notify_event(struct vcpu *vcpu); + +enum vm_reg_name vm_segment_name(int seg_encoding); + +struct vm_copyinfo { + uint64_t gpa; + size_t len; + void *hva; + void *cookie; +}; + +#endif /* _KERNEL */ + 
+#define VM_DIR_READ 0 +#define VM_DIR_WRITE 1 + +#define VM_GP_M_MASK 0x1f +#define VM_GP_MMU_ENABLED (1 << 5) + +struct vm_guest_paging { + uint64_t ttbr0_addr; + uint64_t ttbr1_addr; + uint64_t tcr_el1; + uint64_t tcr2_el1; + int flags; + int padding; +}; + +struct vie { + uint8_t access_size:4, sign_extend:1, dir:1, unused:2; + enum vm_reg_name reg; +}; + +struct vre { + uint32_t inst_syndrome; + uint8_t dir:1, unused:7; + enum vm_reg_name reg; +}; + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_MAX +}; + +enum vm_exitcode { + VM_EXITCODE_BOGUS, + VM_EXITCODE_INST_EMUL, + VM_EXITCODE_REG_EMUL, + VM_EXITCODE_HVC, + VM_EXITCODE_SUSPENDED, + VM_EXITCODE_HYP, + VM_EXITCODE_WFI, + VM_EXITCODE_PAGING, + VM_EXITCODE_SMCCC, + VM_EXITCODE_DEBUG, + VM_EXITCODE_MAX +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; + uint64_t pc; + union { + /* + * ARM specific payload. + */ + struct { + uint32_t exception_nr; + uint32_t pad; + uint64_t esr_el2; /* Exception Syndrome Register */ + uint64_t far_el2; /* Fault Address Register */ + uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ + } hyp; + struct { + struct vre vre; + } reg_emul; + struct { + uint64_t gpa; + uint64_t esr; + } paging; + struct { + uint64_t gpa; + struct vm_guest_paging paging; + struct vie vie; + } inst_emul; + + /* + * A SMCCC call, e.g. starting a core via PSCI. + * Further arguments can be read by asking the kernel for + * all register values. + */ + struct { + uint64_t func_id; + uint64_t args[7]; + } smccc_call; + + struct { + enum vm_suspend_how how; + } suspended; + } u; +}; + +#endif /* _VMM_H_ */ diff --git a/sys/arm64/include/vmm_dev.h b/sys/arm64/include/vmm_dev.h new file mode 100644 index 000000000000..9e229665a71e --- /dev/null +++ b/sys/arm64/include/vmm_dev.h @@ -0,0 +1,272 @@ +/* + * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +int vmmdev_cleanup(void); +#endif + +struct vm_memmap { + vm_paddr_t gpa; + int segid; /* memory segment */ + vm_ooffset_t segoff; /* offset into memory segment */ + size_t len; /* mmap length */ + int prot; /* RWX */ + int flags; +}; +#define VM_MEMMAP_F_WIRED 0x01 + +struct vm_munmap { + vm_paddr_t gpa; + size_t len; +}; + +#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? (m)->name : NULL) +struct vm_memseg { + int segid; + size_t len; + char name[VM_MAX_SUFFIXLEN + 1]; +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_register_set { + int cpuid; + unsigned int count; + const int *regnums; /* enum vm_reg_name */ + uint64_t *regvals; +}; + +struct vm_run { + int cpuid; + cpuset_t *cpuset; /* CPU set storage */ + size_t cpusetsize; + struct vm_exit *vm_exit; +}; + +struct vm_exception { + int cpuid; + uint64_t esr; + uint64_t far; +}; + +struct vm_msi { + uint64_t msg; + uint64_t addr; + int bus; + int slot; + int func; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int index; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +struct vm_suspend { + enum vm_suspend_how how; +}; + +struct vm_gla2gpa { + int vcpuid; /* inputs */ + int prot; /* PROT_READ or PROT_WRITE */ + uint64_t gla; + struct vm_guest_paging paging; + int fault; /* outputs */ + uint64_t gpa; +}; + +struct vm_activate_cpu { + int vcpuid; +}; + +struct vm_cpuset { + int which; + int cpusetsize; + cpuset_t *cpus; +}; +#define VM_ACTIVE_CPUS 0 +#define VM_SUSPENDED_CPUS 1 +#define VM_DEBUG_CPUS 2 + +struct vm_vgic_version { + u_int version; + u_int flags; +}; + +struct vm_vgic_descr { + struct vm_vgic_version ver; + union { + struct { + uint64_t dist_start; + uint64_t dist_size; + uint64_t redist_start; + uint64_t redist_size; + } v3_regs; + }; +}; + +struct vm_irq { + uint32_t irq; +}; + +struct vm_cpu_topology { + uint16_t sockets; + uint16_t cores; + uint16_t threads; + uint16_t maxcpus; +}; + +enum { + /* general routines */ + IOCNUM_ABIVERS = 0, + IOCNUM_RUN = 1, + IOCNUM_SET_CAPABILITY = 2, + IOCNUM_GET_CAPABILITY = 3, + IOCNUM_SUSPEND = 4, + IOCNUM_REINIT = 5, + + /* memory apis */ + IOCNUM_GET_GPA_PMAP = 12, + IOCNUM_GLA2GPA_NOFAULT = 13, + IOCNUM_ALLOC_MEMSEG = 14, + IOCNUM_GET_MEMSEG = 15, + IOCNUM_MMAP_MEMSEG = 16, + IOCNUM_MMAP_GETNEXT = 17, + IOCNUM_MUNMAP_MEMSEG = 18, + + /* register/state accessors */ + IOCNUM_SET_REGISTER = 20, + IOCNUM_GET_REGISTER = 21, + IOCNUM_SET_REGISTER_SET = 24, + IOCNUM_GET_REGISTER_SET = 25, + + /* statistics */ + IOCNUM_VM_STATS = 50, + IOCNUM_VM_STAT_DESC = 51, + + /* CPU Topology */ + IOCNUM_SET_TOPOLOGY = 63, + IOCNUM_GET_TOPOLOGY = 64, + + /* interrupt injection */ + IOCNUM_ASSERT_IRQ = 80, + IOCNUM_DEASSERT_IRQ = 81, + IOCNUM_RAISE_MSI = 82, + IOCNUM_INJECT_EXCEPTION = 83, + + /* vm_cpuset */ + IOCNUM_ACTIVATE_CPU = 90, + IOCNUM_GET_CPUSET = 91, + IOCNUM_SUSPEND_CPU = 92, + IOCNUM_RESUME_CPU = 93, + + /* vm_attach_vgic */ + IOCNUM_GET_VGIC_VERSION = 110, + IOCNUM_ATTACH_VGIC = 111, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SUSPEND \ + _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) +#define VM_REINIT \ + _IO('v', IOCNUM_REINIT) +#define 
VM_ALLOC_MEMSEG \ + _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg) +#define VM_GET_MEMSEG \ + _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) +#define VM_MMAP_MEMSEG \ + _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) +#define VM_MMAP_GETNEXT \ + _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) +#define VM_MUNMAP_MEMSEG \ + _IOW('v', IOCNUM_MUNMAP_MEMSEG, struct vm_munmap) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_REGISTER_SET \ + _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set) +#define VM_GET_REGISTER_SET \ + _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#define VM_ASSERT_IRQ \ + _IOW('v', IOCNUM_ASSERT_IRQ, struct vm_irq) +#define VM_DEASSERT_IRQ \ + _IOW('v', IOCNUM_DEASSERT_IRQ, struct vm_irq) +#define VM_RAISE_MSI \ + _IOW('v', IOCNUM_RAISE_MSI, struct vm_msi) +#define VM_INJECT_EXCEPTION \ + _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) +#define VM_SET_TOPOLOGY \ + _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GET_TOPOLOGY \ + _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GLA2GPA_NOFAULT \ + _IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa) +#define VM_ACTIVATE_CPU \ + _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) +#define VM_GET_CPUS \ + _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_SUSPEND_CPU \ + _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu) +#define VM_RESUME_CPU \ + _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu) +#define VM_GET_VGIC_VERSION \ + _IOR('v', IOCNUM_GET_VGIC_VERSION, struct vm_vgic_version) +#define VM_ATTACH_VGIC \ + _IOW('v', IOCNUM_ATTACH_VGIC, struct vm_vgic_descr) +#endif diff --git a/sys/arm64/include/vmm_instruction_emul.h b/sys/arm64/include/vmm_instruction_emul.h new file mode 100644 index 000000000000..a295f7cce127 --- /dev/null +++ b/sys/arm64/include/vmm_instruction_emul.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +/* + * Callback functions to read and write memory regions. + */ +typedef int (*mem_region_read_t)(struct vcpu *vcpu, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); +typedef int (*mem_region_write_t)(struct vcpu *vcpu, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +/* + * Callback functions to read and write registers. + */ +typedef int (*reg_read_t)(struct vcpu *vcpu, uint64_t *rval, void *arg); +typedef int (*reg_write_t)(struct vcpu *vcpu, uint64_t wval, void *arg); + +/* + * Emulate the decoded 'vie' instruction when it contains a memory operation. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * + */ +int vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t mrr, + mem_region_write_t mrw, void *mrarg); + +/* + * Emulate the decoded 'vre' instruction when it contains a register access. + * + * The callbacks 'regread' and 'regwrite' emulate reads and writes to the + * register from 'vie'. 'regarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * + */ +int vmm_emulate_register(struct vcpu *vcpu, struct vre *vre, reg_read_t regread, + reg_write_t regwrite, void *regarg); + +#ifdef _KERNEL +void vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask, + reg_read_t reg_read, reg_write_t reg_write, void *arg); +void vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask); + +void vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size, + mem_region_read_t mmio_read, mem_region_write_t mmio_write); +void vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size); +#endif + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/sys/arm64/include/vmm_snapshot.h b/sys/arm64/include/vmm_snapshot.h new file mode 100644 index 000000000000..da23dbe43a4f --- /dev/null +++ b/sys/arm64/include/vmm_snapshot.h @@ -0,0 +1 @@ +/* $FreeBSD$ */ diff --git a/sys/arm64/vmm/arm64.h b/sys/arm64/vmm/arm64.h new file mode 100644 index 000000000000..43459d14e143 --- /dev/null +++ b/sys/arm64/vmm/arm64.h @@ -0,0 +1,165 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _VMM_ARM64_H_ +#define _VMM_ARM64_H_ + +#include <machine/reg.h> +#include <machine/hypervisor.h> +#include <machine/pcpu.h> + +#include "mmu.h" +#include "io/vgic_v3.h" +#include "io/vtimer.h" + +struct vgic_v3; +struct vgic_v3_cpu; + +struct hypctx { + struct trapframe tf; + + /* + * EL1 control registers. + */ + uint64_t elr_el1; /* Exception Link Register */ + uint64_t sp_el0; /* Stack pointer */ + uint64_t tpidr_el0; /* EL0 Software ID Register */ + uint64_t tpidrro_el0; /* Read-only Thread ID Register */ + uint64_t tpidr_el1; /* EL1 Software ID Register */ + uint64_t vbar_el1; /* Vector Base Address Register */ + + uint64_t actlr_el1; /* Auxiliary Control Register */ + uint64_t afsr0_el1; /* Auxiliary Fault Status Register 0 */ + uint64_t afsr1_el1; /* Auxiliary Fault Status Register 1 */ + uint64_t amair_el1; /* Auxiliary Memory Attribute Indirection Register */ + uint64_t contextidr_el1; /* Current Process Identifier */ + uint64_t cpacr_el1; /* Architectural Feature Access Control Register */ + uint64_t csselr_el1; /* Cache Size Selection Register */ + uint64_t esr_el1; /* Exception Syndrome Register */ + uint64_t far_el1; /* Fault Address Register */ + uint64_t mair_el1; /* Memory Attribute Indirection Register */ + uint64_t mdccint_el1; /* Monitor DCC Interrupt Enable Register */ + uint64_t mdscr_el1; /* Monitor Debug System Control Register */ + uint64_t par_el1; /* Physical Address Register */ + uint64_t sctlr_el1; /* System Control Register */ + uint64_t tcr_el1; /* Translation Control Register */ + uint64_t tcr2_el1; /* Translation Control Register 2 */ + uint64_t ttbr0_el1; /* Translation Table Base Register 0 */ + uint64_t ttbr1_el1; /* Translation Table Base Register 1 */ + uint64_t spsr_el1; /* Saved Program Status Register */ + + uint64_t pmcr_el0; /* Performance Monitors Control Register */ + uint64_t pmccntr_el0; + uint64_t pmccfiltr_el0; + uint64_t pmcntenset_el0; + uint64_t pmintenset_el1; + uint64_t pmovsset_el0; + uint64_t pmselr_el0; + uint64_t pmuserenr_el0; + uint64_t pmevcntr_el0[31]; + uint64_t pmevtyper_el0[31]; + + uint64_t dbgbcr_el1[16]; /* Debug Breakpoint Control Registers */ + uint64_t dbgbvr_el1[16]; /* Debug Breakpoint Value Registers */ + uint64_t dbgwcr_el1[16]; /* Debug Watchpoint Control Registers */ + uint64_t dbgwvr_el1[16]; /* Debug Watchpoint Value Registers */ + + /* EL2 control registers */ + uint64_t cptr_el2; /* Architectural Feature Trap Register */ + uint64_t hcr_el2; /* Hypervisor Configuration Register */ + uint64_t mdcr_el2; /* Monitor Debug 
Configuration Register */ + uint64_t vpidr_el2; /* Virtualization Processor ID Register */ + uint64_t vmpidr_el2; /* Virtualization Multiprocessor ID Register */ + uint64_t el2_addr; /* The address of this in el2 space */ + struct hyp *hyp; + struct vcpu *vcpu; + struct { + uint64_t far_el2; /* Fault Address Register */ + uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ + } exit_info; + + struct vtimer_cpu vtimer_cpu; + + struct vgic_v3_regs vgic_v3_regs; + struct vgic_v3_cpu *vgic_cpu; + bool has_exception; +}; + +struct hyp { + struct vm *vm; + struct vtimer vtimer; + uint64_t vmid_generation; + uint64_t vttbr_el2; + uint64_t el2_addr; /* The address of this in el2 space */ + bool vgic_attached; + struct vgic_v3 *vgic; + struct hypctx *ctx[]; +}; + +#define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ + ret_type vmmops_##opname args; + +DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum)) +DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) +DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) +DEFINE_VMMOPS_IFUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault)) +DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, + struct vm_eventinfo *info)) +DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) +DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, + int vcpu_id)) +DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) +DEFINE_VMMOPS_IFUNC(int, exception, (void *vcpui, uint64_t esr, uint64_t far)) +DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) +DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) +DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) +DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) +DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, + vm_offset_t max)) +DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) +#ifdef notyet +#ifdef BHYVE_SNAPSHOT +DEFINE_VMMOPS_IFUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta *meta)) +DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui, + struct vm_snapshot_meta *meta)) +DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now)) +#endif +#endif + +uint64_t vmm_call_hyp(uint64_t, ...); + +#if 0 +#define eprintf(fmt, ...) printf("%s:%d " fmt, __func__, __LINE__, ##__VA_ARGS__) +#else +#define eprintf(fmt, ...) do {} while(0) +#endif + +struct hypctx *arm64_get_active_vcpu(void); +void raise_data_insn_abort(struct hypctx *, uint64_t, bool, int); + +#endif /* !_VMM_ARM64_H_ */ diff --git a/sys/arm64/vmm/hyp.h b/sys/arm64/vmm/hyp.h new file mode 100644 index 000000000000..0b2977c73960 --- /dev/null +++ b/sys/arm64/vmm/hyp.h @@ -0,0 +1,114 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2017 Alexandru Elisei <alexandru.elisei@gmail.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_HYP_H_ +#define _VMM_HYP_H_ + +/* + * The translation tables for the hypervisor mode will hold mappings for kernel + * virtual addresses and an identity mapping (VA == PA) necessary when + * enabling/disabling the MMU. + * + * When in EL2 exception level the translation table base register is TTBR0_EL2 + * and the virtual addresses generated by the CPU must be at the bottom of the + * memory, with the first 16 bits all set to zero: + * + * 0x0000ffffffffffff End hyp address space + * 0x0000000000000000 Start of hyp address space + * + * To run code in hyp mode we need to convert kernel virtual addresses to + * addreses that fit into this address space. + * + * The kernel virtual address range is: + * + * 0xffff007fffffffff End of KVA + * 0xffff000000000000 Kernel base address & start of KVA + * + * (see /sys/arm64/include/vmparam.h). + * + * We could convert the kernel virtual addresses to valid EL2 addresses by + * setting the first 16 bits to zero and thus mapping the kernel addresses in + * the bottom half of the EL2 address space, but then they might clash with the + * identity mapping addresses. Instead we map the kernel addresses in the upper + * half of the EL2 address space. + * + * The hypervisor address space will look like this: + * + * 0x0000807fffffffff End of KVA mapping + * 0x0000800000000000 Start of KVA mapping + * + * 0x00007fffffffffff End of identity mapping + * 0x0000000000000000 Start of identity mapping + * + * With the scheme we have 47 bits at our disposable for the identity map and + * another 47 bits for the kernel virtual addresses. For a maximum physical + * memory size of 128TB we are guaranteed to not have any clashes between + * addresses. + */ +#define HYP_VM_MIN_ADDRESS 0x0000000000000000 +#define HYP_VM_MAX_ADDRESS 0x0001000000000000 + +/* + * When the vmm code is installed the following handles can be used by + * the host to call into EL2. + */ +#define HYP_CLEANUP 0x00000001 +#define HYP_ENTER_GUEST 0x00000002 +#define HYP_READ_REGISTER 0x00000003 +#define HYP_REG_ICH_VTR 0x1 +#define HYP_REG_CNTHCTL 0x2 +#define HYP_CLEAN_S2_TLBI 0x00000004 +#define HYP_DC_CIVAC 0x00000005 +#define HYP_EL2_TLBI 0x00000006 +#define HYP_EL2_TLBI_ALL 0x1 +#define HYP_EL2_TLBI_VA 0x2 +#define HYP_S2_TLBI_RANGE 0x00000010 +#define HYP_S2_TLBI_ALL 0x00000011 + +/* + * When taking asynchronous exceptions, or interrupts, with the exception of the + * SError interrupt, the exception syndrome register is not updated with the + * exception code. We need to differentiate between the different exception + * types taken to EL2. 
+ */ +#define EXCP_TYPE_EL1_SYNC 0 +#define EXCP_TYPE_EL1_IRQ 1 +#define EXCP_TYPE_EL1_FIQ 2 +#define EXCP_TYPE_EL1_ERROR 3 + +#define EXCP_TYPE_EL2_SYNC 4 +#define EXCP_TYPE_EL2_IRQ 5 +#define EXCP_TYPE_EL2_FIQ 6 +#define EXCP_TYPE_EL2_ERROR 7 + +#define EXCP_TYPE_MAINT_IRQ 8 +/* Used internally in vmm_hyp.c */ +#define EXCP_TYPE_REENTER 9 + +#define HYP_GET_VECTOR_TABLE -1 + +#endif /* !_VMM_HYP_H_ */ diff --git a/sys/arm64/vmm/io/vgic.c b/sys/arm64/vmm/io/vgic.c new file mode 100644 index 000000000000..ee841292ed33 --- /dev/null +++ b/sys/arm64/vmm/io/vgic.c @@ -0,0 +1,122 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Arm Ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/bus.h> + +#include "vgic.h" +#include "vgic_if.h" + +device_t vgic_dev; + +bool +vgic_present(void) +{ + return (vgic_dev != NULL); +} + +void +vgic_init(void) +{ + VGIC_INIT(vgic_dev); +} + +int +vgic_attach_to_vm(struct hyp *hyp, struct vm_vgic_descr *descr) +{ + return (VGIC_ATTACH_TO_VM(vgic_dev, hyp, descr)); +} + +void +vgic_detach_from_vm(struct hyp *hyp) +{ + VGIC_DETACH_FROM_VM(vgic_dev, hyp); +} + +void +vgic_vminit(struct hyp *hyp) +{ + VGIC_VMINIT(vgic_dev, hyp); +} + +void +vgic_cpuinit(struct hypctx *hypctx) +{ + VGIC_CPUINIT(vgic_dev, hypctx); +} + +void +vgic_cpucleanup(struct hypctx *hypctx) +{ + VGIC_CPUCLEANUP(vgic_dev, hypctx); +} + +void +vgic_vmcleanup(struct hyp *hyp) +{ + VGIC_VMCLEANUP(vgic_dev, hyp); +} + +int +vgic_max_cpu_count(struct hyp *hyp) +{ + return (VGIC_MAX_CPU_COUNT(vgic_dev, hyp)); +} + +bool +vgic_has_pending_irq(struct hypctx *hypctx) +{ + return (VGIC_HAS_PENDING_IRQ(vgic_dev, hypctx)); +} + +/* TODO: vcpuid -> hypctx ? 
*/ +/* TODO: Add a vgic interface */ +int +vgic_inject_irq(struct hyp *hyp, int vcpuid, uint32_t irqid, bool level) +{ + return (VGIC_INJECT_IRQ(vgic_dev, hyp, vcpuid, irqid, level)); +} + +int +vgic_inject_msi(struct hyp *hyp, uint64_t msg, uint64_t addr) +{ + return (VGIC_INJECT_MSI(vgic_dev, hyp, msg, addr)); +} + +void +vgic_flush_hwstate(struct hypctx *hypctx) +{ + VGIC_FLUSH_HWSTATE(vgic_dev, hypctx); +} + +void +vgic_sync_hwstate(struct hypctx *hypctx) +{ + VGIC_SYNC_HWSTATE(vgic_dev, hypctx); +} diff --git a/sys/arm64/vmm/io/vgic.h b/sys/arm64/vmm/io/vgic.h new file mode 100644 index 000000000000..b9b9f1e39f9c --- /dev/null +++ b/sys/arm64/vmm/io/vgic.h @@ -0,0 +1,52 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Arm Ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VGIC_H_ +#define _VGIC_H_ + +struct hyp; +struct hypctx; +struct vm_vgic_descr; + +extern device_t vgic_dev; + +bool vgic_present(void); +void vgic_init(void); +int vgic_attach_to_vm(struct hyp *hyp, struct vm_vgic_descr *descr); +void vgic_detach_from_vm(struct hyp *hyp); +void vgic_vminit(struct hyp *hyp); +void vgic_cpuinit(struct hypctx *hypctx); +void vgic_cpucleanup(struct hypctx *hypctx); +void vgic_vmcleanup(struct hyp *hyp); +int vgic_max_cpu_count(struct hyp *hyp); +bool vgic_has_pending_irq(struct hypctx *hypctx); +int vgic_inject_irq(struct hyp *hyp, int vcpuid, uint32_t irqid, bool level); +int vgic_inject_msi(struct hyp *hyp, uint64_t msg, uint64_t addr); +void vgic_flush_hwstate(struct hypctx *hypctx); +void vgic_sync_hwstate(struct hypctx *hypctx); + +#endif /* _VGIC_H_ */ diff --git a/sys/arm64/vmm/io/vgic_if.m b/sys/arm64/vmm/io/vgic_if.m new file mode 100644 index 000000000000..571e133cd74b --- /dev/null +++ b/sys/arm64/vmm/io/vgic_if.m @@ -0,0 +1,104 @@ +#- +# SPDX-License-Identifier: BSD-2-Clause +# +# Copyright (c) 2023 Arm Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# + +INTERFACE vgic; + +HEADER { + struct hyp; + struct hypctx; + struct vm_vgic_descr; +}; + +METHOD void init { + device_t dev; +} + +METHOD int attach_to_vm { + device_t dev; + struct hyp *hyp; + struct vm_vgic_descr *descr; +}; + +METHOD void detach_from_vm { + device_t dev; + struct hyp *hyp; +} + +METHOD void vminit { + device_t dev; + struct hyp *hyp; +} + +METHOD void cpuinit { + device_t dev; + struct hypctx *hypctx; +} + +METHOD void cpucleanup { + device_t dev; + struct hypctx *hypctx; +} + +METHOD void vmcleanup { + device_t dev; + struct hyp *hyp; +} + +METHOD int max_cpu_count { + device_t dev; + struct hyp *hyp; +} + +METHOD bool has_pending_irq { + device_t dev; + struct hypctx *hypctx; +} + +METHOD int inject_irq { + device_t dev; + struct hyp *hyp; + int vcpuid; + uint32_t irqid; + bool level; +} + +METHOD int inject_msi { + device_t dev; + struct hyp *hyp; + uint64_t msg; + uint64_t addr; +} + +METHOD void flush_hwstate { + device_t dev; + struct hypctx *hypctx; +} + +METHOD void sync_hwstate { + device_t dev; + struct hypctx *hypctx; +} diff --git a/sys/arm64/vmm/io/vgic_v3.c b/sys/arm64/vmm/io/vgic_v3.c new file mode 100644 index 000000000000..7ed591c409ba --- /dev/null +++ b/sys/arm64/vmm/io/vgic_v3.c @@ -0,0 +1,2348 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2018 Alexandru Elisei <alexandru.elisei@gmail.com> + * Copyright (C) 2020-2022 Andrew Turner + * Copyright (C) 2023 Arm Ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/bitstring.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/rman.h> +#include <sys/smp.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <dev/ofw/openfirm.h> + +#include <machine/armreg.h> +#include <machine/atomic.h> +#include <machine/bus.h> +#include <machine/cpufunc.h> +#include <machine/cpu.h> +#include <machine/machdep.h> +#include <machine/param.h> +#include <machine/pmap.h> +#include <machine/vmparam.h> +#include <machine/intr.h> +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/vmm_instruction_emul.h> + +#include <arm/arm/gic_common.h> +#include <arm64/arm64/gic_v3_reg.h> +#include <arm64/arm64/gic_v3_var.h> + +#include <arm64/vmm/hyp.h> +#include <arm64/vmm/mmu.h> +#include <arm64/vmm/arm64.h> + +#include "vgic.h" +#include "vgic_v3.h" +#include "vgic_v3_reg.h" + +#include "vgic_if.h" + +#define VGIC_SGI_NUM (GIC_LAST_SGI - GIC_FIRST_SGI + 1) +#define VGIC_PPI_NUM (GIC_LAST_PPI - GIC_FIRST_PPI + 1) +#define VGIC_SPI_NUM (GIC_LAST_SPI - GIC_FIRST_SPI + 1) +#define VGIC_PRV_I_NUM (VGIC_SGI_NUM + VGIC_PPI_NUM) +#define VGIC_SHR_I_NUM (VGIC_SPI_NUM) + +MALLOC_DEFINE(M_VGIC_V3, "ARM VMM VGIC V3", "ARM VMM VGIC V3"); + +/* TODO: Move to softc */ +struct vgic_v3_virt_features { + uint8_t min_prio; + size_t ich_lr_num; + size_t ich_apr_num; +}; + +struct vgic_v3_irq { + /* List of IRQs that are active or pending */ + TAILQ_ENTRY(vgic_v3_irq) act_pend_list; + struct mtx irq_spinmtx; + uint64_t mpidr; + int target_vcpu; + uint32_t irq; + bool active; + bool pending; + bool enabled; + bool level; + bool on_aplist; + uint8_t priority; + uint8_t config; +#define VGIC_CONFIG_MASK 0x2 +#define VGIC_CONFIG_LEVEL 0x0 +#define VGIC_CONFIG_EDGE 0x2 +}; + +/* Global data not needed by EL2 */ +struct vgic_v3 { + struct mtx dist_mtx; + uint64_t dist_start; + size_t dist_end; + + uint64_t redist_start; + size_t redist_end; + + uint32_t gicd_ctlr; /* Distributor Control Register */ + + struct vgic_v3_irq *irqs; +}; + +/* Per-CPU data not needed by EL2 */ +struct vgic_v3_cpu { + /* + * We need a mutex for accessing the list registers because they are + * modified asynchronously by the virtual timer. + * + * Note that the mutex *MUST* be a spin mutex because an interrupt can + * be injected by a callout callback function, thereby modifying the + * list registers from a context where sleeping is forbidden. + */ + struct mtx lr_mtx; + + struct vgic_v3_irq private_irqs[VGIC_PRV_I_NUM]; + TAILQ_HEAD(, vgic_v3_irq) irq_act_pend; + u_int ich_lr_used; +}; + +/* How many IRQs we support (SGIs + PPIs + SPIs). 
Not including LPIs */ +#define VGIC_NIRQS 1023 +/* Pretend to be an Arm design */ +#define VGIC_IIDR 0x43b + +static vgic_inject_irq_t vgic_v3_inject_irq; +static vgic_inject_msi_t vgic_v3_inject_msi; + +static int vgic_v3_max_cpu_count(device_t dev, struct hyp *hyp); + +#define INJECT_IRQ(hyp, vcpuid, irqid, level) \ + vgic_v3_inject_irq(NULL, (hyp), (vcpuid), (irqid), (level)) + +typedef void (register_read)(struct hypctx *, u_int, uint64_t *, void *); +typedef void (register_write)(struct hypctx *, u_int, u_int, u_int, + uint64_t, void *); + +#define VGIC_8_BIT (1 << 0) +/* (1 << 1) is reserved for 16 bit accesses */ +#define VGIC_32_BIT (1 << 2) +#define VGIC_64_BIT (1 << 3) + +struct vgic_register { + u_int start; /* Start within a memory region */ + u_int end; + u_int size; + u_int flags; + register_read *read; + register_write *write; +}; + +#define VGIC_REGISTER_RANGE(reg_start, reg_end, reg_size, reg_flags, readf, \ + writef) \ +{ \ + .start = (reg_start), \ + .end = (reg_end), \ + .size = (reg_size), \ + .flags = (reg_flags), \ + .read = (readf), \ + .write = (writef), \ +} + +#define VGIC_REGISTER_RANGE_RAZ_WI(reg_start, reg_end, reg_size, reg_flags) \ + VGIC_REGISTER_RANGE(reg_start, reg_end, reg_size, reg_flags, \ + gic_zero_read, gic_ignore_write) + +#define VGIC_REGISTER(start_addr, reg_size, reg_flags, readf, writef) \ + VGIC_REGISTER_RANGE(start_addr, (start_addr) + (reg_size), \ + reg_size, reg_flags, readf, writef) + +#define VGIC_REGISTER_RAZ_WI(start_addr, reg_size, reg_flags) \ + VGIC_REGISTER_RANGE_RAZ_WI(start_addr, \ + (start_addr) + (reg_size), reg_size, reg_flags) + +static register_read gic_pidr2_read; +static register_read gic_zero_read; +static register_write gic_ignore_write; + +/* GICD_CTLR */ +static register_read dist_ctlr_read; +static register_write dist_ctlr_write; +/* GICD_TYPER */ +static register_read dist_typer_read; +/* GICD_IIDR */ +static register_read dist_iidr_read; +/* GICD_STATUSR - RAZ/WI as we don't report errors (yet) */ +/* GICD_SETSPI_NSR & GICD_CLRSPI_NSR */ +static register_write dist_setclrspi_nsr_write; +/* GICD_SETSPI_SR - RAZ/WI */ +/* GICD_CLRSPI_SR - RAZ/WI */ +/* GICD_IGROUPR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_ISENABLER */ +static register_read dist_isenabler_read; +static register_write dist_isenabler_write; +/* GICD_ICENABLER */ +static register_read dist_icenabler_read; +static register_write dist_icenabler_write; +/* GICD_ISPENDR */ +static register_read dist_ispendr_read; +static register_write dist_ispendr_write; +/* GICD_ICPENDR */ +static register_read dist_icpendr_read; +static register_write dist_icpendr_write; +/* GICD_ISACTIVER */ +static register_read dist_isactiver_read; +static register_write dist_isactiver_write; +/* GICD_ICACTIVER */ +static register_read dist_icactiver_read; +static register_write dist_icactiver_write; +/* GICD_IPRIORITYR */ +static register_read dist_ipriorityr_read; +static register_write dist_ipriorityr_write; +/* GICD_ITARGETSR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_ICFGR */ +static register_read dist_icfgr_read; +static register_write dist_icfgr_write; +/* GICD_IGRPMODR - RAZ/WI from non-secure mode */ +/* GICD_NSACR - RAZ/WI from non-secure mode */ +/* GICD_SGIR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_CPENDSGIR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_SPENDSGIR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_IROUTER */ +static register_read dist_irouter_read; +static register_write dist_irouter_write; + +static struct vgic_register dist_registers[] = { + 
VGIC_REGISTER(GICD_CTLR, 4, VGIC_32_BIT, dist_ctlr_read, + dist_ctlr_write), + VGIC_REGISTER(GICD_TYPER, 4, VGIC_32_BIT, dist_typer_read, + gic_ignore_write), + VGIC_REGISTER(GICD_IIDR, 4, VGIC_32_BIT, dist_iidr_read, + gic_ignore_write), + VGIC_REGISTER_RAZ_WI(GICD_STATUSR, 4, VGIC_32_BIT), + VGIC_REGISTER(GICD_SETSPI_NSR, 4, VGIC_32_BIT, gic_zero_read, + dist_setclrspi_nsr_write), + VGIC_REGISTER(GICD_CLRSPI_NSR, 4, VGIC_32_BIT, gic_zero_read, + dist_setclrspi_nsr_write), + VGIC_REGISTER_RAZ_WI(GICD_SETSPI_SR, 4, VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICD_CLRSPI_SR, 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE_RAZ_WI(GICD_IGROUPR(0), GICD_IGROUPR(1024), 4, + VGIC_32_BIT), + + VGIC_REGISTER_RAZ_WI(GICD_ISENABLER(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ISENABLER(32), GICD_ISENABLER(1024), 4, + VGIC_32_BIT, dist_isenabler_read, dist_isenabler_write), + + VGIC_REGISTER_RAZ_WI(GICD_ICENABLER(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ICENABLER(32), GICD_ICENABLER(1024), 4, + VGIC_32_BIT, dist_icenabler_read, dist_icenabler_write), + + VGIC_REGISTER_RAZ_WI(GICD_ISPENDR(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ISPENDR(32), GICD_ISPENDR(1024), 4, + VGIC_32_BIT, dist_ispendr_read, dist_ispendr_write), + + VGIC_REGISTER_RAZ_WI(GICD_ICPENDR(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ICPENDR(32), GICD_ICPENDR(1024), 4, + VGIC_32_BIT, dist_icpendr_read, dist_icpendr_write), + + VGIC_REGISTER_RAZ_WI(GICD_ISACTIVER(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ISACTIVER(32), GICD_ISACTIVER(1024), 4, + VGIC_32_BIT, dist_isactiver_read, dist_isactiver_write), + + VGIC_REGISTER_RAZ_WI(GICD_ICACTIVER(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ICACTIVER(32), GICD_ICACTIVER(1024), 4, + VGIC_32_BIT, dist_icactiver_read, dist_icactiver_write), + + VGIC_REGISTER_RANGE_RAZ_WI(GICD_IPRIORITYR(0), GICD_IPRIORITYR(32), 4, + VGIC_32_BIT | VGIC_8_BIT), + VGIC_REGISTER_RANGE(GICD_IPRIORITYR(32), GICD_IPRIORITYR(1024), 4, + VGIC_32_BIT | VGIC_8_BIT, dist_ipriorityr_read, + dist_ipriorityr_write), + + VGIC_REGISTER_RANGE_RAZ_WI(GICD_ITARGETSR(0), GICD_ITARGETSR(1024), 4, + VGIC_32_BIT | VGIC_8_BIT), + + VGIC_REGISTER_RANGE_RAZ_WI(GICD_ICFGR(0), GICD_ICFGR(32), 4, + VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ICFGR(32), GICD_ICFGR(1024), 4, + VGIC_32_BIT, dist_icfgr_read, dist_icfgr_write), +/* + VGIC_REGISTER_RANGE(GICD_IGRPMODR(0), GICD_IGRPMODR(1024), 4, + VGIC_32_BIT, dist_igrpmodr_read, dist_igrpmodr_write), + VGIC_REGISTER_RANGE(GICD_NSACR(0), GICD_NSACR(1024), 4, + VGIC_32_BIT, dist_nsacr_read, dist_nsacr_write), +*/ + VGIC_REGISTER_RAZ_WI(GICD_SGIR, 4, VGIC_32_BIT), +/* + VGIC_REGISTER_RANGE(GICD_CPENDSGIR(0), GICD_CPENDSGIR(1024), 4, + VGIC_32_BIT | VGIC_8_BIT, dist_cpendsgir_read, + dist_cpendsgir_write), + VGIC_REGISTER_RANGE(GICD_SPENDSGIR(0), GICD_SPENDSGIR(1024), 4, + VGIC_32_BIT | VGIC_8_BIT, dist_spendsgir_read, + dist_spendsgir_write), +*/ + VGIC_REGISTER_RANGE(GICD_IROUTER(32), GICD_IROUTER(1024), 8, + VGIC_64_BIT | VGIC_32_BIT, dist_irouter_read, dist_irouter_write), + + VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR4, GICD_PIDR2, 4, VGIC_32_BIT), + VGIC_REGISTER(GICD_PIDR2, 4, VGIC_32_BIT, gic_pidr2_read, + gic_ignore_write), + VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR2 + 4, GICD_SIZE, 4, VGIC_32_BIT), +}; + +/* GICR_CTLR - Ignore writes as no bits can be set */ +static register_read redist_ctlr_read; +/* GICR_IIDR */ +static register_read redist_iidr_read; +/* GICR_TYPER */ +static register_read redist_typer_read; +/* GICR_STATUSR - RAZ/WI as we don't report errors (yet) */ +/* GICR_WAKER - 
RAZ/WI from non-secure mode */ +/* GICR_SETLPIR - RAZ/WI as no LPIs are supported */ +/* GICR_CLRLPIR - RAZ/WI as no LPIs are supported */ +/* GICR_PROPBASER - RAZ/WI as no LPIs are supported */ +/* GICR_PENDBASER - RAZ/WI as no LPIs are supported */ +/* GICR_INVLPIR - RAZ/WI as no LPIs are supported */ +/* GICR_INVALLR - RAZ/WI as no LPIs are supported */ +/* GICR_SYNCR - RAZ/WI as no LPIs are supported */ + +static struct vgic_register redist_rd_registers[] = { + VGIC_REGISTER(GICR_CTLR, 4, VGIC_32_BIT, redist_ctlr_read, + gic_ignore_write), + VGIC_REGISTER(GICR_IIDR, 4, VGIC_32_BIT, redist_iidr_read, + gic_ignore_write), + VGIC_REGISTER(GICR_TYPER, 8, VGIC_64_BIT | VGIC_32_BIT, + redist_typer_read, gic_ignore_write), + VGIC_REGISTER_RAZ_WI(GICR_STATUSR, 4, VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_WAKER, 4, VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_SETLPIR, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_CLRLPIR, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_PROPBASER, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_PENDBASER, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_INVLPIR, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_INVALLR, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_SYNCR, 4, VGIC_32_BIT), + + /* These are identical to the dist registers */ + VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR4, GICD_PIDR2, 4, VGIC_32_BIT), + VGIC_REGISTER(GICD_PIDR2, 4, VGIC_32_BIT, gic_pidr2_read, + gic_ignore_write), + VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR2 + 4, GICD_SIZE, 4, + VGIC_32_BIT), +}; + +/* GICR_IGROUPR0 - RAZ/WI from non-secure mode */ +/* GICR_ISENABLER0 */ +static register_read redist_ienabler0_read; +static register_write redist_isenabler0_write; +/* GICR_ICENABLER0 */ +static register_write redist_icenabler0_write; +/* GICR_ISPENDR0 */ +static register_read redist_ipendr0_read; +static register_write redist_ispendr0_write; +/* GICR_ICPENDR0 */ +static register_write redist_icpendr0_write; +/* GICR_ISACTIVER0 */ +static register_read redist_iactiver0_read; +static register_write redist_isactiver0_write; +/* GICR_ICACTIVER0 */ +static register_write redist_icactiver0_write; +/* GICR_IPRIORITYR */ +static register_read redist_ipriorityr_read; +static register_write redist_ipriorityr_write; +/* GICR_ICFGR0 - RAZ/WI from non-secure mode */ +/* GICR_ICFGR1 */ +static register_read redist_icfgr1_read; +static register_write redist_icfgr1_write; +/* GICR_IGRPMODR0 - RAZ/WI from non-secure mode */ +/* GICR_NSCAR - RAZ/WI from non-secure mode */ + +static struct vgic_register redist_sgi_registers[] = { + VGIC_REGISTER_RAZ_WI(GICR_IGROUPR0, 4, VGIC_32_BIT), + VGIC_REGISTER(GICR_ISENABLER0, 4, VGIC_32_BIT, redist_ienabler0_read, + redist_isenabler0_write), + VGIC_REGISTER(GICR_ICENABLER0, 4, VGIC_32_BIT, redist_ienabler0_read, + redist_icenabler0_write), + VGIC_REGISTER(GICR_ISPENDR0, 4, VGIC_32_BIT, redist_ipendr0_read, + redist_ispendr0_write), + VGIC_REGISTER(GICR_ICPENDR0, 4, VGIC_32_BIT, redist_ipendr0_read, + redist_icpendr0_write), + VGIC_REGISTER(GICR_ISACTIVER0, 4, VGIC_32_BIT, redist_iactiver0_read, + redist_isactiver0_write), + VGIC_REGISTER(GICR_ICACTIVER0, 4, VGIC_32_BIT, redist_iactiver0_read, + redist_icactiver0_write), + VGIC_REGISTER_RANGE(GICR_IPRIORITYR(0), GICR_IPRIORITYR(32), 4, + VGIC_32_BIT | VGIC_8_BIT, redist_ipriorityr_read, + redist_ipriorityr_write), + VGIC_REGISTER_RAZ_WI(GICR_ICFGR0, 4, VGIC_32_BIT), + VGIC_REGISTER(GICR_ICFGR1, 4, VGIC_32_BIT, redist_icfgr1_read, + redist_icfgr1_write), + 
VGIC_REGISTER_RAZ_WI(GICR_IGRPMODR0, 4, VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_NSACR, 4, VGIC_32_BIT), +}; + +static struct vgic_v3_virt_features virt_features; + +static struct vgic_v3_irq *vgic_v3_get_irq(struct hyp *, int, uint32_t); +static void vgic_v3_release_irq(struct vgic_v3_irq *); + +/* TODO: Move to a common file */ +static int +mpidr_to_vcpu(struct hyp *hyp, uint64_t mpidr) +{ + struct vm *vm; + struct hypctx *hypctx; + + vm = hyp->vm; + for (int i = 0; i < vm_get_maxcpus(vm); i++) { + hypctx = hyp->ctx[i]; + if (hypctx != NULL && (hypctx->vmpidr_el2 & GICD_AFF) == mpidr) + return (i); + } + return (-1); +} + +static void +vgic_v3_vminit(device_t dev, struct hyp *hyp) +{ + struct vgic_v3 *vgic; + + hyp->vgic = malloc(sizeof(*hyp->vgic), M_VGIC_V3, + M_WAITOK | M_ZERO); + vgic = hyp->vgic; + + /* + * Configure the Distributor control register. The register resets to an + * architecturally UNKNOWN value, so we reset to 0 to disable all + * functionality controlled by the register. + * + * The exception is GICD_CTLR.DS, which is RA0/WI when the Distributor + * supports one security state (ARM GIC Architecture Specification for + * GICv3 and GICv4, p. 4-464) + */ + vgic->gicd_ctlr = 0; + + mtx_init(&vgic->dist_mtx, "VGICv3 Distributor lock", NULL, + MTX_SPIN); +} + +static void +vgic_v3_cpuinit(device_t dev, struct hypctx *hypctx) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + int i, irqid; + + hypctx->vgic_cpu = malloc(sizeof(*hypctx->vgic_cpu), + M_VGIC_V3, M_WAITOK | M_ZERO); + vgic_cpu = hypctx->vgic_cpu; + + mtx_init(&vgic_cpu->lr_mtx, "VGICv3 ICH_LR_EL2 lock", NULL, MTX_SPIN); + + /* Set the SGI and PPI state */ + for (irqid = 0; irqid < VGIC_PRV_I_NUM; irqid++) { + irq = &vgic_cpu->private_irqs[irqid]; + + mtx_init(&irq->irq_spinmtx, "VGIC IRQ spinlock", NULL, + MTX_SPIN); + irq->irq = irqid; + irq->mpidr = hypctx->vmpidr_el2 & GICD_AFF; + irq->target_vcpu = vcpu_vcpuid(hypctx->vcpu); + MPASS(irq->target_vcpu >= 0); + + if (irqid < VGIC_SGI_NUM) { + /* SGIs */ + irq->enabled = true; + irq->config = VGIC_CONFIG_EDGE; + } else { + /* PPIs */ + irq->config = VGIC_CONFIG_LEVEL; + } + irq->priority = 0; + } + + /* + * Configure the Interrupt Controller Hyp Control Register. + * + * ICH_HCR_EL2_En: enable virtual CPU interface. + * + * Maintenance interrupts are disabled. + */ + hypctx->vgic_v3_regs.ich_hcr_el2 = ICH_HCR_EL2_En; + + /* + * Configure the Interrupt Controller Virtual Machine Control Register. + * + * ICH_VMCR_EL2_VPMR: lowest priority mask for the VCPU interface + * ICH_VMCR_EL2_VBPR1_NO_PREEMPTION: disable interrupt preemption for + * Group 1 interrupts + * ICH_VMCR_EL2_VBPR0_NO_PREEMPTION: disable interrupt preemption for + * Group 0 interrupts + * ~ICH_VMCR_EL2_VEOIM: writes to EOI registers perform priority drop + * and interrupt deactivation. + * ICH_VMCR_EL2_VENG0: virtual Group 0 interrupts enabled. + * ICH_VMCR_EL2_VENG1: virtual Group 1 interrupts enabled. 
+ */ + hypctx->vgic_v3_regs.ich_vmcr_el2 = + (virt_features.min_prio << ICH_VMCR_EL2_VPMR_SHIFT) | + ICH_VMCR_EL2_VBPR1_NO_PREEMPTION | ICH_VMCR_EL2_VBPR0_NO_PREEMPTION; + hypctx->vgic_v3_regs.ich_vmcr_el2 &= ~ICH_VMCR_EL2_VEOIM; + hypctx->vgic_v3_regs.ich_vmcr_el2 |= ICH_VMCR_EL2_VENG0 | + ICH_VMCR_EL2_VENG1; + + hypctx->vgic_v3_regs.ich_lr_num = virt_features.ich_lr_num; + for (i = 0; i < hypctx->vgic_v3_regs.ich_lr_num; i++) + hypctx->vgic_v3_regs.ich_lr_el2[i] = 0UL; + vgic_cpu->ich_lr_used = 0; + TAILQ_INIT(&vgic_cpu->irq_act_pend); + + hypctx->vgic_v3_regs.ich_apr_num = virt_features.ich_apr_num; +} + +static void +vgic_v3_cpucleanup(device_t dev, struct hypctx *hypctx) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + int irqid; + + vgic_cpu = hypctx->vgic_cpu; + for (irqid = 0; irqid < VGIC_PRV_I_NUM; irqid++) { + irq = &vgic_cpu->private_irqs[irqid]; + mtx_destroy(&irq->irq_spinmtx); + } + + mtx_destroy(&vgic_cpu->lr_mtx); + free(hypctx->vgic_cpu, M_VGIC_V3); +} + +static void +vgic_v3_vmcleanup(device_t dev, struct hyp *hyp) +{ + mtx_destroy(&hyp->vgic->dist_mtx); + free(hyp->vgic, M_VGIC_V3); +} + +static int +vgic_v3_max_cpu_count(device_t dev, struct hyp *hyp) +{ + struct vgic_v3 *vgic; + size_t count; + int16_t max_count; + + vgic = hyp->vgic; + max_count = vm_get_maxcpus(hyp->vm); + + /* No registers, assume the maximum CPUs */ + if (vgic->redist_start == 0 && vgic->redist_end == 0) + return (max_count); + + count = (vgic->redist_end - vgic->redist_start) / + (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE); + + /* + * max_count is smaller than INT_MAX so will also limit count + * to a positive integer value. + */ + if (count > max_count) + return (max_count); + + return (count); +} + +static bool +vgic_v3_irq_pending(struct vgic_v3_irq *irq) +{ + if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_LEVEL) { + return (irq->pending || irq->level); + } else { + return (irq->pending); + } +} + +static bool +vgic_v3_queue_irq(struct hyp *hyp, struct vgic_v3_cpu *vgic_cpu, + int vcpuid, struct vgic_v3_irq *irq) +{ + MPASS(vcpuid >= 0); + MPASS(vcpuid < vm_get_maxcpus(hyp->vm)); + + mtx_assert(&vgic_cpu->lr_mtx, MA_OWNED); + mtx_assert(&irq->irq_spinmtx, MA_OWNED); + + /* No need to queue the IRQ */ + if (!irq->level && !irq->pending) + return (false); + + if (!irq->on_aplist) { + irq->on_aplist = true; + TAILQ_INSERT_TAIL(&vgic_cpu->irq_act_pend, irq, act_pend_list); + } + return (true); +} + +static uint64_t +gic_reg_value_64(uint64_t field, uint64_t val, u_int offset, u_int size) +{ + uint32_t mask; + + if (offset != 0 || size != 8) { + mask = ((1ul << (size * 8)) - 1) << (offset * 8); + /* Shift the new bits to the correct place */ + val <<= (offset * 8); + /* Keep only the interesting bits */ + val &= mask; + /* Add the bits we are keeping from the old value */ + val |= field & ~mask; + } + + return (val); +} + +static void +gic_pidr2_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = GICR_PIDR2_ARCH_GICv3 << GICR_PIDR2_ARCH_SHIFT; +} + +/* Common read-only/write-ignored helpers */ +static void +gic_zero_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = 0; +} + +static void +gic_ignore_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + /* Nothing to do */ +} + +static uint64_t +read_enabler(struct hypctx *hypctx, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + irq = 
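/*
 * Each GICD_I[SC]ENABLER, I[SC]PENDR and I[SC]ACTIVER bank n covers the
 * 32 interrupt IDs starting at irq_base = n * 32, one bit per INTID.
 * For illustration (hypothetical values):
 *
 *	n = 1          -> INTIDs 32..63, the first SPI bank
 *	n = 3, bit 5   -> INTID 3 * 32 + 5 = 101
 *
 * Bank 0 (SGIs and PPIs) never reaches these helpers from the
 * Distributor; it is handled through the per-vCPU Redistributor
 * registers instead.
 */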
vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + if (!irq->enabled) + ret |= 1u << i; + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static void +write_enabler(struct hypctx *hypctx,int n, bool set, uint64_t val) +{ + struct vgic_v3_irq *irq; + uint32_t irq_base; + int i; + + irq_base = n * 32; + for (i = 0; i < 32; i++) { + /* We only change interrupts when the appropriate bit is set */ + if ((val & (1u << i)) == 0) + continue; + + /* Find the interrupt this bit represents */ + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + irq->enabled = set; + vgic_v3_release_irq(irq); + } +} + +static uint64_t +read_pendr(struct hypctx *hypctx, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + if (vgic_v3_irq_pending(irq)) + ret |= 1u << i; + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static uint64_t +write_pendr(struct hypctx *hypctx, int n, bool set, uint64_t val) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + struct hyp *hyp; + struct hypctx *target_hypctx; + uint64_t ret; + uint32_t irq_base; + int target_vcpu, i; + bool notify; + + hyp = hypctx->hyp; + ret = 0; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + /* We only change interrupts when the appropriate bit is set */ + if ((val & (1u << i)) == 0) + continue; + + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + notify = false; + target_vcpu = irq->target_vcpu; + if (target_vcpu < 0) + goto next_irq; + target_hypctx = hyp->ctx[target_vcpu]; + if (target_hypctx == NULL) + goto next_irq; + vgic_cpu = target_hypctx->vgic_cpu; + + if (!set) { + /* pending -> not pending */ + irq->pending = false; + } else { + irq->pending = true; + mtx_lock_spin(&vgic_cpu->lr_mtx); + notify = vgic_v3_queue_irq(hyp, vgic_cpu, target_vcpu, + irq); + mtx_unlock_spin(&vgic_cpu->lr_mtx); + } +next_irq: + vgic_v3_release_irq(irq); + + if (notify) + vcpu_notify_event(vm_vcpu(hyp->vm, target_vcpu)); + } + + return (ret); +} + +static uint64_t +read_activer(struct hypctx *hypctx, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + if (irq->active) + ret |= 1u << i; + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static void +write_activer(struct hypctx *hypctx, u_int n, bool set, uint64_t val) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + struct hyp *hyp; + struct hypctx *target_hypctx; + uint32_t irq_base; + int target_vcpu, i; + bool notify; + + hyp = hypctx->hyp; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + /* We only change interrupts when the appropriate bit is set */ + if ((val & (1u << i)) == 0) + continue; + + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + notify = false; + target_vcpu = irq->target_vcpu; + if (target_vcpu < 0) + goto next_irq; + target_hypctx = hyp->ctx[target_vcpu]; + if (target_hypctx == NULL) + goto next_irq; + vgic_cpu = target_hypctx->vgic_cpu; + + if (!set) { + /* active -> not active */ + irq->active = 
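/*
 * As with the other set/clear register pairs, only bits written as one
 * take effect; zero bits leave the corresponding interrupt untouched.
 * For example, a hypothetical write of 0x00000011 to a GICD_ICACTIVER
 * bank clears the active state of bits 0 and 4 of that bank only.  For
 * the set variants the target vcpu is additionally notified once the
 * interrupt has been queued on its active/pending list.
 */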
false; + } else { + /* not active -> active */ + irq->active = true; + mtx_lock_spin(&vgic_cpu->lr_mtx); + notify = vgic_v3_queue_irq(hyp, vgic_cpu, target_vcpu, + irq); + mtx_unlock_spin(&vgic_cpu->lr_mtx); + } +next_irq: + vgic_v3_release_irq(irq); + + if (notify) + vcpu_notify_event(vm_vcpu(hyp->vm, target_vcpu)); + } +} + +static uint64_t +read_priorityr(struct hypctx *hypctx, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 4; + for (i = 0; i < 4; i++) { + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + ret |= ((uint64_t)irq->priority) << (i * 8); + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static void +write_priorityr(struct hypctx *hypctx, u_int irq_base, u_int size, uint64_t val) +{ + struct vgic_v3_irq *irq; + int i; + + for (i = 0; i < size; i++) { + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + /* Set the priority. We support 32 priority steps (5 bits) */ + irq->priority = (val >> (i * 8)) & 0xf8; + vgic_v3_release_irq(irq); + } +} + +static uint64_t +read_config(struct hypctx *hypctx, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 16; + for (i = 0; i < 16; i++) { + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + ret |= ((uint64_t)irq->config) << (i * 2); + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static void +write_config(struct hypctx *hypctx, int n, uint64_t val) +{ + struct vgic_v3_irq *irq; + uint32_t irq_base; + int i; + + irq_base = n * 16; + for (i = 0; i < 16; i++) { + /* + * The config can't be changed for SGIs and PPIs. SGIs have + * an edge-triggered behaviour, and the register is + * implementation defined to be read-only for PPIs. + */ + if (irq_base + i < VGIC_PRV_I_NUM) + continue; + + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + /* Bit 0 is RES0 */ + irq->config = (val >> (i * 2)) & VGIC_CONFIG_MASK; + vgic_v3_release_irq(irq); + } +} + +static uint64_t +read_route(struct hypctx *hypctx, int n) +{ + struct vgic_v3_irq *irq; + uint64_t mpidr; + + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), n); + if (irq == NULL) + return (0); + + mpidr = irq->mpidr; + vgic_v3_release_irq(irq); + + return (mpidr); +} + +static void +write_route(struct hypctx *hypctx, int n, uint64_t val, u_int offset, + u_int size) +{ + struct vgic_v3_irq *irq; + + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), n); + if (irq == NULL) + return; + + irq->mpidr = gic_reg_value_64(irq->mpidr, val, offset, size) & GICD_AFF; + irq->target_vcpu = mpidr_to_vcpu(hypctx->hyp, irq->mpidr); + /* + * If the interrupt is pending we can either use the old mpidr, or + * the new mpidr. To simplify this code we use the old value so we + * don't need to move the interrupt until the next time it is + * moved to the pending state. + */ + vgic_v3_release_irq(irq); +} + +/* + * Distributor register handlers. 
+ */ +/* GICD_CTLR */ +static void +dist_ctlr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + struct hyp *hyp; + struct vgic_v3 *vgic; + + hyp = hypctx->hyp; + vgic = hyp->vgic; + + mtx_lock_spin(&vgic->dist_mtx); + *rval = vgic->gicd_ctlr; + mtx_unlock_spin(&vgic->dist_mtx); + + /* Writes are never pending */ + *rval &= ~GICD_CTLR_RWP; +} + +static void +dist_ctlr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + struct vgic_v3 *vgic; + + MPASS(offset == 0); + MPASS(size == 4); + vgic = hypctx->hyp->vgic; + + /* + * GICv2 backwards compatibility is not implemented so + * ARE_NS is RAO/WI. This means EnableGrp1 is RES0. + * + * EnableGrp1A is supported, and RWP is read-only. + * + * All other bits are RES0 from non-secure mode as we + * implement as if we are in a system with two security + * states. + */ + wval &= GICD_CTLR_G1A; + wval |= GICD_CTLR_ARE_NS; + mtx_lock_spin(&vgic->dist_mtx); + vgic->gicd_ctlr = wval; + /* TODO: Wake any vcpus that have interrupts pending */ + mtx_unlock_spin(&vgic->dist_mtx); +} + +/* GICD_TYPER */ +static void +dist_typer_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + uint32_t typer; + + typer = (10 - 1) << GICD_TYPER_IDBITS_SHIFT; + typer |= GICD_TYPER_MBIS; + /* ITLinesNumber: */ + typer |= howmany(VGIC_NIRQS + 1, 32) - 1; + + *rval = typer; +} + +/* GICD_IIDR */ +static void +dist_iidr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + *rval = VGIC_IIDR; +} + +/* GICD_SETSPI_NSR & GICD_CLRSPI_NSR */ +static void +dist_setclrspi_nsr_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + uint32_t irqid; + + MPASS(offset == 0); + MPASS(size == 4); + irqid = wval & GICD_SPI_INTID_MASK; + INJECT_IRQ(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irqid, + reg == GICD_SETSPI_NSR); +} + +/* GICD_ISENABLER */ +static void +dist_isenabler_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_ISENABLER(0)) / 4; + /* GICD_ISENABLER0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_enabler(hypctx, n); +} + +static void +dist_isenabler_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ISENABLER(0)) / 4; + /* GICD_ISENABLER0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_enabler(hypctx, n, true, wval); +} + +/* GICD_ICENABLER */ +static void +dist_icenabler_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_ICENABLER(0)) / 4; + /* GICD_ICENABLER0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_enabler(hypctx, n); +} + +static void +dist_icenabler_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ISENABLER(0)) / 4; + /* GICD_ICENABLER0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_enabler(hypctx, n, false, wval); +} + +/* GICD_ISPENDR */ +static void +dist_ispendr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_ISPENDR(0)) / 4; + /* GICD_ISPENDR0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_pendr(hypctx, n); +} + +static void +dist_ispendr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 
0); + MPASS(size == 4); + n = (reg - GICD_ISPENDR(0)) / 4; + /* GICD_ISPENDR0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_pendr(hypctx, n, true, wval); +} + +/* GICD_ICPENDR */ +static void +dist_icpendr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_ICPENDR(0)) / 4; + /* GICD_ICPENDR0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_pendr(hypctx, n); +} + +static void +dist_icpendr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ICPENDR(0)) / 4; + /* GICD_ICPENDR0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_pendr(hypctx, n, false, wval); +} + +/* GICD_ISACTIVER */ +/* Affinity routing is enabled so isactiver0 is RAZ/WI */ +static void +dist_isactiver_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_ISACTIVER(0)) / 4; + /* GICD_ISACTIVER0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_activer(hypctx, n); +} + +static void +dist_isactiver_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ISACTIVER(0)) / 4; + /* GICD_ISACTIVE0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_activer(hypctx, n, true, wval); +} + +/* GICD_ICACTIVER */ +static void +dist_icactiver_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + int n; + + n = (reg - GICD_ICACTIVER(0)) / 4; + /* GICD_ICACTIVE0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_activer(hypctx, n); +} + +static void +dist_icactiver_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ICACTIVER(0)) / 4; + /* GICD_ICACTIVE0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_activer(hypctx, n, false, wval); +} + +/* GICD_IPRIORITYR */ +/* Affinity routing is enabled so ipriorityr0-7 is RAZ/WI */ +static void +dist_ipriorityr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + int n; + + n = (reg - GICD_IPRIORITYR(0)) / 4; + /* GICD_IPRIORITY0-7 is RAZ/WI so handled separately */ + MPASS(n > 7); + *rval = read_priorityr(hypctx, n); +} + +static void +dist_ipriorityr_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + u_int irq_base; + + irq_base = (reg - GICD_IPRIORITYR(0)) + offset; + /* GICD_IPRIORITY0-7 is RAZ/WI so handled separately */ + MPASS(irq_base > 31); + write_priorityr(hypctx, irq_base, size, wval); +} + +/* GICD_ICFGR */ +static void +dist_icfgr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_ICFGR(0)) / 4; + /* GICD_ICFGR0-1 are RAZ/WI so handled separately */ + MPASS(n > 1); + *rval = read_config(hypctx, n); +} + +static void +dist_icfgr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ICFGR(0)) / 4; + /* GICD_ICFGR0-1 are RAZ/WI so handled separately */ + MPASS(n > 1); + write_config(hypctx, n, wval); +} + +/* GICD_IROUTER */ +static void +dist_irouter_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_IROUTER(0)) / 8; + /* GICD_IROUTER0-31 don't exist */ + MPASS(n > 31); + *rval = 
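/*
 * GICD_IROUTER values are stored as raw affinity bits and resolved to a
 * vcpu by matching VMPIDR_EL2, as in write_route() and mpidr_to_vcpu().
 * A sketch with hypothetical affinities:
 *
 *	wval             = 0x0000000000010003	(Aff1 = 1, Aff0 = 3)
 *	irq->mpidr       = wval & GICD_AFF
 *	irq->target_vcpu = the vcpu whose (vmpidr_el2 & GICD_AFF) equals
 *	                   irq->mpidr, or -1 when no vcpu matches
 *
 * A pending interrupt keeps its old routing until it is next queued, as
 * noted in write_route().
 */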
read_route(hypctx, n); +} + +static void +dist_irouter_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + n = (reg - GICD_IROUTER(0)) / 8; + /* GICD_IROUTER0-31 don't exist */ + MPASS(n > 31); + write_route(hypctx, n, wval, offset, size); +} + +static bool +vgic_register_read(struct hypctx *hypctx, struct vgic_register *reg_list, + u_int reg_list_size, u_int reg, u_int size, uint64_t *rval, void *arg) +{ + u_int i, offset; + + for (i = 0; i < reg_list_size; i++) { + if (reg_list[i].start <= reg && reg_list[i].end >= reg + size) { + offset = reg & (reg_list[i].size - 1); + reg -= offset; + if ((reg_list[i].flags & size) != 0) { + reg_list[i].read(hypctx, reg, rval, NULL); + + /* Move the bits into the correct place */ + *rval >>= (offset * 8); + if (size < 8) { + *rval &= (1ul << (size * 8)) - 1; + } + } else { + /* + * The access is an invalid size. Section + * 12.1.3 "GIC memory-mapped register access" + * of the GICv3 and GICv4 spec issue H + * (IHI0069) lists the options. For a read + * the controller returns unknown data, in + * this case it is zero. + */ + *rval = 0; + } + return (true); + } + } + return (false); +} + +static bool +vgic_register_write(struct hypctx *hypctx, struct vgic_register *reg_list, + u_int reg_list_size, u_int reg, u_int size, uint64_t wval, void *arg) +{ + u_int i, offset; + + for (i = 0; i < reg_list_size; i++) { + if (reg_list[i].start <= reg && reg_list[i].end >= reg + size) { + offset = reg & (reg_list[i].size - 1); + reg -= offset; + if ((reg_list[i].flags & size) != 0) { + reg_list[i].write(hypctx, reg, offset, + size, wval, NULL); + } else { + /* + * See the comment in vgic_register_read. + * For writes the controller ignores the + * operation. + */ + } + return (true); + } + } + return (false); +} + +static int +dist_read(struct vcpu *vcpu, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vgic_v3 *vgic; + uint64_t reg; + + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vgic = hyp->vgic; + + /* Check the register is one of ours and is the correct size */ + if (fault_ipa < vgic->dist_start || fault_ipa + size > vgic->dist_end) { + return (EINVAL); + } + + reg = fault_ipa - vgic->dist_start; + /* + * As described in vgic_register_read an access with an invalid + * alignment is read with an unknown value + */ + if ((reg & (size - 1)) != 0) { + *rval = 0; + return (0); + } + + if (vgic_register_read(hypctx, dist_registers, nitems(dist_registers), + reg, size, rval, NULL)) + return (0); + + /* Reserved register addresses are RES0 so we can hardware it to 0 */ + *rval = 0; + + return (0); +} + +static int +dist_write(struct vcpu *vcpu, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vgic_v3 *vgic; + uint64_t reg; + + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vgic = hyp->vgic; + + /* Check the register is one of ours and is the correct size */ + if (fault_ipa < vgic->dist_start || fault_ipa + size > vgic->dist_end) { + return (EINVAL); + } + + reg = fault_ipa - vgic->dist_start; + /* + * As described in vgic_register_read an access with an invalid + * alignment is write ignored. 
+ */ + if ((reg & (size - 1)) != 0) + return (0); + + if (vgic_register_write(hypctx, dist_registers, nitems(dist_registers), + reg, size, wval, NULL)) + return (0); + + /* Reserved register addresses are RES0 so we can ignore the write */ + return (0); +} + +/* + * Redistributor register handlers. + * + * RD_base: + */ +/* GICR_CTLR */ +static void +redist_ctlr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + /* LPIs not supported */ + *rval = 0; +} + +/* GICR_IIDR */ +static void +redist_iidr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + *rval = VGIC_IIDR; +} + +/* GICR_TYPER */ +static void +redist_typer_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + uint64_t aff, gicr_typer, vmpidr_el2; + bool last_vcpu; + + last_vcpu = false; + if (vcpu_vcpuid(hypctx->vcpu) == (vgic_max_cpu_count(hypctx->hyp) - 1)) + last_vcpu = true; + + vmpidr_el2 = hypctx->vmpidr_el2; + MPASS(vmpidr_el2 != 0); + /* + * Get affinity for the current CPU. The guest CPU affinity is taken + * from VMPIDR_EL2. The Redistributor corresponding to this CPU is + * the Redistributor with the same affinity from GICR_TYPER. + */ + aff = (CPU_AFF3(vmpidr_el2) << 24) | (CPU_AFF2(vmpidr_el2) << 16) | + (CPU_AFF1(vmpidr_el2) << 8) | CPU_AFF0(vmpidr_el2); + + /* Set up GICR_TYPER. */ + gicr_typer = aff << GICR_TYPER_AFF_SHIFT; + /* Set the vcpu as the processsor ID */ + gicr_typer |= + (uint64_t)vcpu_vcpuid(hypctx->vcpu) << GICR_TYPER_CPUNUM_SHIFT; + + if (last_vcpu) + /* Mark the last Redistributor */ + gicr_typer |= GICR_TYPER_LAST; + + *rval = gicr_typer; +} + +/* + * SGI_base: + */ +/* GICR_ISENABLER0 */ +static void +redist_ienabler0_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = read_enabler(hypctx, 0); +} + +static void +redist_isenabler0_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_enabler(hypctx, 0, true, wval); +} + +/* GICR_ICENABLER0 */ +static void +redist_icenabler0_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_enabler(hypctx, 0, false, wval); +} + +/* GICR_ISPENDR0 */ +static void +redist_ipendr0_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = read_pendr(hypctx, 0); +} + +static void +redist_ispendr0_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_pendr(hypctx, 0, true, wval); +} + +/* GICR_ICPENDR0 */ +static void +redist_icpendr0_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_pendr(hypctx, 0, false, wval); +} + +/* GICR_ISACTIVER0 */ +static void +redist_iactiver0_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = read_activer(hypctx, 0); +} + +static void +redist_isactiver0_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + write_activer(hypctx, 0, true, wval); +} + +/* GICR_ICACTIVER0 */ +static void +redist_icactiver0_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + write_activer(hypctx, 0, false, wval); +} + +/* GICR_IPRIORITYR */ +static void +redist_ipriorityr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + int n; + + n = (reg - 
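/*
 * Priorities are stored with the 0xf8 mask used in write_priorityr(),
 * i.e. 5 implemented bits and 32 priority levels.  Example with a
 * hypothetical guest write:
 *
 *	byte written   = 0xa7
 *	priority kept  = 0xa7 & 0xf8 = 0xa0
 *
 * A subsequent read returns the truncated value, which is also how a
 * guest probes how many priority bits the (virtual) GIC implements.
 */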
GICR_IPRIORITYR(0)) / 4; + *rval = read_priorityr(hypctx, n); +} + +static void +redist_ipriorityr_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + u_int irq_base; + + irq_base = (reg - GICR_IPRIORITYR(0)) + offset; + write_priorityr(hypctx, irq_base, size, wval); +} + +/* GICR_ICFGR1 */ +static void +redist_icfgr1_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + *rval = read_config(hypctx, 1); +} + +static void +redist_icfgr1_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_config(hypctx, 1, wval); +} + +static int +redist_read(struct vcpu *vcpu, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx, *target_hypctx; + struct vgic_v3 *vgic; + uint64_t reg; + int vcpuid; + + /* Find the current vcpu ctx to get the vgic struct */ + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vgic = hyp->vgic; + + /* Check the register is one of ours and is the correct size */ + if (fault_ipa < vgic->redist_start || + fault_ipa + size > vgic->redist_end) { + return (EINVAL); + } + + vcpuid = (fault_ipa - vgic->redist_start) / + (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE); + if (vcpuid >= vm_get_maxcpus(hyp->vm)) { + /* + * This should never happen, but lets be defensive so if it + * does we don't panic a non-INVARIANTS kernel. + */ +#ifdef INVARIANTS + panic("%s: Invalid vcpuid %d", __func__, vcpuid); +#else + *rval = 0; + return (0); +#endif + } + + /* Find the target vcpu ctx for the access */ + target_hypctx = hyp->ctx[vcpuid]; + if (target_hypctx == NULL) { + /* + * The CPU has not yet started. The redistributor and CPU are + * in the same power domain. As such the redistributor will + * also be powered down so any access will raise an external + * abort. + */ + raise_data_insn_abort(hypctx, fault_ipa, true, + ISS_DATA_DFSC_EXT); + return (0); + } + + reg = (fault_ipa - vgic->redist_start) % + (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE); + + /* + * As described in vgic_register_read an access with an invalid + * alignment is read with an unknown value + */ + if ((reg & (size - 1)) != 0) { + *rval = 0; + return (0); + } + + if (reg < GICR_RD_BASE_SIZE) { + if (vgic_register_read(target_hypctx, redist_rd_registers, + nitems(redist_rd_registers), reg, size, rval, NULL)) + return (0); + } else if (reg < (GICR_SGI_BASE + GICR_SGI_BASE_SIZE)) { + if (vgic_register_read(target_hypctx, redist_sgi_registers, + nitems(redist_sgi_registers), reg - GICR_SGI_BASE, size, + rval, NULL)) + return (0); + } + + /* Reserved register addresses are RES0 so we can hardware it to 0 */ + *rval = 0; + return (0); +} + +static int +redist_write(struct vcpu *vcpu, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx, *target_hypctx; + struct vgic_v3 *vgic; + uint64_t reg; + int vcpuid; + + /* Find the current vcpu ctx to get the vgic struct */ + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vgic = hyp->vgic; + + /* Check the register is one of ours and is the correct size */ + if (fault_ipa < vgic->redist_start || + fault_ipa + size > vgic->redist_end) { + return (EINVAL); + } + + vcpuid = (fault_ipa - vgic->redist_start) / + (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE); + if (vcpuid >= vm_get_maxcpus(hyp->vm)) { + /* + * This should never happen, but lets be defensive so if it + * does we don't panic a non-INVARIANTS kernel. 
+ */ +#ifdef INVARIANTS + panic("%s: Invalid vcpuid %d", __func__, vcpuid); +#else + return (0); +#endif + } + + /* Find the target vcpu ctx for the access */ + target_hypctx = hyp->ctx[vcpuid]; + if (target_hypctx == NULL) { + /* + * The CPU has not yet started. The redistributor and CPU are + * in the same power domain. As such the redistributor will + * also be powered down so any access will raise an external + * abort. + */ + raise_data_insn_abort(hypctx, fault_ipa, true, + ISS_DATA_DFSC_EXT); + return (0); + } + + reg = (fault_ipa - vgic->redist_start) % + (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE); + + /* + * As described in vgic_register_read an access with an invalid + * alignment is write ignored. + */ + if ((reg & (size - 1)) != 0) + return (0); + + if (reg < GICR_RD_BASE_SIZE) { + if (vgic_register_write(target_hypctx, redist_rd_registers, + nitems(redist_rd_registers), reg, size, wval, NULL)) + return (0); + } else if (reg < (GICR_SGI_BASE + GICR_SGI_BASE_SIZE)) { + if (vgic_register_write(target_hypctx, redist_sgi_registers, + nitems(redist_sgi_registers), reg - GICR_SGI_BASE, size, + wval, NULL)) + return (0); + } + + /* Reserved register addresses are RES0 so we can ignore the write */ + return (0); +} + +static int +vgic_v3_icc_sgi1r_read(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + /* + * TODO: Inject an unknown exception. + */ + *rval = 0; + return (0); +} + +static int +vgic_v3_icc_sgi1r_write(struct vcpu *vcpu, uint64_t rval, void *arg) +{ + struct vm *vm; + struct hyp *hyp; + cpuset_t active_cpus; + uint64_t mpidr, aff1, aff2, aff3; + uint32_t irqid; + int cpus, cpu_off, target_vcpuid, vcpuid; + + vm = vcpu_vm(vcpu); + hyp = vm_get_cookie(vm); + active_cpus = vm_active_cpus(vm); + vcpuid = vcpu_vcpuid(vcpu); + + irqid = ICC_SGI1R_EL1_SGIID_VAL(rval) >> ICC_SGI1R_EL1_SGIID_SHIFT; + if ((rval & ICC_SGI1R_EL1_IRM) == 0) { + /* Non-zero points at no vcpus */ + if (ICC_SGI1R_EL1_RS_VAL(rval) != 0) + return (0); + + aff1 = ICC_SGI1R_EL1_AFF1_VAL(rval) >> ICC_SGI1R_EL1_AFF1_SHIFT; + aff2 = ICC_SGI1R_EL1_AFF2_VAL(rval) >> ICC_SGI1R_EL1_AFF2_SHIFT; + aff3 = ICC_SGI1R_EL1_AFF3_VAL(rval) >> ICC_SGI1R_EL1_AFF3_SHIFT; + mpidr = aff3 << MPIDR_AFF3_SHIFT | + aff2 << MPIDR_AFF2_SHIFT | aff1 << MPIDR_AFF1_SHIFT; + + cpus = ICC_SGI1R_EL1_TL_VAL(rval) >> ICC_SGI1R_EL1_TL_SHIFT; + cpu_off = 0; + while (cpus > 0) { + if (cpus & 1) { + target_vcpuid = mpidr_to_vcpu(hyp, + mpidr | (cpu_off << MPIDR_AFF0_SHIFT)); + if (target_vcpuid >= 0 && + CPU_ISSET(target_vcpuid, &active_cpus)) { + INJECT_IRQ(hyp, target_vcpuid, irqid, + true); + } + } + cpu_off++; + cpus >>= 1; + } + } else { + /* Send an IPI to all CPUs other than the current CPU */ + for (target_vcpuid = 0; target_vcpuid < vm_get_maxcpus(vm); + target_vcpuid++) { + if (CPU_ISSET(target_vcpuid, &active_cpus) && + target_vcpuid != vcpuid) { + INJECT_IRQ(hyp, target_vcpuid, irqid, true); + } + } + } + + return (0); +} + +static void +vgic_v3_mmio_init(struct hyp *hyp) +{ + struct vgic_v3 *vgic; + struct vgic_v3_irq *irq; + int i; + + /* Allocate memory for the SPIs */ + vgic = hyp->vgic; + vgic->irqs = malloc((VGIC_NIRQS - VGIC_PRV_I_NUM) * + sizeof(*vgic->irqs), M_VGIC_V3, M_WAITOK | M_ZERO); + + for (i = 0; i < VGIC_NIRQS - VGIC_PRV_I_NUM; i++) { + irq = &vgic->irqs[i]; + + mtx_init(&irq->irq_spinmtx, "VGIC IRQ spinlock", NULL, + MTX_SPIN); + + irq->irq = i + VGIC_PRV_I_NUM; + } +} + +static void +vgic_v3_mmio_destroy(struct hyp *hyp) +{ + struct vgic_v3 *vgic; + struct vgic_v3_irq *irq; + int i; + + vgic = hyp->vgic; + for (i = 0; 
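/*
 * An ICC_SGI1R_EL1 write, handled just above, encodes the SGI number,
 * the Aff3.Aff2.Aff1 of the target cluster and a 16-bit target list of
 * Aff0 values.  Worked example with hypothetical field values:
 *
 *	SGIID = 5, Aff3.Aff2.Aff1 = 0.0.1, TargetList = 0b0101
 *
 * injects SGI 5 on the vcpus whose MPIDR affinity is 0.0.1.0 and 0.0.1.2
 * (bits 0 and 2 of the target list), provided they are in the VM's
 * active cpu set.  With ICC_SGI1R_EL1.IRM set the SGI goes to every
 * active vcpu except the sender.
 */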
i < VGIC_NIRQS - VGIC_PRV_I_NUM; i++) { + irq = &vgic->irqs[i]; + + mtx_destroy(&irq->irq_spinmtx); + } + + free(vgic->irqs, M_VGIC_V3); +} + +static int +vgic_v3_attach_to_vm(device_t dev, struct hyp *hyp, struct vm_vgic_descr *descr) +{ + struct vm *vm; + struct vgic_v3 *vgic; + size_t cpu_count; + + if (descr->ver.version != 3) + return (EINVAL); + + /* + * The register bases need to be 64k aligned + * The redist register space is the RD + SGI size + */ + if (!__is_aligned(descr->v3_regs.dist_start, PAGE_SIZE_64K) || + !__is_aligned(descr->v3_regs.redist_start, PAGE_SIZE_64K) || + !__is_aligned(descr->v3_regs.redist_size, + GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE)) + return (EINVAL); + + /* The dist register space is 1 64k block */ + if (descr->v3_regs.dist_size != PAGE_SIZE_64K) + return (EINVAL); + + vm = hyp->vm; + + /* + * Return an error if the redist space is too large for the maximum + * number of CPUs we support. + */ + cpu_count = descr->v3_regs.redist_size / + (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE); + if (cpu_count > vm_get_maxcpus(vm)) + return (EINVAL); + + vgic = hyp->vgic; + + /* Set the distributor address and size for trapping guest access. */ + vgic->dist_start = descr->v3_regs.dist_start; + vgic->dist_end = descr->v3_regs.dist_start + descr->v3_regs.dist_size; + + vgic->redist_start = descr->v3_regs.redist_start; + vgic->redist_end = descr->v3_regs.redist_start + + descr->v3_regs.redist_size; + + vm_register_inst_handler(vm, descr->v3_regs.dist_start, + descr->v3_regs.dist_size, dist_read, dist_write); + vm_register_inst_handler(vm, descr->v3_regs.redist_start, + descr->v3_regs.redist_size, redist_read, redist_write); + + vm_register_reg_handler(vm, ISS_MSR_REG(ICC_SGI1R_EL1), + ISS_MSR_REG_MASK, vgic_v3_icc_sgi1r_read, vgic_v3_icc_sgi1r_write, + NULL); + + vgic_v3_mmio_init(hyp); + + hyp->vgic_attached = true; + + return (0); +} + +static void +vgic_v3_detach_from_vm(device_t dev, struct hyp *hyp) +{ + if (hyp->vgic_attached) { + hyp->vgic_attached = false; + vgic_v3_mmio_destroy(hyp); + } +} + +static struct vgic_v3_irq * +vgic_v3_get_irq(struct hyp *hyp, int vcpuid, uint32_t irqid) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + struct hypctx *hypctx; + + if (irqid < VGIC_PRV_I_NUM) { + if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(hyp->vm)) + return (NULL); + hypctx = hyp->ctx[vcpuid]; + if (hypctx == NULL) + return (NULL); + vgic_cpu = hypctx->vgic_cpu; + irq = &vgic_cpu->private_irqs[irqid]; + } else if (irqid <= GIC_LAST_SPI) { + irqid -= VGIC_PRV_I_NUM; + if (irqid >= VGIC_NIRQS) + return (NULL); + irq = &hyp->vgic->irqs[irqid]; + } else if (irqid < GIC_FIRST_LPI) { + return (NULL); + } else { + /* No support for LPIs */ + return (NULL); + } + + mtx_lock_spin(&irq->irq_spinmtx); + return (irq); +} + +static void +vgic_v3_release_irq(struct vgic_v3_irq *irq) +{ + + mtx_unlock_spin(&irq->irq_spinmtx); +} + +static bool +vgic_v3_has_pending_irq(device_t dev, struct hypctx *hypctx) +{ + struct vgic_v3_cpu *vgic_cpu; + bool empty; + + vgic_cpu = hypctx->vgic_cpu; + mtx_lock_spin(&vgic_cpu->lr_mtx); + empty = TAILQ_EMPTY(&vgic_cpu->irq_act_pend); + mtx_unlock_spin(&vgic_cpu->lr_mtx); + + return (!empty); +} + +static bool +vgic_v3_check_irq(struct vgic_v3_irq *irq, bool level) +{ + /* + * Only inject if: + * - Level-triggered IRQ: level changes low -> high + * - Edge-triggered IRQ: level is high + */ + switch (irq->config & VGIC_CONFIG_MASK) { + case VGIC_CONFIG_LEVEL: + return (level != irq->level); + case VGIC_CONFIG_EDGE: + return (level); + default: 
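/*
 * In practice a level-triggered source is driven with both edges of its
 * line, while an edge-triggered source only ever injects with
 * level == true.  A minimal usage sketch (device emulation code, the
 * irqid is hypothetical):
 *
 *	// level-triggered: assert, then deassert once serviced
 *	vgic_inject_irq(hyp, vcpuid, irqid, true);
 *	...
 *	vgic_inject_irq(hyp, vcpuid, irqid, false);
 *
 *	// edge-triggered: every event is a single assertion
 *	vgic_inject_irq(hyp, vcpuid, irqid, true);
 */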
+ break; + } + + return (false); +} + +static int +vgic_v3_inject_irq(device_t dev, struct hyp *hyp, int vcpuid, uint32_t irqid, + bool level) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + struct hypctx *hypctx; + int target_vcpu; + bool notify; + + if (!hyp->vgic_attached) + return (ENODEV); + + KASSERT(vcpuid == -1 || irqid < VGIC_PRV_I_NUM, + ("%s: SPI/LPI with vcpuid set: irq %u vcpuid %u", __func__, irqid, + vcpuid)); + + irq = vgic_v3_get_irq(hyp, vcpuid, irqid); + if (irq == NULL) { + eprintf("Malformed IRQ %u.\n", irqid); + return (EINVAL); + } + + target_vcpu = irq->target_vcpu; + KASSERT(vcpuid == -1 || vcpuid == target_vcpu, + ("%s: Interrupt %u has bad cpu affinity: vcpu %d target vcpu %d", + __func__, irqid, vcpuid, target_vcpu)); + KASSERT(target_vcpu >= 0 && target_vcpu < vm_get_maxcpus(hyp->vm), + ("%s: Interrupt %u sent to invalid vcpu %d", __func__, irqid, + target_vcpu)); + + if (vcpuid == -1) + vcpuid = target_vcpu; + /* TODO: Check from 0 to vm->maxcpus */ + if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(hyp->vm)) { + vgic_v3_release_irq(irq); + return (EINVAL); + } + + hypctx = hyp->ctx[vcpuid]; + if (hypctx == NULL) { + vgic_v3_release_irq(irq); + return (EINVAL); + } + + notify = false; + vgic_cpu = hypctx->vgic_cpu; + + mtx_lock_spin(&vgic_cpu->lr_mtx); + + if (!vgic_v3_check_irq(irq, level)) { + goto out; + } + + if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_LEVEL) + irq->level = level; + else /* VGIC_CONFIG_EDGE */ + irq->pending = true; + + notify = vgic_v3_queue_irq(hyp, vgic_cpu, vcpuid, irq); + +out: + mtx_unlock_spin(&vgic_cpu->lr_mtx); + vgic_v3_release_irq(irq); + + if (notify) + vcpu_notify_event(vm_vcpu(hyp->vm, vcpuid)); + + return (0); +} + +static int +vgic_v3_inject_msi(device_t dev, struct hyp *hyp, uint64_t msg, uint64_t addr) +{ + struct vgic_v3 *vgic; + uint64_t reg; + + vgic = hyp->vgic; + + /* This is a 4 byte register */ + if (addr < vgic->dist_start || addr + 4 > vgic->dist_end) { + return (EINVAL); + } + + reg = addr - vgic->dist_start; + if (reg != GICD_SETSPI_NSR) + return (EINVAL); + + return (INJECT_IRQ(hyp, -1, msg, true)); +} + +static void +vgic_v3_flush_hwstate(device_t dev, struct hypctx *hypctx) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + int i; + + vgic_cpu = hypctx->vgic_cpu; + + /* + * All Distributor writes have been executed at this point, do not + * protect Distributor reads with a mutex. + * + * This is callled with all interrupts disabled, so there is no need for + * a List Register spinlock either. 
+ */ + mtx_lock_spin(&vgic_cpu->lr_mtx); + + hypctx->vgic_v3_regs.ich_hcr_el2 &= ~ICH_HCR_EL2_UIE; + + /* Exit early if there are no buffered interrupts */ + if (TAILQ_EMPTY(&vgic_cpu->irq_act_pend)) + goto out; + + KASSERT(vgic_cpu->ich_lr_used == 0, ("%s: Used LR count not zero %u", + __func__, vgic_cpu->ich_lr_used)); + + i = 0; + hypctx->vgic_v3_regs.ich_elrsr_el2 = + (1u << hypctx->vgic_v3_regs.ich_lr_num) - 1; + TAILQ_FOREACH(irq, &vgic_cpu->irq_act_pend, act_pend_list) { + /* No free list register, stop searching for IRQs */ + if (i == hypctx->vgic_v3_regs.ich_lr_num) + break; + + if (!irq->enabled) + continue; + + hypctx->vgic_v3_regs.ich_lr_el2[i] = ICH_LR_EL2_GROUP1 | + ((uint64_t)irq->priority << ICH_LR_EL2_PRIO_SHIFT) | + irq->irq; + + if (irq->active) { + hypctx->vgic_v3_regs.ich_lr_el2[i] |= + ICH_LR_EL2_STATE_ACTIVE; + } + +#ifdef notyet + /* TODO: Check why this is needed */ + if ((irq->config & _MASK) == LEVEL) + hypctx->vgic_v3_regs.ich_lr_el2[i] |= ICH_LR_EL2_EOI; +#endif + + if (!irq->active && vgic_v3_irq_pending(irq)) { + hypctx->vgic_v3_regs.ich_lr_el2[i] |= + ICH_LR_EL2_STATE_PENDING; + + /* + * This IRQ is now pending on the guest. Allow for + * another edge that could cause the interrupt to + * be raised again. + */ + if ((irq->config & VGIC_CONFIG_MASK) == + VGIC_CONFIG_EDGE) { + irq->pending = false; + } + } + + i++; + } + vgic_cpu->ich_lr_used = i; + +out: + mtx_unlock_spin(&vgic_cpu->lr_mtx); +} + +static void +vgic_v3_sync_hwstate(device_t dev, struct hypctx *hypctx) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + uint64_t lr; + int i; + + vgic_cpu = hypctx->vgic_cpu; + + /* Exit early if there are no buffered interrupts */ + if (vgic_cpu->ich_lr_used == 0) + return; + + /* + * Check on the IRQ state after running the guest. ich_lr_used and + * ich_lr_el2 are only ever used within this thread so is safe to + * access unlocked. + */ + for (i = 0; i < vgic_cpu->ich_lr_used; i++) { + lr = hypctx->vgic_v3_regs.ich_lr_el2[i]; + hypctx->vgic_v3_regs.ich_lr_el2[i] = 0; + + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + ICH_LR_EL2_VINTID(lr)); + if (irq == NULL) + continue; + + irq->active = (lr & ICH_LR_EL2_STATE_ACTIVE) != 0; + + if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_EDGE) { + /* + * If we have an edge triggered IRQ preserve the + * pending bit until the IRQ has been handled. + */ + if ((lr & ICH_LR_EL2_STATE_PENDING) != 0) { + irq->pending = true; + } + } else { + /* + * If we have a level triggerend IRQ remove the + * pending bit if the IRQ has been handled. + * The level is separate, so may still be high + * triggering another IRQ. 
+ */ + if ((lr & ICH_LR_EL2_STATE_PENDING) == 0) { + irq->pending = false; + } + } + + /* Lock to update irq_act_pend */ + mtx_lock_spin(&vgic_cpu->lr_mtx); + if (irq->active) { + /* Ensure the active IRQ is at the head of the list */ + TAILQ_REMOVE(&vgic_cpu->irq_act_pend, irq, + act_pend_list); + TAILQ_INSERT_HEAD(&vgic_cpu->irq_act_pend, irq, + act_pend_list); + } else if (!vgic_v3_irq_pending(irq)) { + /* If pending or active remove from the list */ + TAILQ_REMOVE(&vgic_cpu->irq_act_pend, irq, + act_pend_list); + irq->on_aplist = false; + } + mtx_unlock_spin(&vgic_cpu->lr_mtx); + vgic_v3_release_irq(irq); + } + + hypctx->vgic_v3_regs.ich_hcr_el2 &= ~ICH_HCR_EL2_EOICOUNT_MASK; + vgic_cpu->ich_lr_used = 0; +} + +static void +vgic_v3_init(device_t dev) +{ + uint64_t ich_vtr_el2; + uint32_t pribits, prebits; + + ich_vtr_el2 = vmm_call_hyp(HYP_READ_REGISTER, HYP_REG_ICH_VTR); + + /* TODO: These fields are common with the vgicv2 driver */ + pribits = ICH_VTR_EL2_PRIBITS(ich_vtr_el2); + switch (pribits) { + default: + case 5: + virt_features.min_prio = 0xf8; + break; + case 6: + virt_features.min_prio = 0xfc; + break; + case 7: + virt_features.min_prio = 0xfe; + break; + case 8: + virt_features.min_prio = 0xff; + break; + } + + prebits = ICH_VTR_EL2_PREBITS(ich_vtr_el2); + switch (prebits) { + default: + case 5: + virt_features.ich_apr_num = 1; + break; + case 6: + virt_features.ich_apr_num = 2; + break; + case 7: + virt_features.ich_apr_num = 4; + break; + } + + virt_features.ich_lr_num = ICH_VTR_EL2_LISTREGS(ich_vtr_el2); +} + +static int +vgic_v3_probe(device_t dev) +{ + if (!gic_get_vgic(dev)) + return (EINVAL); + + /* We currently only support the GICv3 */ + if (gic_get_hw_rev(dev) < 3) + return (EINVAL); + + device_set_desc(dev, "Virtual GIC v3"); + return (BUS_PROBE_DEFAULT); +} + +static int +vgic_v3_attach(device_t dev) +{ + vgic_dev = dev; + return (0); +} + +static int +vgic_v3_detach(device_t dev) +{ + vgic_dev = NULL; + return (0); +} + +static device_method_t vgic_v3_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vgic_v3_probe), + DEVMETHOD(device_attach, vgic_v3_attach), + DEVMETHOD(device_detach, vgic_v3_detach), + + /* VGIC interface */ + DEVMETHOD(vgic_init, vgic_v3_init), + DEVMETHOD(vgic_attach_to_vm, vgic_v3_attach_to_vm), + DEVMETHOD(vgic_detach_from_vm, vgic_v3_detach_from_vm), + DEVMETHOD(vgic_vminit, vgic_v3_vminit), + DEVMETHOD(vgic_cpuinit, vgic_v3_cpuinit), + DEVMETHOD(vgic_cpucleanup, vgic_v3_cpucleanup), + DEVMETHOD(vgic_vmcleanup, vgic_v3_vmcleanup), + DEVMETHOD(vgic_max_cpu_count, vgic_v3_max_cpu_count), + DEVMETHOD(vgic_has_pending_irq, vgic_v3_has_pending_irq), + DEVMETHOD(vgic_inject_irq, vgic_v3_inject_irq), + DEVMETHOD(vgic_inject_msi, vgic_v3_inject_msi), + DEVMETHOD(vgic_flush_hwstate, vgic_v3_flush_hwstate), + DEVMETHOD(vgic_sync_hwstate, vgic_v3_sync_hwstate), + + /* End */ + DEVMETHOD_END +}; + +/* TODO: Create a vgic base class? */ +DEFINE_CLASS_0(vgic, vgic_v3_driver, vgic_v3_methods, 0); + +DRIVER_MODULE(vgic_v3, gic, vgic_v3_driver, 0, 0); diff --git a/sys/arm64/vmm/io/vgic_v3.h b/sys/arm64/vmm/io/vgic_v3.h new file mode 100644 index 000000000000..8804cc7a0211 --- /dev/null +++ b/sys/arm64/vmm/io/vgic_v3.h @@ -0,0 +1,57 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_VGIC_V3_H_ +#define _VMM_VGIC_V3_H_ + +#define VGIC_ICH_LR_NUM_MAX 16 +#define VGIC_ICH_APR_NUM_MAX 4 + +/* Registers accessed by EL2 */ +struct vgic_v3_regs { + uint32_t ich_eisr_el2; /* End of Interrupt Status Register */ + uint32_t ich_elrsr_el2; /* Empty List register Status Register (ICH_ELRSR_EL2) */ + uint32_t ich_hcr_el2; /* Hyp Control Register */ + uint32_t ich_misr_el2; /* Maintenance Interrupt State Register */ + uint32_t ich_vmcr_el2; /* Virtual Machine Control Register */ + + /* + * The List Registers are part of the VM context and are modified on a + * world switch. They need to be allocated statically so they are + * mapped in the EL2 translation tables when struct hypctx is mapped. + */ + uint64_t ich_lr_el2[VGIC_ICH_LR_NUM_MAX]; + uint16_t ich_lr_num; + + /* Active Priorities Registers for Group 0 and 1 interrupts */ + uint16_t ich_apr_num; + uint32_t ich_ap0r_el2[VGIC_ICH_APR_NUM_MAX]; + uint32_t ich_ap1r_el2[VGIC_ICH_APR_NUM_MAX]; +}; + +#endif /* !_VMM_VGIC_V3_H_ */ diff --git a/sys/arm64/vmm/io/vgic_v3_reg.h b/sys/arm64/vmm/io/vgic_v3_reg.h new file mode 100644 index 000000000000..e805ded17097 --- /dev/null +++ b/sys/arm64/vmm/io/vgic_v3_reg.h @@ -0,0 +1,129 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2018 The FreeBSD Foundation + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _VGIC_V3_REG_H_
+#define _VGIC_V3_REG_H_
+
+/* Interrupt Controller End of Interrupt Status Register */
+#define ICH_EISR_EL2_STATUS_MASK 0xffff
+#define ICH_EISR_EL2_EOI_NOT_HANDLED(lr) ((1 << lr) & ICH_EISR_EL2_STATUS_MASK)
+
+/* Interrupt Controller Empty List Register Status Register */
+#define ICH_ELSR_EL2_STATUS_MASK 0xffff
+#define ICH_ELSR_EL2_LR_EMPTY(x) ((1 << x) & ICH_ELSR_EL2_STATUS_MASK)
+
+/* Interrupt Controller Hyp Control Register */
+#define ICH_HCR_EL2_EOICOUNT_SHIFT 27
+#define ICH_HCR_EL2_EOICOUNT_MASK (0x1f << ICH_HCR_EL2_EOICOUNT_SHIFT)
+#define ICH_HCR_EL2_TDIR (1 << 14) /* Trap non-secure EL1 writes to IC{C, V}_DIR_EL1 */
+#define ICH_HCR_EL2_TSEI (1 << 13) /* Trap System Error Interrupts (SEI) to EL2 */
+#define ICH_HCR_EL2_TALL1 (1 << 12) /* Trap non-secure EL1 accesses to IC{C, V}_* for Group 1 interrupts */
+#define ICH_HCR_EL2_TALL0 (1 << 11) /* Trap non-secure EL1 accesses to IC{C, V}_* for Group 0 interrupts */
+#define ICH_HCR_EL2_TC (1 << 10) /* Trap non-secure EL1 accesses to common IC{C, V}_* registers */
+#define ICH_HCR_EL2_VGRP1DIE (1 << 7) /* VM Group 1 Disabled Interrupt Enable */
+#define ICH_HCR_EL2_VGRP1EIE (1 << 6) /* VM Group 1 Enabled Interrupt Enable */
+#define ICH_HCR_EL2_VGRP0DIE (1 << 5) /* VM Group 0 Disabled Interrupt Enable */
+#define ICH_HCR_EL2_VGRP0EIE (1 << 4) /* VM Group 0 Enabled Interrupt Enable */
+#define ICH_HCR_EL2_NPIE (1 << 3) /* No Pending Interrupt Enable */
+#define ICH_HCR_EL2_LRENPIE (1 << 2) /* List Register Entry Not Present Interrupt Enable */
+#define ICH_HCR_EL2_UIE (1 << 1) /* Underflow Interrupt Enable */
+#define ICH_HCR_EL2_En (1 << 0) /* Global enable for the virtual CPU interface */
+
+/* Interrupt Controller List Registers */
+#define ICH_LR_EL2_VINTID_MASK 0xffffffff
+#define ICH_LR_EL2_VINTID(x) ((x) & ICH_LR_EL2_VINTID_MASK)
+#define ICH_LR_EL2_PINTID_SHIFT 32
+#define ICH_LR_EL2_PINTID_MASK (0x3fUL << ICH_LR_EL2_PINTID_SHIFT)
+/* Raise a maintenance IRQ when deactivated (only non-HW virqs) */
+#define ICH_LR_EL2_EOI (1UL << 41)
+#define ICH_LR_EL2_PRIO_SHIFT 48
+#define ICH_LR_EL2_PRIO_MASK (0xffUL << ICH_LR_EL2_PRIO_SHIFT)
+#define ICH_LR_EL2_GROUP_SHIFT 60
+#define ICH_LR_EL2_GROUP1 (1UL << ICH_LR_EL2_GROUP_SHIFT)
+#define ICH_LR_EL2_HW (1UL << 61)
+#define ICH_LR_EL2_STATE_SHIFT 62
+#define ICH_LR_EL2_STATE_MASK (0x3UL << ICH_LR_EL2_STATE_SHIFT)
+#define ICH_LR_EL2_STATE(x) ((x) & ICH_LR_EL2_STATE_MASK)
+#define ICH_LR_EL2_STATE_INACTIVE (0x0UL << ICH_LR_EL2_STATE_SHIFT)
+#define ICH_LR_EL2_STATE_PENDING (0x1UL << ICH_LR_EL2_STATE_SHIFT)
+#define ICH_LR_EL2_STATE_ACTIVE (0x2UL << ICH_LR_EL2_STATE_SHIFT)
+#define ICH_LR_EL2_STATE_PENDING_ACTIVE (0x3UL << ICH_LR_EL2_STATE_SHIFT)
+
+/* Interrupt Controller Maintenance Interrupt State Register */
+#define ICH_MISR_EL2_VGRP1D (1 << 7) /* vPE Group 1 Disabled */
+#define ICH_MISR_EL2_VGRP1E (1 << 6) /* vPE Group 1 Enabled */
+#define ICH_MISR_EL2_VGRP0D (1 << 5) /* vPE Group 0 Disabled */
+#define ICH_MISR_EL2_VGRP0E (1 
<< 4) /* vPE Group 0 Enabled */ +#define ICH_MISR_EL2_NP (1 << 3) /* No Pending */ +#define ICH_MISR_EL2_LRENP (1 << 2) /* List Register Entry Not Present */ +#define ICH_MISR_EL2_U (1 << 1) /* Underflow */ +#define ICH_MISR_EL2_EOI (1 << 0) /* End Of Interrupt */ + +/* Interrupt Controller Virtual Machine Control Register */ +#define ICH_VMCR_EL2_VPMR_SHIFT 24 +#define ICH_VMCR_EL2_VPMR_MASK (0xff << ICH_VMCR_EL2_VPMR_SHIFT) +#define ICH_VMCR_EL2_VPMR_PRIO_LOWEST (0xff << ICH_VMCR_EL2_VPMR_SHIFT) +#define ICH_VMCR_EL2_VPMR_PRIO_HIGHEST (0x00 << ICH_VMCR_EL2_VPMR_SHIFT) +#define ICH_VMCR_EL2_VBPR0_SHIFT 21 +#define ICH_VMCR_EL2_VBPR0_MASK (0x7 << ICH_VMCR_EL2_VBPR0_SHIFT) +#define ICH_VMCR_EL2_VBPR0_NO_PREEMPTION \ + (0x7 << ICH_VMCR_EL2_VBPR0_SHIFT) +#define ICH_VMCR_EL2_VBPR1_SHIFT 18 +#define ICH_VMCR_EL2_VBPR1_MASK (0x7 << ICH_VMCR_EL2_VBPR1_SHIFT) +#define ICH_VMCR_EL2_VBPR1_NO_PREEMPTION \ + (0x7 << ICH_VMCR_EL2_VBPR1_SHIFT) +#define ICH_VMCR_EL2_VEOIM (1 << 9) /* Virtual EOI mode */ +#define ICH_VMCR_EL2_VCBPR (1 << 4) /* Virtual Common binary Point Register */ +#define ICH_VMCR_EL2_VFIQEN (1 << 3) /* Virtual FIQ enable */ +#define ICH_VMCR_EL2_VACKCTL (1 << 2) /* Virtual AckCtl */ +#define ICH_VMCR_EL2_VENG1 (1 << 1) /* Virtual Group 1 Interrupt Enable */ +#define ICH_VMCR_EL2_VENG0 (1 << 0) /* Virtual Group 0 Interrupt Enable */ + +/* Interrupt Controller VGIC Type Register */ +#define ICH_VTR_EL2_PRIBITS_SHIFT 29 +#define ICH_VTR_EL2_PRIBITS_MASK (0x7 << ICH_VTR_EL2_PRIBITS_SHIFT) +#define ICH_VTR_EL2_PRIBITS(x) \ + ((((x) & ICH_VTR_EL2_PRIBITS_MASK) >> ICH_VTR_EL2_PRIBITS_SHIFT) + 1) +#define ICH_VTR_EL2_PREBITS_SHIFT 26 +#define ICH_VTR_EL2_PREBITS_MASK (0x7 << ICH_VTR_EL2_PREBITS_SHIFT) +#define ICH_VTR_EL2_PREBITS(x) \ + (((x) & ICH_VTR_EL2_PREBITS_MASK) >> ICH_VTR_EL2_PREBITS_SHIFT) +#define ICH_VTR_EL2_SEIS (1 << 22) /* System Error Interrupt (SEI) Support */ +#define ICH_VTR_EL2_A3V (1 << 21) /* Affinity 3 Valid */ +#define ICH_VTR_EL2_NV4 (1 << 20) /* Direct injection of virtual interrupts. RES1 for GICv3 */ +#define ICH_VTR_EL2_TDS (1 << 19) /* Implementation supports ICH_HCR_EL2.TDIR */ +#define ICH_VTR_EL2_LISTREGS_MASK 0x1f +/* + * ICH_VTR_EL2.ListRegs holds the number of list registers, minus one. Add one + * to get the actual number of list registers. + */ +#define ICH_VTR_EL2_LISTREGS(x) (((x) & ICH_VTR_EL2_LISTREGS_MASK) + 1) + +#endif /* !_VGIC_V3_REG_H_ */ diff --git a/sys/arm64/vmm/io/vtimer.c b/sys/arm64/vmm/io/vtimer.c new file mode 100644 index 000000000000..aa0b3ff1588e --- /dev/null +++ b/sys/arm64/vmm/io/vtimer.c @@ -0,0 +1,503 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2017 The FreeBSD Foundation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/rman.h> +#include <sys/time.h> +#include <sys/timeet.h> +#include <sys/timetc.h> + +#include <machine/bus.h> +#include <machine/machdep.h> +#include <machine/vmm.h> +#include <machine/armreg.h> + +#include <arm64/vmm/arm64.h> + +#include "vgic.h" +#include "vtimer.h" + +#define RES1 0xffffffffffffffffUL + +#define timer_enabled(ctl) \ + (!((ctl) & CNTP_CTL_IMASK) && ((ctl) & CNTP_CTL_ENABLE)) + +static uint64_t cnthctl_el2_reg; +static uint32_t tmr_frq; + +#define timer_condition_met(ctl) ((ctl) & CNTP_CTL_ISTATUS) + +static void vtimer_schedule_irq(struct hypctx *hypctx, bool phys); + +static int +vtimer_virtual_timer_intr(void *arg) +{ + struct hypctx *hypctx; + uint64_t cntpct_el0; + uint32_t cntv_ctl; + + hypctx = arm64_get_active_vcpu(); + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + + if (!hypctx) { + /* vm_destroy() was called. */ + eprintf("No active vcpu\n"); + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + goto out; + } + if (!timer_enabled(cntv_ctl)) { + eprintf("Timer not enabled\n"); + goto out; + } + if (!timer_condition_met(cntv_ctl)) { + eprintf("Timer condition not met\n"); + goto out; + } + + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - + hypctx->hyp->vtimer.cntvoff_el2; + if (hypctx->vtimer_cpu.virt_timer.cntx_cval_el0 < cntpct_el0) + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + GT_VIRT_IRQ, true); + + cntv_ctl = hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0; + +out: + /* + * Disable the timer interrupt. This will prevent the interrupt from + * being reasserted as soon as we exit the handler and getting stuck + * in an infinite loop. + * + * This is safe to do because the guest disabled the timer, and then + * enables it as part of the interrupt handling routine. + */ + cntv_ctl &= ~CNTP_CTL_ENABLE; + WRITE_SPECIALREG(cntv_ctl_el0, cntv_ctl); + + return (FILTER_HANDLED); +} + +int +vtimer_init(uint64_t cnthctl_el2) +{ + cnthctl_el2_reg = cnthctl_el2; + /* + * The guest *MUST* use the same timer frequency as the host. The + * register CNTFRQ_EL0 is accessible to the guest and a different value + * in the guest dts file might have unforseen consequences. + */ + tmr_frq = READ_SPECIALREG(cntfrq_el0); + + return (0); +} + +void +vtimer_vminit(struct hyp *hyp) +{ + uint64_t now; + + /* + * Configure the Counter-timer Hypervisor Control Register for the VM. 
+ * + * CNTHCTL_EL1PCEN: trap access to CNTP_{CTL, CVAL, TVAL}_EL0 from EL1 + * CNTHCTL_EL1PCTEN: trap access to CNTPCT_EL0 + */ + hyp->vtimer.cnthctl_el2 = cnthctl_el2_reg & ~CNTHCTL_EL1PCEN; + hyp->vtimer.cnthctl_el2 &= ~CNTHCTL_EL1PCTEN; + + now = READ_SPECIALREG(cntpct_el0); + hyp->vtimer.cntvoff_el2 = now; + + return; +} + +void +vtimer_cpuinit(struct hypctx *hypctx) +{ + struct vtimer_cpu *vtimer_cpu; + + vtimer_cpu = &hypctx->vtimer_cpu; + /* + * Configure physical timer interrupts for the VCPU. + * + * CNTP_CTL_IMASK: mask interrupts + * ~CNTP_CTL_ENABLE: disable the timer + */ + vtimer_cpu->phys_timer.cntx_ctl_el0 = CNTP_CTL_IMASK & ~CNTP_CTL_ENABLE; + + mtx_init(&vtimer_cpu->phys_timer.mtx, "vtimer phys callout mutex", NULL, + MTX_DEF); + callout_init_mtx(&vtimer_cpu->phys_timer.callout, + &vtimer_cpu->phys_timer.mtx, 0); + vtimer_cpu->phys_timer.irqid = GT_PHYS_NS_IRQ; + + mtx_init(&vtimer_cpu->virt_timer.mtx, "vtimer virt callout mutex", NULL, + MTX_DEF); + callout_init_mtx(&vtimer_cpu->virt_timer.callout, + &vtimer_cpu->virt_timer.mtx, 0); + vtimer_cpu->virt_timer.irqid = GT_VIRT_IRQ; +} + +void +vtimer_cpucleanup(struct hypctx *hypctx) +{ + struct vtimer_cpu *vtimer_cpu; + + vtimer_cpu = &hypctx->vtimer_cpu; + callout_drain(&vtimer_cpu->phys_timer.callout); + callout_drain(&vtimer_cpu->virt_timer.callout); + mtx_destroy(&vtimer_cpu->phys_timer.mtx); + mtx_destroy(&vtimer_cpu->virt_timer.mtx); +} + +void +vtimer_vmcleanup(struct hyp *hyp) +{ + struct hypctx *hypctx; + uint32_t cntv_ctl; + + hypctx = arm64_get_active_vcpu(); + if (!hypctx) { + /* The active VM was destroyed, stop the timer. */ + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + cntv_ctl &= ~CNTP_CTL_ENABLE; + WRITE_SPECIALREG(cntv_ctl_el0, cntv_ctl); + } +} + +void +vtimer_cleanup(void) +{ +} + +void +vtimer_sync_hwstate(struct hypctx *hypctx) +{ + struct vtimer_timer *timer; + uint64_t cntpct_el0; + + timer = &hypctx->vtimer_cpu.virt_timer; + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - + hypctx->hyp->vtimer.cntvoff_el2; + if (!timer_enabled(timer->cntx_ctl_el0)) { + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + timer->irqid, false); + } else if (timer->cntx_cval_el0 < cntpct_el0) { + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + timer->irqid, true); + } else { + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + timer->irqid, false); + vtimer_schedule_irq(hypctx, false); + } +} + +static void +vtimer_inject_irq_callout_phys(void *context) +{ + struct hypctx *hypctx; + + hypctx = context; + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + hypctx->vtimer_cpu.phys_timer.irqid, true); +} + +static void +vtimer_inject_irq_callout_virt(void *context) +{ + struct hypctx *hypctx; + + hypctx = context; + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + hypctx->vtimer_cpu.virt_timer.irqid, true); +} + +static void +vtimer_schedule_irq(struct hypctx *hypctx, bool phys) +{ + sbintime_t time; + struct vtimer_timer *timer; + uint64_t cntpct_el0; + uint64_t diff; + + if (phys) + timer = &hypctx->vtimer_cpu.phys_timer; + else + timer = &hypctx->vtimer_cpu.virt_timer; + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - + hypctx->hyp->vtimer.cntvoff_el2; + if (timer->cntx_cval_el0 < cntpct_el0) { + /* Timer set in the past, trigger interrupt */ + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + timer->irqid, true); + } else { + diff = timer->cntx_cval_el0 - cntpct_el0; + time = diff * SBT_1S / tmr_frq; + if (phys) + callout_reset_sbt(&timer->callout, time, 0, + 
vtimer_inject_irq_callout_phys, hypctx, 0); + else + callout_reset_sbt(&timer->callout, time, 0, + vtimer_inject_irq_callout_virt, hypctx, 0); + } +} + +static void +vtimer_remove_irq(struct hypctx *hypctx, struct vcpu *vcpu) +{ + struct vtimer_cpu *vtimer_cpu; + struct vtimer_timer *timer; + + vtimer_cpu = &hypctx->vtimer_cpu; + timer = &vtimer_cpu->phys_timer; + + callout_drain(&timer->callout); + /* + * The interrupt needs to be deactivated here regardless of the callout + * function having been executed. The timer interrupt can be masked with + * the CNTP_CTL_EL0.IMASK bit instead of reading the IAR register. + * Masking the interrupt doesn't remove it from the list registers. + */ + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(vcpu), timer->irqid, false); +} + +/* + * Timer emulation functions. + * + * The guest should use the virtual timer, however some software, e.g. u-boot, + * used the physical timer. Emulate this in software for the guest to use. + * + * Adjust for cntvoff_el2 so the physical and virtual timers are at similar + * times. This simplifies interrupt handling in the virtual timer as the + * adjustment will have already happened. + */ + +int +vtimer_phys_ctl_read(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + uint64_t cntpct_el0; + + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vtimer_cpu = &hypctx->vtimer_cpu; + + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; + if (vtimer_cpu->phys_timer.cntx_cval_el0 < cntpct_el0) + /* Timer condition met */ + *rval = vtimer_cpu->phys_timer.cntx_ctl_el0 | CNTP_CTL_ISTATUS; + else + *rval = vtimer_cpu->phys_timer.cntx_ctl_el0 & ~CNTP_CTL_ISTATUS; + + return (0); +} + +int +vtimer_phys_ctl_write(struct vcpu *vcpu, uint64_t wval, void *arg) +{ + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + uint64_t ctl_el0; + bool timer_toggled_on; + + hypctx = vcpu_get_cookie(vcpu); + vtimer_cpu = &hypctx->vtimer_cpu; + + timer_toggled_on = false; + ctl_el0 = vtimer_cpu->phys_timer.cntx_ctl_el0; + + if (!timer_enabled(ctl_el0) && timer_enabled(wval)) + timer_toggled_on = true; + else if (timer_enabled(ctl_el0) && !timer_enabled(wval)) + vtimer_remove_irq(hypctx, vcpu); + + vtimer_cpu->phys_timer.cntx_ctl_el0 = wval; + + if (timer_toggled_on) + vtimer_schedule_irq(hypctx, true); + + return (0); +} + +int +vtimer_phys_cnt_read(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + struct vm *vm; + struct hyp *hyp; + + vm = vcpu_vm(vcpu); + hyp = vm_get_cookie(vm); + *rval = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; + return (0); +} + +int +vtimer_phys_cnt_write(struct vcpu *vcpu, uint64_t wval, void *arg) +{ + return (0); +} + +int +vtimer_phys_cval_read(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + + hypctx = vcpu_get_cookie(vcpu); + vtimer_cpu = &hypctx->vtimer_cpu; + + *rval = vtimer_cpu->phys_timer.cntx_cval_el0; + + return (0); +} + +int +vtimer_phys_cval_write(struct vcpu *vcpu, uint64_t wval, void *arg) +{ + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + + hypctx = vcpu_get_cookie(vcpu); + vtimer_cpu = &hypctx->vtimer_cpu; + + vtimer_cpu->phys_timer.cntx_cval_el0 = wval; + + vtimer_remove_irq(hypctx, vcpu); + if (timer_enabled(vtimer_cpu->phys_timer.cntx_ctl_el0)) { + vtimer_schedule_irq(hypctx, true); + } + + return (0); +} + +int +vtimer_phys_tval_read(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; 
+ struct vtimer_cpu *vtimer_cpu; + uint32_t cntpct_el0; + + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vtimer_cpu = &hypctx->vtimer_cpu; + + if (!(vtimer_cpu->phys_timer.cntx_ctl_el0 & CNTP_CTL_ENABLE)) { + /* + * ARMv8 Architecture Manual, p. D7-2702: the result of reading + * TVAL when the timer is disabled is UNKNOWN. I have chosen to + * return the maximum value possible on 32 bits which means the + * timer will fire very far into the future. + */ + *rval = (uint32_t)RES1; + } else { + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - + hyp->vtimer.cntvoff_el2; + *rval = vtimer_cpu->phys_timer.cntx_cval_el0 - cntpct_el0; + } + + return (0); +} + +int +vtimer_phys_tval_write(struct vcpu *vcpu, uint64_t wval, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + uint64_t cntpct_el0; + + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vtimer_cpu = &hypctx->vtimer_cpu; + + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; + vtimer_cpu->phys_timer.cntx_cval_el0 = (int32_t)wval + cntpct_el0; + + vtimer_remove_irq(hypctx, vcpu); + if (timer_enabled(vtimer_cpu->phys_timer.cntx_ctl_el0)) { + vtimer_schedule_irq(hypctx, true); + } + + return (0); +} + +struct vtimer_softc { + struct resource *res; + void *ihl; + int rid; +}; + +static int +vtimer_probe(device_t dev) +{ + device_set_desc(dev, "Virtual timer"); + return (BUS_PROBE_DEFAULT); +} + +static int +vtimer_attach(device_t dev) +{ + struct vtimer_softc *sc; + + sc = device_get_softc(dev); + + sc->rid = 0; + sc->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &sc->rid, RF_ACTIVE); + if (sc->res == NULL) + return (ENXIO); + + bus_setup_intr(dev, sc->res, INTR_TYPE_CLK, vtimer_virtual_timer_intr, + NULL, NULL, &sc->ihl); + + return (0); +} + +static device_method_t vtimer_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vtimer_probe), + DEVMETHOD(device_attach, vtimer_attach), + + /* End */ + DEVMETHOD_END +}; + +DEFINE_CLASS_0(vtimer, vtimer_driver, vtimer_methods, + sizeof(struct vtimer_softc)); + +DRIVER_MODULE(vtimer, generic_timer, vtimer_driver, 0, 0); diff --git a/sys/arm64/vmm/io/vtimer.h b/sys/arm64/vmm/io/vtimer.h new file mode 100644 index 000000000000..71a20344d05e --- /dev/null +++ b/sys/arm64/vmm/io/vtimer.h @@ -0,0 +1,85 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2017 The FreeBSD Foundation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
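To make the CNTP_TVAL_EL0 emulation above concrete: TVAL is the signed 32-bit distance between the compare value and the current, offset-adjusted counter, so a TVAL write becomes CVAL = counter + sign-extended value and a TVAL read (with the timer enabled) becomes CVAL - counter. The sketch below is a standalone illustration with made-up counter values, not part of the commit; the helper names are invented.

#include <stdint.h>
#include <stdio.h>

static uint64_t
tval_to_cval(uint64_t cntpct, uint32_t tval)
{
	/* Mirrors vtimer_phys_tval_write(): sign-extend the 32-bit TVAL. */
	return ((uint64_t)(int32_t)tval + cntpct);
}

static uint32_t
cval_to_tval(uint64_t cntpct, uint64_t cval)
{
	/* Mirrors vtimer_phys_tval_read() when the timer is enabled. */
	return ((uint32_t)(cval - cntpct));
}

int
main(void)
{
	uint64_t cntpct = 1000, cval;

	/* Guest arms the timer 500 ticks in the future. */
	cval = tval_to_cval(cntpct, 500);
	printf("cval = %ju\n", (uintmax_t)cval);			/* 1500 */
	printf("tval = %d\n", (int)(int32_t)cval_to_tval(1400, cval));	/* 100 left */
	printf("tval = %d\n", (int)(int32_t)cval_to_tval(1600, cval));	/* -100, expired */
	return (0);
}

Because the read side goes negative once the deadline passes, the ISTATUS bit reported by vtimer_phys_ctl_read() is simply "CVAL is in the past".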
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_VTIMER_H_ +#define _VMM_VTIMER_H_ + +#define GT_PHYS_NS_IRQ 30 +#define GT_VIRT_IRQ 27 + +struct hyp; +struct hypctx; + +struct vtimer { + uint64_t cnthctl_el2; + uint64_t cntvoff_el2; +}; + +struct vtimer_timer { + struct callout callout; + struct mtx mtx; + + uint32_t irqid; + + /* + * These registers are either emulated for the physical timer, or + * the guest has full access to them for the virtual timer. + + * CNTx_CTL_EL0: Counter-timer Timer Control Register + * CNTx_CVAL_EL0: Counter-timer Timer CompareValue Register + */ + uint64_t cntx_cval_el0; + uint64_t cntx_ctl_el0; +}; + +struct vtimer_cpu { + struct vtimer_timer phys_timer; + struct vtimer_timer virt_timer; + + uint32_t cntkctl_el1; +}; + +int vtimer_init(uint64_t cnthctl_el2); +void vtimer_vminit(struct hyp *); +void vtimer_cpuinit(struct hypctx *); +void vtimer_cpucleanup(struct hypctx *); +void vtimer_vmcleanup(struct hyp *); +void vtimer_cleanup(void); +void vtimer_sync_hwstate(struct hypctx *hypctx); + +int vtimer_phys_ctl_read(struct vcpu *vcpu, uint64_t *rval, void *arg); +int vtimer_phys_ctl_write(struct vcpu *vcpu, uint64_t wval, void *arg); +int vtimer_phys_cnt_read(struct vcpu *vcpu, uint64_t *rval, void *arg); +int vtimer_phys_cnt_write(struct vcpu *vcpu, uint64_t wval, void *arg); +int vtimer_phys_cval_read(struct vcpu *vcpu, uint64_t *rval, void *arg); +int vtimer_phys_cval_write(struct vcpu *vcpu, uint64_t wval, void *arg); +int vtimer_phys_tval_read(struct vcpu *vcpu, uint64_t *rval, void *arg); +int vtimer_phys_tval_write(struct vcpu *vcpu, uint64_t wval, void *arg); +#endif diff --git a/sys/arm64/vmm/mmu.h b/sys/arm64/vmm/mmu.h new file mode 100644 index 000000000000..da6aff721055 --- /dev/null +++ b/sys/arm64/vmm/mmu.h @@ -0,0 +1,52 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2017 Alexandru Elisei <alexandru.elisei@gmail.com> + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_MMU_H_ +#define _VMM_MMU_H_ + +#include <machine/machdep.h> +#include <machine/vmparam.h> +#include <machine/vmm.h> + +#include "hyp.h" + +extern char vmm_hyp_code; +extern char vmm_hyp_code_end; + +extern char _vmm_start; +extern char _vmm_end; + +bool vmmpmap_init(void); +void vmmpmap_fini(void); +uint64_t vmmpmap_to_ttbr0(void); +bool vmmpmap_enter(vm_offset_t, vm_size_t, vm_paddr_t, vm_prot_t); +void vmmpmap_remove(vm_offset_t, vm_size_t, bool); + +#endif diff --git a/sys/arm64/vmm/reset.h b/sys/arm64/vmm/reset.h new file mode 100644 index 000000000000..0a5a7f2906a3 --- /dev/null +++ b/sys/arm64/vmm/reset.h @@ -0,0 +1,33 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2018 Alexandru Elisei <alexandru.elisei@gmail.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _VMM_RESET_H_ +#define _VMM_RESET_H_ + +void reset_vm_el01_regs(void *vcpu); +void reset_vm_el2_regs(void *vcpu); + +#endif diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c new file mode 100644 index 000000000000..2685e5869b4f --- /dev/null +++ b/sys/arm64/vmm/vmm.c @@ -0,0 +1,1803 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/cpuset.h> +#include <sys/kernel.h> +#include <sys/linker.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/rwlock.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/vm_param.h> + +#include <machine/armreg.h> +#include <machine/cpu.h> +#include <machine/fpu.h> +#include <machine/machdep.h> +#include <machine/pcb.h> +#include <machine/smp.h> +#include <machine/vm.h> +#include <machine/vmparam.h> +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/vmm_instruction_emul.h> + +#include <dev/pci/pcireg.h> + +#include "vmm_ktr.h" +#include "vmm_stat.h" +#include "arm64.h" +#include "mmu.h" + +#include "io/vgic.h" +#include "io/vtimer.h" + +struct vcpu { + int flags; + enum vcpu_state state; + struct mtx mtx; + int hostcpu; /* host cpuid this vcpu last ran on */ + int vcpuid; + void *stats; + struct vm_exit exitinfo; + uint64_t nextpc; /* (x) next instruction to execute */ + struct vm *vm; /* (o) */ + void *cookie; /* (i) cpu-specific data */ + struct vfpstate *guestfpu; /* (a,i) guest fpu state */ +}; + +#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) +#define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) +#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) +#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) + +struct mem_seg { + uint64_t gpa; + size_t len; + bool wired; + bool sysmem; + vm_object_t object; +}; +#define VM_MAX_MEMSEGS 3 + +struct mem_map { + vm_paddr_t gpa; + size_t len; + vm_ooffset_t segoff; + int segid; + int prot; + int flags; +}; +#define VM_MAX_MEMMAPS 4 + +struct vmm_mmio_region { + uint64_t start; + uint64_t end; + mem_region_read_t read; + mem_region_write_t write; +}; +#define VM_MAX_MMIO_REGIONS 4 + +struct vmm_special_reg { + uint32_t esr_iss; + uint32_t esr_mask; + reg_read_t reg_read; + reg_write_t reg_write; + void *arg; +}; +#define VM_MAX_SPECIAL_REGS 16 + +/* + * Initialization: + * (o) initialized the first time the VM is created + * (i) initialized when VM is created and when it is reinitialized + * (x) initialized before use + */ +struct vm { + void *cookie; /* (i) cpu-specific data */ + volatile cpuset_t active_cpus; /* (i) active vcpus */ + volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ + int suspend; /* (i) stop VM execution 
*/ + volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ + volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ + struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ + struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ + struct vmspace *vmspace; /* (o) guest's address space */ + char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ + struct vcpu **vcpu; /* (i) guest vcpus */ + struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS]; + /* (o) guest MMIO regions */ + struct vmm_special_reg special_reg[VM_MAX_SPECIAL_REGS]; + /* The following describe the vm cpu topology */ + uint16_t sockets; /* (o) num of sockets */ + uint16_t cores; /* (o) num of cores/socket */ + uint16_t threads; /* (o) num of threads/core */ + uint16_t maxcpus; /* (o) max pluggable cpus */ + struct sx mem_segs_lock; /* (o) */ + struct sx vcpus_init_lock; /* (o) */ +}; + +static bool vmm_initialized = false; + +static int vm_handle_wfi(struct vcpu *vcpu, + struct vm_exit *vme, bool *retu); + +static MALLOC_DEFINE(M_VMM, "vmm", "vmm"); + +/* statistics */ +static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +static int vmm_ipinum; +SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, + "IPI vector used for vcpu notifications"); + +struct vmm_regs { + uint64_t id_aa64afr0; + uint64_t id_aa64afr1; + uint64_t id_aa64dfr0; + uint64_t id_aa64dfr1; + uint64_t id_aa64isar0; + uint64_t id_aa64isar1; + uint64_t id_aa64isar2; + uint64_t id_aa64mmfr0; + uint64_t id_aa64mmfr1; + uint64_t id_aa64mmfr2; + uint64_t id_aa64pfr0; + uint64_t id_aa64pfr1; +}; + +static const struct vmm_regs vmm_arch_regs_masks = { + .id_aa64dfr0 = + ID_AA64DFR0_CTX_CMPs_MASK | + ID_AA64DFR0_WRPs_MASK | + ID_AA64DFR0_BRPs_MASK | + ID_AA64DFR0_PMUVer_3 | + ID_AA64DFR0_DebugVer_8, + .id_aa64isar0 = + ID_AA64ISAR0_TLB_TLBIOSR | + ID_AA64ISAR0_SHA3_IMPL | + ID_AA64ISAR0_RDM_IMPL | + ID_AA64ISAR0_Atomic_IMPL | + ID_AA64ISAR0_CRC32_BASE | + ID_AA64ISAR0_SHA2_512 | + ID_AA64ISAR0_SHA1_BASE | + ID_AA64ISAR0_AES_PMULL, + .id_aa64mmfr0 = + ID_AA64MMFR0_TGran4_IMPL | + ID_AA64MMFR0_TGran64_IMPL | + ID_AA64MMFR0_TGran16_IMPL | + ID_AA64MMFR0_ASIDBits_16 | + ID_AA64MMFR0_PARange_4P, + .id_aa64mmfr1 = + ID_AA64MMFR1_SpecSEI_IMPL | + ID_AA64MMFR1_PAN_ATS1E1 | + ID_AA64MMFR1_HAFDBS_AF, + .id_aa64pfr0 = + ID_AA64PFR0_GIC_CPUIF_NONE | + ID_AA64PFR0_AdvSIMD_HP | + ID_AA64PFR0_FP_HP | + ID_AA64PFR0_EL3_64 | + ID_AA64PFR0_EL2_64 | + ID_AA64PFR0_EL1_64 | + ID_AA64PFR0_EL0_64, +}; + +/* Host registers masked by vmm_arch_regs_masks. */ +static struct vmm_regs vmm_arch_regs; + +u_int vm_maxcpu; +SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &vm_maxcpu, 0, "Maximum number of vCPUs"); + +static void vm_free_memmap(struct vm *vm, int ident); +static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); +static void vcpu_notify_event_locked(struct vcpu *vcpu); + +/* + * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this + * is a safe value for now. 
+ */ +#define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) + +static int +vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks) +{ +#define _FETCH_KERN_REG(reg, field) do { \ + regs->field = vmm_arch_regs_masks.field; \ + if (!get_kernel_reg_masked(reg, ®s->field, masks->field)) \ + regs->field = 0; \ +} while (0) + _FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0); + _FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1); + _FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0); + _FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1); + _FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0); + _FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1); + _FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2); + _FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0); + _FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1); + _FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2); + _FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0); + _FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1); +#undef _FETCH_KERN_REG + return (0); +} + +static void +vcpu_cleanup(struct vcpu *vcpu, bool destroy) +{ + vmmops_vcpu_cleanup(vcpu->cookie); + vcpu->cookie = NULL; + if (destroy) { + vmm_stat_free(vcpu->stats); + fpu_save_area_free(vcpu->guestfpu); + vcpu_lock_destroy(vcpu); + } +} + +static struct vcpu * +vcpu_alloc(struct vm *vm, int vcpu_id) +{ + struct vcpu *vcpu; + + KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, + ("vcpu_alloc: invalid vcpu %d", vcpu_id)); + + vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO); + vcpu_lock_init(vcpu); + vcpu->state = VCPU_IDLE; + vcpu->hostcpu = NOCPU; + vcpu->vcpuid = vcpu_id; + vcpu->vm = vm; + vcpu->guestfpu = fpu_save_area_alloc(); + vcpu->stats = vmm_stat_alloc(); + return (vcpu); +} + +static void +vcpu_init(struct vcpu *vcpu) +{ + vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid); + MPASS(vcpu->cookie != NULL); + fpu_save_area_reset(vcpu->guestfpu); + vmm_stat_init(vcpu->stats); +} + +struct vm_exit * +vm_exitinfo(struct vcpu *vcpu) +{ + return (&vcpu->exitinfo); +} + +static int +vmm_init(void) +{ + int error; + + vm_maxcpu = mp_ncpus; + TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); + + if (vm_maxcpu > VM_MAXCPU) { + printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); + vm_maxcpu = VM_MAXCPU; + } + if (vm_maxcpu == 0) + vm_maxcpu = 1; + + error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks); + if (error != 0) + return (error); + + return (vmmops_modinit(0)); +} + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + /* TODO: if (vmm_is_hw_supported()) { */ + vmmdev_init(); + error = vmm_init(); + if (error == 0) + vmm_initialized = true; + break; + case MOD_UNLOAD: + /* TODO: if (vmm_is_hw_supported()) { */ + error = vmmdev_cleanup(); + if (error == 0 && vmm_initialized) { + error = vmmops_modcleanup(); + if (error) + vmm_initialized = false; + } + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * vmm initialization has the following dependencies: + * + * - HYP initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). 
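The _FETCH_KERN_REG() loop in vmm_regs_init() above builds the guest-visible ID registers by masking the host values with vmm_arch_regs_masks, so any feature field the hypervisor is not prepared to handle reads as zero ("not implemented") from the guest. A minimal standalone sketch of that masking idea follows; it is not part of the commit and the field layout is invented purely for illustration.

#include <stdint.h>
#include <stdio.h>

static uint64_t
guest_id_reg(uint64_t host_val, uint64_t allow_mask)
{
	/* Only fields covered by the allow-mask are exposed to the guest. */
	return (host_val & allow_mask);
}

int
main(void)
{
	/* Pretend the host advertises features in bits [3:0] and [11:8]. */
	uint64_t host = 0x0000000000000f0fUL;
	/* Only the low field is emulated, so only it is exposed. */
	uint64_t mask = 0x000000000000000fUL;

	printf("guest sees %#jx\n", (uintmax_t)guest_id_reg(host, mask));
	return (0);
}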
+ */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +static void +vm_init(struct vm *vm, bool create) +{ + int i; + + vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); + MPASS(vm->cookie != NULL); + + CPU_ZERO(&vm->active_cpus); + CPU_ZERO(&vm->debug_cpus); + + vm->suspend = 0; + CPU_ZERO(&vm->suspended_cpus); + + memset(vm->mmio_region, 0, sizeof(vm->mmio_region)); + memset(vm->special_reg, 0, sizeof(vm->special_reg)); + + if (!create) { + for (i = 0; i < vm->maxcpus; i++) { + if (vm->vcpu[i] != NULL) + vcpu_init(vm->vcpu[i]); + } + } +} + +struct vcpu * +vm_alloc_vcpu(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) + return (NULL); + + /* Some interrupt controllers may have a CPU limit */ + if (vcpuid >= vgic_max_cpu_count(vm->cookie)) + return (NULL); + + vcpu = atomic_load_ptr(&vm->vcpu[vcpuid]); + if (__predict_true(vcpu != NULL)) + return (vcpu); + + sx_xlock(&vm->vcpus_init_lock); + vcpu = vm->vcpu[vcpuid]; + if (vcpu == NULL/* && !vm->dying*/) { + vcpu = vcpu_alloc(vm, vcpuid); + vcpu_init(vcpu); + + /* + * Ensure vCPU is fully created before updating pointer + * to permit unlocked reads above. + */ + atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid], + (uintptr_t)vcpu); + } + sx_xunlock(&vm->vcpus_init_lock); + return (vcpu); +} + +void +vm_slock_vcpus(struct vm *vm) +{ + sx_slock(&vm->vcpus_init_lock); +} + +void +vm_unlock_vcpus(struct vm *vm) +{ + sx_unlock(&vm->vcpus_init_lock); +} + +int +vm_create(const char *name, struct vm **retvm) +{ + struct vm *vm; + struct vmspace *vmspace; + + /* + * If vmm.ko could not be successfully initialized then don't attempt + * to create the virtual machine. + */ + if (!vmm_initialized) + return (ENXIO); + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (EINVAL); + + vmspace = vmmops_vmspace_alloc(0, 1ul << 39); + if (vmspace == NULL) + return (ENOMEM); + + vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->vmspace = vmspace; + sx_init(&vm->mem_segs_lock, "vm mem_segs"); + sx_init(&vm->vcpus_init_lock, "vm vcpus"); + + vm->sockets = 1; + vm->cores = 1; /* XXX backwards compatibility */ + vm->threads = 1; /* XXX backwards compatibility */ + vm->maxcpus = vm_maxcpu; + + vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM, + M_WAITOK | M_ZERO); + + vm_init(vm, true); + + *retvm = vm; + return (0); +} + +void +vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus) +{ + *sockets = vm->sockets; + *cores = vm->cores; + *threads = vm->threads; + *maxcpus = vm->maxcpus; +} + +uint16_t +vm_get_maxcpus(struct vm *vm) +{ + return (vm->maxcpus); +} + +int +vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus) +{ + /* Ignore maxcpus. 
*/ + if ((sockets * cores * threads) > vm->maxcpus) + return (EINVAL); + vm->sockets = sockets; + vm->cores = cores; + vm->threads = threads; + return(0); +} + +static void +vm_cleanup(struct vm *vm, bool destroy) +{ + struct mem_map *mm; + pmap_t pmap __diagused; + int i; + + if (destroy) { + pmap = vmspace_pmap(vm->vmspace); + sched_pin(); + PCPU_SET(curvmpmap, NULL); + sched_unpin(); + CPU_FOREACH(i) { + MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap); + } + } + + vgic_detach_from_vm(vm->cookie); + + for (i = 0; i < vm->maxcpus; i++) { + if (vm->vcpu[i] != NULL) + vcpu_cleanup(vm->vcpu[i], destroy); + } + + vmmops_cleanup(vm->cookie); + + /* + * System memory is removed from the guest address space only when + * the VM is destroyed. This is because the mapping remains the same + * across VM reset. + * + * Device memory can be relocated by the guest (e.g. using PCI BARs) + * so those mappings are removed on a VM reset. + */ + if (!destroy) { + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (destroy || !sysmem_mapping(vm, mm)) + vm_free_memmap(vm, i); + } + } + + if (destroy) { + for (i = 0; i < VM_MAX_MEMSEGS; i++) + vm_free_memseg(vm, i); + + vmmops_vmspace_free(vm->vmspace); + vm->vmspace = NULL; + + for (i = 0; i < vm->maxcpus; i++) + free(vm->vcpu[i], M_VMM); + free(vm->vcpu, M_VMM); + sx_destroy(&vm->vcpus_init_lock); + sx_destroy(&vm->mem_segs_lock); + } +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); + free(vm, M_VMM); +} + +int +vm_reinit(struct vm *vm) +{ + int error; + + /* + * A virtual machine can be reset only if all vcpus are suspended. + */ + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + vm_cleanup(vm, false); + vm_init(vm, false); + error = 0; + } else { + error = EBUSY; + } + + return (error); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +void +vm_slock_memsegs(struct vm *vm) +{ + sx_slock(&vm->mem_segs_lock); +} + +void +vm_xlock_memsegs(struct vm *vm) +{ + sx_xlock(&vm->mem_segs_lock); +} + +void +vm_unlock_memsegs(struct vm *vm) +{ + sx_unlock(&vm->mem_segs_lock); +} + +/* + * Return 'true' if 'gpa' is allocated in the guest address space. + * + * This function is called in the context of a running vcpu which acts as + * an implicit lock on 'vm->mem_maps[]'. 
+ */ +bool +vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa) +{ + struct vm *vm = vcpu->vm; + struct mem_map *mm; + int i; + +#ifdef INVARIANTS + int hostcpu, state; + state = vcpu_get_state(vcpu, &hostcpu); + KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, + ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); +#endif + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) + return (true); /* 'gpa' is sysmem or devmem */ + } + + return (false); +} + +int +vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) +{ + struct mem_seg *seg; + vm_object_t obj; + + sx_assert(&vm->mem_segs_lock, SX_XLOCKED); + + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); + + if (len == 0 || (len & PAGE_MASK)) + return (EINVAL); + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + if (seg->len == len && seg->sysmem == sysmem) + return (EEXIST); + else + return (EINVAL); + } + + obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); + if (obj == NULL) + return (ENOMEM); + + seg->len = len; + seg->object = obj; + seg->sysmem = sysmem; + return (0); +} + +int +vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + vm_object_t *objptr) +{ + struct mem_seg *seg; + + sx_assert(&vm->mem_segs_lock, SX_LOCKED); + + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[ident]; + if (len) + *len = seg->len; + if (sysmem) + *sysmem = seg->sysmem; + if (objptr) + *objptr = seg->object; + return (0); +} + +void +vm_free_memseg(struct vm *vm, int ident) +{ + struct mem_seg *seg; + + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + vm_object_deallocate(seg->object); + bzero(seg, sizeof(struct mem_seg)); + } +} + +int +vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, + size_t len, int prot, int flags) +{ + struct mem_seg *seg; + struct mem_map *m, *map; + vm_ooffset_t last; + int i, error; + + if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) + return (EINVAL); + + if (flags & ~VM_MEMMAP_F_WIRED) + return (EINVAL); + + if (segid < 0 || segid >= VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[segid]; + if (seg->object == NULL) + return (EINVAL); + + last = first + len; + if (first < 0 || first >= last || last > seg->len) + return (EINVAL); + + if ((gpa | first | last) & PAGE_MASK) + return (EINVAL); + + map = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + m = &vm->mem_maps[i]; + if (m->len == 0) { + map = m; + break; + } + } + + if (map == NULL) + return (ENOSPC); + + error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, + len, 0, VMFS_NO_SPACE, prot, prot, 0); + if (error != KERN_SUCCESS) + return (EFAULT); + + vm_object_reference(seg->object); + + if (flags & VM_MEMMAP_F_WIRED) { + error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + if (error != KERN_SUCCESS) { + vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); + return (error == KERN_RESOURCE_SHORTAGE ? 
ENOMEM : + EFAULT); + } + } + + map->gpa = gpa; + map->len = len; + map->segoff = first; + map->segid = segid; + map->prot = prot; + map->flags = flags; + return (0); +} + +int +vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + struct mem_map *m; + int i; + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + m = &vm->mem_maps[i]; + if (m->gpa == gpa && m->len == len) { + vm_free_memmap(vm, i); + return (0); + } + } + + return (EINVAL); +} + +int +vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) +{ + struct mem_map *mm, *mmnext; + int i; + + mmnext = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len == 0 || mm->gpa < *gpa) + continue; + if (mmnext == NULL || mm->gpa < mmnext->gpa) + mmnext = mm; + } + + if (mmnext != NULL) { + *gpa = mmnext->gpa; + if (segid) + *segid = mmnext->segid; + if (segoff) + *segoff = mmnext->segoff; + if (len) + *len = mmnext->len; + if (prot) + *prot = mmnext->prot; + if (flags) + *flags = mmnext->flags; + return (0); + } else { + return (ENOENT); + } +} + +static void +vm_free_memmap(struct vm *vm, int ident) +{ + struct mem_map *mm; + int error __diagused; + + mm = &vm->mem_maps[ident]; + if (mm->len) { + error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, + mm->gpa + mm->len); + KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", + __func__, error)); + bzero(mm, sizeof(struct mem_map)); + } +} + +static __inline bool +sysmem_mapping(struct vm *vm, struct mem_map *mm) +{ + + if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) + return (true); + else + return (false); +} + +vm_paddr_t +vmm_sysmem_maxaddr(struct vm *vm) +{ + struct mem_map *mm; + vm_paddr_t maxaddr; + int i; + + maxaddr = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm)) { + if (maxaddr < mm->gpa + mm->len) + maxaddr = mm->gpa + mm->len; + } + } + return (maxaddr); +} + +int +vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault) +{ + + vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault); + return (0); +} + +static int +vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + *rval = 0; + return (0); +} + +static int +vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + *rval = *(uint64_t *)arg; + return (0); +} + +static int +vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg) +{ + return (0); +} + +static const struct vmm_special_reg vmm_special_regs[] = { +#define SPECIAL_REG(_reg, _read, _write) \ + { \ + .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \ + ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \ + ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \ + ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \ + ((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \ + .esr_mask = ISS_MSR_REG_MASK, \ + .reg_read = (_read), \ + .reg_write = (_write), \ + .arg = NULL, \ + } +#define ID_SPECIAL_REG(_reg, _name) \ + { \ + .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \ + ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \ + ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \ + ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \ + ((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \ + .esr_mask = ISS_MSR_REG_MASK, \ + .reg_read = vmm_reg_read_arg, \ + .reg_write = vmm_reg_wi, \ + .arg = &(vmm_arch_regs._name), \ + } + + /* ID registers */ + ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0), + ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0), + ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0), + 
ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0), + ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1), + + /* + * All other ID registers are read as zero. + * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space. + */ + { + .esr_iss = (3 << ISS_MSR_OP0_SHIFT) | + (0 << ISS_MSR_OP1_SHIFT) | + (0 << ISS_MSR_CRn_SHIFT) | + (0 << ISS_MSR_CRm_SHIFT), + .esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK | + ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT), + .reg_read = vmm_reg_raz, + .reg_write = vmm_reg_wi, + .arg = NULL, + }, + + /* Counter physical registers */ + SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write), + SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read, + vtimer_phys_cval_write), + SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read, + vtimer_phys_tval_write), + SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write), +#undef SPECIAL_REG +}; + +void +vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask, + reg_read_t reg_read, reg_write_t reg_write, void *arg) +{ + int i; + + for (i = 0; i < nitems(vm->special_reg); i++) { + if (vm->special_reg[i].esr_iss == 0 && + vm->special_reg[i].esr_mask == 0) { + vm->special_reg[i].esr_iss = iss; + vm->special_reg[i].esr_mask = mask; + vm->special_reg[i].reg_read = reg_read; + vm->special_reg[i].reg_write = reg_write; + vm->special_reg[i].arg = arg; + return; + } + } + + panic("%s: No free special register slot", __func__); +} + +void +vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask) +{ + int i; + + for (i = 0; i < nitems(vm->special_reg); i++) { + if (vm->special_reg[i].esr_iss == iss && + vm->special_reg[i].esr_mask == mask) { + memset(&vm->special_reg[i], 0, + sizeof(vm->special_reg[i])); + return; + } + } + + panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss, + mask); +} + +static int +vm_handle_reg_emul(struct vcpu *vcpu, bool *retu) +{ + struct vm *vm; + struct vm_exit *vme; + struct vre *vre; + int i, rv; + + vm = vcpu->vm; + vme = &vcpu->exitinfo; + vre = &vme->u.reg_emul.vre; + + for (i = 0; i < nitems(vm->special_reg); i++) { + if (vm->special_reg[i].esr_iss == 0 && + vm->special_reg[i].esr_mask == 0) + continue; + + if ((vre->inst_syndrome & vm->special_reg[i].esr_mask) == + vm->special_reg[i].esr_iss) { + rv = vmm_emulate_register(vcpu, vre, + vm->special_reg[i].reg_read, + vm->special_reg[i].reg_write, + vm->special_reg[i].arg); + if (rv == 0) { + *retu = false; + } + return (rv); + } + } + for (i = 0; i < nitems(vmm_special_regs); i++) { + if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) == + vmm_special_regs[i].esr_iss) { + rv = vmm_emulate_register(vcpu, vre, + vmm_special_regs[i].reg_read, + vmm_special_regs[i].reg_write, + vmm_special_regs[i].arg); + if (rv == 0) { + *retu = false; + } + return (rv); + } + } + + + *retu = true; + return (0); +} + +void +vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size, + mem_region_read_t mmio_read, mem_region_write_t mmio_write) +{ + int i; + + for (i = 0; i < nitems(vm->mmio_region); i++) { + if (vm->mmio_region[i].start == 0 && + vm->mmio_region[i].end == 0) { + vm->mmio_region[i].start = start; + vm->mmio_region[i].end = start + size; + vm->mmio_region[i].read = mmio_read; + vm->mmio_region[i].write = mmio_write; + return; + } + } + + panic("%s: No free MMIO region", __func__); +} + +void +vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size) +{ + int i; + + for (i = 0; i < nitems(vm->mmio_region); i++) { + if (vm->mmio_region[i].start == start && + 
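vm_handle_reg_emul() above resolves a trapped MRS/MSR by scanning the per-VM table and then vmm_special_regs[] for the first entry whose (esr_iss, esr_mask) pair matches the instruction syndrome; that is how the single wide-mask entry turns every remaining ID register into read-as-zero while the earlier exact entries win for the registers that are emulated. Below is a minimal standalone sketch of that first-match rule, not part of the commit, using invented toy encodings and placeholder handler names.

#include <stdint.h>
#include <stdio.h>

struct reg_match {
	uint32_t	esr_iss;	/* pattern to match */
	uint32_t	esr_mask;	/* which ISS bits participate */
	const char	*handler;	/* stand-in for reg_read/reg_write */
};

static const struct reg_match table[] = {
	{ 0x12, 0xff, "exact-register handler" },	/* matches ISS 0x12 only */
	{ 0x10, 0xf0, "read-as-zero catch-all" },	/* matches any 0x1x */
};

static const char *
lookup(uint32_t iss)
{
	for (unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if ((iss & table[i].esr_mask) == table[i].esr_iss)
			return (table[i].handler);
	return ("forward to userspace");
}

int
main(void)
{
	printf("%s\n", lookup(0x12));	/* exact entry wins */
	printf("%s\n", lookup(0x15));	/* caught by the wide-mask entry */
	printf("%s\n", lookup(0x25));	/* no match, userspace handles it */
	return (0);
}

Table order matters with this scheme: the exact entries must precede the catch-all, exactly as in the vmm_special_regs[] initializer above.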
vm->mmio_region[i].end == start + size) { + memset(&vm->mmio_region[i], 0, + sizeof(vm->mmio_region[i])); + return; + } + } + + panic("%s: Invalid MMIO region: %lx - %lx", __func__, start, + start + size); +} + +static int +vm_handle_inst_emul(struct vcpu *vcpu, bool *retu) +{ + struct vm *vm; + struct vm_exit *vme; + struct vie *vie; + struct hyp *hyp; + uint64_t fault_ipa; + struct vm_guest_paging *paging; + struct vmm_mmio_region *vmr; + int error, i; + + vm = vcpu->vm; + hyp = vm->cookie; + if (!hyp->vgic_attached) + goto out_user; + + vme = &vcpu->exitinfo; + vie = &vme->u.inst_emul.vie; + paging = &vme->u.inst_emul.paging; + + fault_ipa = vme->u.inst_emul.gpa; + + vmr = NULL; + for (i = 0; i < nitems(vm->mmio_region); i++) { + if (vm->mmio_region[i].start <= fault_ipa && + vm->mmio_region[i].end > fault_ipa) { + vmr = &vm->mmio_region[i]; + break; + } + } + if (vmr == NULL) + goto out_user; + + error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging, + vmr->read, vmr->write, retu); + return (error); + +out_user: + *retu = true; + return (0); +} + +int +vm_suspend(struct vm *vm, enum vm_suspend_how how) +{ + int i; + + if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) + return (EINVAL); + + if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { + VM_CTR2(vm, "virtual machine already suspended %d/%d", + vm->suspend, how); + return (EALREADY); + } + + VM_CTR1(vm, "virtual machine successfully suspended %d", how); + + /* + * Notify all active vcpus that they are now suspended. + */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm_vcpu(vm, i)); + } + + return (0); +} + +void +vm_exit_suspended(struct vcpu *vcpu, uint64_t pc) +{ + struct vm *vm = vcpu->vm; + struct vm_exit *vmexit; + + KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, + ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); + + vmexit = vm_exitinfo(vcpu); + vmexit->pc = pc; + vmexit->inst_length = 4; + vmexit->exitcode = VM_EXITCODE_SUSPENDED; + vmexit->u.suspended.how = vm->suspend; +} + +void +vm_exit_debug(struct vcpu *vcpu, uint64_t pc) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vcpu); + vmexit->pc = pc; + vmexit->inst_length = 4; + vmexit->exitcode = VM_EXITCODE_DEBUG; +} + +int +vm_activate_cpu(struct vcpu *vcpu) +{ + struct vm *vm = vcpu->vm; + + if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) + return (EBUSY); + + CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus); + return (0); + +} + +int +vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) +{ + if (vcpu == NULL) { + vm->debug_cpus = vm->active_cpus; + for (int i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm_vcpu(vm, i)); + } + } else { + if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) + return (EINVAL); + + CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); + vcpu_notify_event(vcpu); + } + return (0); +} + +int +vm_resume_cpu(struct vm *vm, struct vcpu *vcpu) +{ + + if (vcpu == NULL) { + CPU_ZERO(&vm->debug_cpus); + } else { + if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus)) + return (EINVAL); + + CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); + } + return (0); +} + +int +vcpu_debugged(struct vcpu *vcpu) +{ + + return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus)); +} + +cpuset_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +cpuset_t +vm_debug_cpus(struct vm *vm) +{ + + return (vm->debug_cpus); +} + +cpuset_t +vm_suspended_cpus(struct vm *vm) +{ + + return (vm->suspended_cpus); +} + + +void * +vcpu_stats(struct 
vcpu *vcpu) +{ + + return (vcpu->stats); +} + +/* + * This function is called to ensure that a vcpu "sees" a pending event + * as soon as possible: + * - If the vcpu thread is sleeping then it is woken up. + * - If the vcpu is running on a different host_cpu then an IPI will be directed + * to the host_cpu to cause the vcpu to trap into the hypervisor. + */ +static void +vcpu_notify_event_locked(struct vcpu *vcpu) +{ + int hostcpu; + + hostcpu = vcpu->hostcpu; + if (vcpu->state == VCPU_RUNNING) { + KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); + if (hostcpu != curcpu) { + ipi_cpu(hostcpu, vmm_ipinum); + } else { + /* + * If the 'vcpu' is running on 'curcpu' then it must + * be sending a notification to itself (e.g. SELF_IPI). + * The pending event will be picked up when the vcpu + * transitions back to guest context. + */ + } + } else { + KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " + "with hostcpu %d", vcpu->state, hostcpu)); + if (vcpu->state == VCPU_SLEEPING) + wakeup_one(vcpu); + } +} + +void +vcpu_notify_event(struct vcpu *vcpu) +{ + vcpu_lock(vcpu); + vcpu_notify_event_locked(vcpu); + vcpu_unlock(vcpu); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + + /* flush host state to the pcb */ + vfp_save_state(curthread, curthread->td_pcb); + /* Ensure the VFP state will be re-loaded when exiting the guest */ + PCPU_SET(fpcurthread, NULL); + + /* restore guest FPU state */ + vfp_enable(); + vfp_restore(vcpu->guestfpu); + + /* + * The FPU is now "dirty" with the guest's state so turn on emulation + * to trap any access to the FPU by the host. + */ + vfp_disable(); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) != + CPACR_FPEN_TRAP_ALL1) + panic("VFP not enabled in host!"); + + /* save guest FPU state */ + vfp_enable(); + vfp_store(vcpu->guestfpu); + vfp_disable(); + + KASSERT(PCPU_GET(fpcurthread) == NULL, + ("%s: fpcurthread set with guest registers", __func__)); +} +static int +vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, + bool from_idle) +{ + int error; + + vcpu_assert_locked(vcpu); + + /* + * State transitions from the vmmdev_ioctl() must always begin from + * the VCPU_IDLE state. This guarantees that there is only a single + * ioctl() operating on a vcpu at any point. 
+ */ + if (from_idle) { + while (vcpu->state != VCPU_IDLE) { + vcpu_notify_event_locked(vcpu); + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); + } + } else { + KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " + "vcpu idle state")); + } + + if (vcpu->state == VCPU_RUNNING) { + KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " + "mismatch for running vcpu", curcpu, vcpu->hostcpu)); + } else { + KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " + "vcpu that is not running", vcpu->hostcpu)); + } + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + default: + error = 1; + break; + } + + if (error) + return (EBUSY); + + vcpu->state = newstate; + if (newstate == VCPU_RUNNING) + vcpu->hostcpu = curcpu; + else + vcpu->hostcpu = NOCPU; + + if (newstate == VCPU_IDLE) + wakeup(&vcpu->state); + + return (0); +} + +static void +vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state(vcpu, newstate, false)) != 0) + panic("Error %d setting state to %d\n", error, newstate); +} + +static void +vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) + panic("Error %d setting state to %d", error, newstate); +} + +int +vm_get_capability(struct vcpu *vcpu, int type, int *retval) +{ + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (vmmops_getcap(vcpu->cookie, type, retval)); +} + +int +vm_set_capability(struct vcpu *vcpu, int type, int val) +{ + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (vmmops_setcap(vcpu->cookie, type, val)); +} + +struct vm * +vcpu_vm(struct vcpu *vcpu) +{ + return (vcpu->vm); +} + +int +vcpu_vcpuid(struct vcpu *vcpu) +{ + return (vcpu->vcpuid); +} + +void * +vcpu_get_cookie(struct vcpu *vcpu) +{ + return (vcpu->cookie); +} + +struct vcpu * +vm_vcpu(struct vm *vm, int vcpuid) +{ + return (vm->vcpu[vcpuid]); +} + +int +vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) +{ + int error; + + vcpu_lock(vcpu); + error = vcpu_set_state_locked(vcpu, newstate, from_idle); + vcpu_unlock(vcpu); + + return (error); +} + +enum vcpu_state +vcpu_get_state(struct vcpu *vcpu, int *hostcpu) +{ + enum vcpu_state state; + + vcpu_lock(vcpu); + state = vcpu->state; + if (hostcpu != NULL) + *hostcpu = vcpu->hostcpu; + vcpu_unlock(vcpu); + + return (state); +} + +static void * +_vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) +{ + int i, count, pageoff; + struct mem_map *mm; + vm_page_t m; + + pageoff = gpa & PAGE_MASK; + if (len > PAGE_SIZE - pageoff) + panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); + + count = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm) && gpa >= mm->gpa && + gpa < mm->gpa + mm->len) { + count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, + trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + break; + } + } + + if (count == 1) { + *cookie = m; + return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); + } else { + *cookie = NULL; + return (NULL); + } +} + +void * +vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int 
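The transition table documented in vcpu_set_state_locked() above reduces to one rule: VCPU_FROZEN is the hub state, every other state may only be entered from FROZEN, and may only return to FROZEN. A minimal standalone restatement of that rule, not part of the commit and using a simplified local copy of the state enum, is:

#include <assert.h>
#include <stdbool.h>

enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING };

static bool
vcpu_transition_ok(enum vcpu_state from, enum vcpu_state to)
{
	if (from == VCPU_FROZEN)
		return (to != VCPU_FROZEN);	/* FROZEN -> IDLE/RUNNING/SLEEPING */
	return (to == VCPU_FROZEN);		/* everything else -> FROZEN only */
}

int
main(void)
{
	assert(vcpu_transition_ok(VCPU_IDLE, VCPU_FROZEN));
	assert(!vcpu_transition_ok(VCPU_IDLE, VCPU_RUNNING));
	assert(vcpu_transition_ok(VCPU_FROZEN, VCPU_SLEEPING));
	return (0);
}

This is why the ioctl path always goes through FROZEN: it guarantees a single owner of the vcpu at any point, as the comment above notes.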
reqprot, + void **cookie) +{ +#ifdef INVARIANTS + /* + * The current vcpu should be frozen to ensure 'vm_memmap[]' + * stability. + */ + int state = vcpu_get_state(vcpu, NULL); + KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", + __func__, state)); +#endif + return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie)); +} + +void * +vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) +{ + sx_assert(&vm->mem_segs_lock, SX_LOCKED); + return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie)); +} + +void +vm_gpa_release(void *cookie) +{ + vm_page_t m = cookie; + + vm_page_unwire(m, PQ_ACTIVE); +} + +int +vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) +{ + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (vmmops_getreg(vcpu->cookie, reg, retval)); +} + +int +vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) +{ + int error; + + if (reg >= VM_REG_LAST) + return (EINVAL); + error = vmmops_setreg(vcpu->cookie, reg, val); + if (error || reg != VM_REG_GUEST_PC) + return (error); + + vcpu->nextpc = val; + + return (0); +} + +void * +vm_get_cookie(struct vm *vm) +{ + return (vm->cookie); +} + +int +vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far) +{ + return (vmmops_exception(vcpu->cookie, esr, far)); +} + +int +vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr) +{ + return (vgic_attach_to_vm(vm->cookie, descr)); +} + +int +vm_assert_irq(struct vm *vm, uint32_t irq) +{ + return (vgic_inject_irq(vm->cookie, -1, irq, true)); +} + +int +vm_deassert_irq(struct vm *vm, uint32_t irq) +{ + return (vgic_inject_irq(vm->cookie, -1, irq, false)); +} + +int +vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, + int func) +{ + /* TODO: Should we raise an SError? */ + return (vgic_inject_msi(vm->cookie, msg, addr)); +} + +static int +vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) +{ + struct hypctx *hypctx; + int i; + + hypctx = vcpu_get_cookie(vcpu); + + if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0) + return (1); + + vme->exitcode = VM_EXITCODE_SMCCC; + vme->u.smccc_call.func_id = hypctx->tf.tf_x[0]; + for (i = 0; i < nitems(vme->u.smccc_call.args); i++) + vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1]; + + *retu = true; + return (0); +} + +static int +vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) +{ + vcpu_lock(vcpu); + while (1) { + if (vgic_has_pending_irq(vcpu->cookie)) + break; + + if (vcpu_should_yield(vcpu)) + break; + + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); + /* + * XXX msleep_spin() cannot be interrupted by signals so + * wake up periodically to check pending signals. + */ + msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz); + vcpu_require_state_locked(vcpu, VCPU_FROZEN); + } + vcpu_unlock(vcpu); + + *retu = false; + return (0); +} + +static int +vm_handle_paging(struct vcpu *vcpu, bool *retu) +{ + struct vm *vm = vcpu->vm; + struct vm_exit *vme; + struct vm_map *map; + uint64_t addr, esr; + pmap_t pmap; + int ftype, rv; + + vme = &vcpu->exitinfo; + + pmap = vmspace_pmap(vcpu->vm->vmspace); + addr = vme->u.paging.gpa; + esr = vme->u.paging.esr; + + /* The page exists, but the page table needs to be updated. 
*/ + if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS) + return (0); + + switch (ESR_ELx_EXCEPTION(esr)) { + case EXCP_INSN_ABORT_L: + case EXCP_DATA_ABORT_L: + ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE; + break; + default: + panic("%s: Invalid exception (esr = %lx)", __func__, esr); + } + + map = &vm->vmspace->vm_map; + rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); + if (rv != KERN_SUCCESS) + return (EFAULT); + + return (0); +} + +int +vm_run(struct vcpu *vcpu) +{ + struct vm *vm = vcpu->vm; + struct vm_eventinfo evinfo; + int error, vcpuid; + struct vm_exit *vme; + bool retu; + pmap_t pmap; + + vcpuid = vcpu->vcpuid; + + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) + return (EINVAL); + + pmap = vmspace_pmap(vm->vmspace); + vme = &vcpu->exitinfo; + evinfo.rptr = NULL; + evinfo.sptr = &vm->suspend; + evinfo.iptr = NULL; +restart: + critical_enter(); + + restore_guest_fpustate(vcpu); + + vcpu_require_state(vcpu, VCPU_RUNNING); + error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo); + vcpu_require_state(vcpu, VCPU_FROZEN); + + save_guest_fpustate(vcpu); + + critical_exit(); + + if (error == 0) { + retu = false; + switch (vme->exitcode) { + case VM_EXITCODE_INST_EMUL: + vcpu->nextpc = vme->pc + vme->inst_length; + error = vm_handle_inst_emul(vcpu, &retu); + break; + + case VM_EXITCODE_REG_EMUL: + vcpu->nextpc = vme->pc + vme->inst_length; + error = vm_handle_reg_emul(vcpu, &retu); + break; + + case VM_EXITCODE_HVC: + /* + * The HVC instruction saves the address for the + * next instruction as the return address. + */ + vcpu->nextpc = vme->pc; + /* + * The PSCI call can change the exit information in the + * case of suspend/reset/poweroff/cpu off/cpu on. + */ + error = vm_handle_smccc_call(vcpu, vme, &retu); + break; + + case VM_EXITCODE_WFI: + vcpu->nextpc = vme->pc + vme->inst_length; + error = vm_handle_wfi(vcpu, vme, &retu); + break; + + case VM_EXITCODE_PAGING: + vcpu->nextpc = vme->pc; + error = vm_handle_paging(vcpu, &retu); + break; + + default: + /* Handle in userland */ + vcpu->nextpc = vme->pc; + retu = true; + break; + } + } + + if (error == 0 && retu == false) + goto restart; + + return (error); +} diff --git a/sys/arm64/vmm/vmm_arm64.c b/sys/arm64/vmm/vmm_arm64.c new file mode 100644 index 000000000000..e71761f9ccef --- /dev/null +++ b/sys/arm64/vmm/vmm_arm64.c @@ -0,0 +1,1337 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/vmem.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_param.h> + +#include <machine/armreg.h> +#include <machine/vm.h> +#include <machine/cpufunc.h> +#include <machine/cpu.h> +#include <machine/machdep.h> +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/atomic.h> +#include <machine/hypervisor.h> +#include <machine/pmap.h> + +#include "mmu.h" +#include "arm64.h" +#include "hyp.h" +#include "reset.h" +#include "io/vgic.h" +#include "io/vgic_v3.h" +#include "io/vtimer.h" +#include "vmm_stat.h" + +#define HANDLED 1 +#define UNHANDLED 0 + +/* Number of bits in an EL2 virtual address */ +#define EL2_VIRT_BITS 48 +CTASSERT((1ul << EL2_VIRT_BITS) >= HYP_VM_MAX_ADDRESS); + +/* TODO: Move the host hypctx off the stack */ +#define VMM_STACK_PAGES 4 +#define VMM_STACK_SIZE (VMM_STACK_PAGES * PAGE_SIZE) + +static int vmm_pmap_levels, vmm_virt_bits, vmm_max_ipa_bits; + +/* Register values passed to arm_setup_vectors to set in the hypervisor */ +struct vmm_init_regs { + uint64_t tcr_el2; + uint64_t vtcr_el2; +}; + +MALLOC_DEFINE(M_HYP, "ARM VMM HYP", "ARM VMM HYP"); + +extern char hyp_init_vectors[]; +extern char hyp_vectors[]; +extern char hyp_stub_vectors[]; + +static vm_paddr_t hyp_code_base; +static size_t hyp_code_len; + +static char *stack[MAXCPU]; +static vm_offset_t stack_hyp_va[MAXCPU]; + +static vmem_t *el2_mem_alloc; + +static void arm_setup_vectors(void *arg); +static void vmm_pmap_clean_stage2_tlbi(void); +static void vmm_pmap_invalidate_range(uint64_t, vm_offset_t, vm_offset_t, bool); +static void vmm_pmap_invalidate_all(uint64_t); + +DPCPU_DEFINE_STATIC(struct hypctx *, vcpu); + +static inline void +arm64_set_active_vcpu(struct hypctx *hypctx) +{ + DPCPU_SET(vcpu, hypctx); +} + +struct hypctx * +arm64_get_active_vcpu(void) +{ + return (DPCPU_GET(vcpu)); +} + +static void +arm_setup_vectors(void *arg) +{ + struct vmm_init_regs *el2_regs; + uintptr_t stack_top; + uint32_t sctlr_el2; + register_t daif; + + el2_regs = arg; + arm64_set_active_vcpu(NULL); + + daif = intr_disable(); + + /* + * Install the temporary vectors which will be responsible for + * initializing the VMM when we next trap into EL2. + * + * x0: the exception vector table responsible for hypervisor + * initialization on the next call. 
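+ * The vector table is passed by its physical address (via vtophys()) + * because the EL2 MMU has not been enabled yet.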
+ */ + vmm_call_hyp(vtophys(&vmm_hyp_code)); + + /* Create and map the hypervisor stack */ + stack_top = stack_hyp_va[PCPU_GET(cpuid)] + VMM_STACK_SIZE; + + /* + * Configure the system control register for EL2: + * + * SCTLR_EL2_M: MMU on + * SCTLR_EL2_C: Data cacheability not affected + * SCTLR_EL2_I: Instruction cacheability not affected + * SCTLR_EL2_A: Instruction alignment check + * SCTLR_EL2_SA: Stack pointer alignment check + * SCTLR_EL2_WXN: Treat writable memory as execute never + * ~SCTLR_EL2_EE: Data accesses are little-endian + */ + sctlr_el2 = SCTLR_EL2_RES1; + sctlr_el2 |= SCTLR_EL2_M | SCTLR_EL2_C | SCTLR_EL2_I; + sctlr_el2 |= SCTLR_EL2_A | SCTLR_EL2_SA; + sctlr_el2 |= SCTLR_EL2_WXN; + sctlr_el2 &= ~SCTLR_EL2_EE; + + /* Special call to initialize EL2 */ + vmm_call_hyp(vmmpmap_to_ttbr0(), stack_top, el2_regs->tcr_el2, + sctlr_el2, el2_regs->vtcr_el2); + + intr_restore(daif); +} + +static void +arm_teardown_vectors(void *arg) +{ + register_t daif; + + /* + * vmm_cleanup() will disable the MMU. For the next few instructions, + * before the hardware disables the MMU, one of the following is + * possible: + * + * a. The instruction addresses are fetched with the MMU disabled, + * and they must represent the actual physical addresses. This will work + * because we call the vmm_cleanup() function by its physical address. + * + * b. The instruction addresses are fetched using the old translation + * tables. This will work because we have an identity mapping in place + * in the translation tables and vmm_cleanup() is called by its physical + * address. + */ + daif = intr_disable(); + /* TODO: Invalidate the cache */ + vmm_call_hyp(HYP_CLEANUP, vtophys(hyp_stub_vectors)); + intr_restore(daif); + + arm64_set_active_vcpu(NULL); +} + +static uint64_t +vmm_vtcr_el2_sl(u_int levels) +{ +#if PAGE_SIZE == PAGE_SIZE_4K + switch (levels) { + case 2: + return (VTCR_EL2_SL0_4K_LVL2); + case 3: + return (VTCR_EL2_SL0_4K_LVL1); + case 4: + return (VTCR_EL2_SL0_4K_LVL0); + default: + panic("%s: Invalid number of page table levels %u", __func__, + levels); + } +#elif PAGE_SIZE == PAGE_SIZE_16K + switch (levels) { + case 2: + return (VTCR_EL2_SL0_16K_LVL2); + case 3: + return (VTCR_EL2_SL0_16K_LVL1); + case 4: + return (VTCR_EL2_SL0_16K_LVL0); + default: + panic("%s: Invalid number of page table levels %u", __func__, + levels); + } +#else +#error Unsupported page size +#endif +} + +int +vmmops_modinit(int ipinum) +{ + struct vmm_init_regs el2_regs; + vm_offset_t next_hyp_va; + vm_paddr_t vmm_base; + uint64_t id_aa64mmfr0_el1, pa_range_bits, pa_range_field; + uint64_t cnthctl_el2; + register_t daif; + int cpu, i; + bool rv __diagused; + + if (!virt_enabled()) { + printf( + "vmm: Processor doesn't have support for virtualization\n"); + return (ENXIO); + } + + /* TODO: Support VHE */ + if (in_vhe()) { + printf("vmm: VHE is unsupported\n"); + return (ENXIO); + } + + if (!vgic_present()) { + printf("vmm: No vgic found\n"); + return (ENODEV); + } + + if (!get_kernel_reg(ID_AA64MMFR0_EL1, &id_aa64mmfr0_el1)) { + printf("vmm: Unable to read ID_AA64MMFR0_EL1\n"); + return (ENXIO); + } + pa_range_field = ID_AA64MMFR0_PARange_VAL(id_aa64mmfr0_el1); + /* + * Use 3 levels to give us up to 39 bits with 4k pages, or + * 47 bits with 16k pages. 
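+ * (Each level resolves granule_shift - 3 bits of the address, so three + * levels give 3 * 9 + 12 = 39 bits with 4k pages and 3 * 11 + 14 = 47 + * bits with 16k pages.)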
+ */ + /* TODO: Check the number of levels for 64k pages */ + vmm_pmap_levels = 3; + switch (pa_range_field) { + case ID_AA64MMFR0_PARange_4G: + printf("vmm: Not enough physical address bits\n"); + return (ENXIO); + case ID_AA64MMFR0_PARange_64G: + vmm_virt_bits = 36; +#if PAGE_SIZE == PAGE_SIZE_16K + vmm_pmap_levels = 2; +#endif + break; + default: + vmm_virt_bits = 39; + break; + } + pa_range_bits = pa_range_field >> ID_AA64MMFR0_PARange_SHIFT; + + /* Initialise the EL2 MMU */ + if (!vmmpmap_init()) { + printf("vmm: Failed to init the EL2 MMU\n"); + return (ENOMEM); + } + + /* Set up the stage 2 pmap callbacks */ + MPASS(pmap_clean_stage2_tlbi == NULL); + pmap_clean_stage2_tlbi = vmm_pmap_clean_stage2_tlbi; + pmap_stage2_invalidate_range = vmm_pmap_invalidate_range; + pmap_stage2_invalidate_all = vmm_pmap_invalidate_all; + + /* + * Create an allocator for the virtual address space used by EL2. + * EL2 code is identity-mapped; the allocator is used to find space for + * VM structures. + */ + el2_mem_alloc = vmem_create("VMM EL2", 0, 0, PAGE_SIZE, 0, M_WAITOK); + + /* Create the mappings for the hypervisor translation table. */ + hyp_code_len = round_page(&vmm_hyp_code_end - &vmm_hyp_code); + + /* We need a physical identity mapping for when we activate the MMU */ + hyp_code_base = vmm_base = vtophys(&vmm_hyp_code); + rv = vmmpmap_enter(vmm_base, hyp_code_len, vmm_base, + VM_PROT_READ | VM_PROT_EXECUTE); + MPASS(rv); + + next_hyp_va = roundup2(vmm_base + hyp_code_len, L2_SIZE); + + /* Create a per-CPU hypervisor stack */ + CPU_FOREACH(cpu) { + stack[cpu] = malloc(VMM_STACK_SIZE, M_HYP, M_WAITOK | M_ZERO); + stack_hyp_va[cpu] = next_hyp_va; + + for (i = 0; i < VMM_STACK_PAGES; i++) { + rv = vmmpmap_enter(stack_hyp_va[cpu] + ptoa(i), + PAGE_SIZE, vtophys(stack[cpu] + ptoa(i)), + VM_PROT_READ | VM_PROT_WRITE); + MPASS(rv); + } + next_hyp_va += L2_SIZE; + } + + el2_regs.tcr_el2 = TCR_EL2_RES1; + el2_regs.tcr_el2 |= min(pa_range_bits << TCR_EL2_PS_SHIFT, + TCR_EL2_PS_52BITS); + el2_regs.tcr_el2 |= TCR_EL2_T0SZ(64 - EL2_VIRT_BITS); + el2_regs.tcr_el2 |= TCR_EL2_IRGN0_WBWA | TCR_EL2_ORGN0_WBWA; +#if PAGE_SIZE == PAGE_SIZE_4K + el2_regs.tcr_el2 |= TCR_EL2_TG0_4K; +#elif PAGE_SIZE == PAGE_SIZE_16K + el2_regs.tcr_el2 |= TCR_EL2_TG0_16K; +#else +#error Unsupported page size +#endif +#ifdef SMP + el2_regs.tcr_el2 |= TCR_EL2_SH0_IS; +#endif + + switch (el2_regs.tcr_el2 & TCR_EL2_PS_MASK) { + case TCR_EL2_PS_32BITS: + vmm_max_ipa_bits = 32; + break; + case TCR_EL2_PS_36BITS: + vmm_max_ipa_bits = 36; + break; + case TCR_EL2_PS_40BITS: + vmm_max_ipa_bits = 40; + break; + case TCR_EL2_PS_42BITS: + vmm_max_ipa_bits = 42; + break; + case TCR_EL2_PS_44BITS: + vmm_max_ipa_bits = 44; + break; + case TCR_EL2_PS_48BITS: + vmm_max_ipa_bits = 48; + break; + case TCR_EL2_PS_52BITS: + default: + vmm_max_ipa_bits = 52; + break; + } + + /* + * Configure the Stage 2 translation control register: + * + * VTCR_IRGN0_WBWA: Translation table walks access inner cacheable + * normal memory + * VTCR_ORGN0_WBWA: Translation table walks access outer cacheable + * normal memory + * VTCR_EL2_TG0_4K/16K: Stage 2 uses the same page size as the kernel + * VTCR_EL2_SL0_4K_LVL1: Stage 2 uses concatenated level 1 tables + * VTCR_EL2_SH0_IS: Memory associated with Stage 2 walks is inner + * shareable + */ + el2_regs.vtcr_el2 = VTCR_EL2_RES1; + el2_regs.vtcr_el2 |= + min(pa_range_bits << VTCR_EL2_PS_SHIFT, VTCR_EL2_PS_48BIT); + el2_regs.vtcr_el2 |= VTCR_EL2_IRGN0_WBWA | VTCR_EL2_ORGN0_WBWA; + el2_regs.vtcr_el2 |= VTCR_EL2_T0SZ(64 - 
vmm_virt_bits); + el2_regs.vtcr_el2 |= vmm_vtcr_el2_sl(vmm_pmap_levels); +#if PAGE_SIZE == PAGE_SIZE_4K + el2_regs.vtcr_el2 |= VTCR_EL2_TG0_4K; +#elif PAGE_SIZE == PAGE_SIZE_16K + el2_regs.vtcr_el2 |= VTCR_EL2_TG0_16K; +#else +#error Unsupported page size +#endif +#ifdef SMP + el2_regs.vtcr_el2 |= VTCR_EL2_SH0_IS; +#endif + + smp_rendezvous(NULL, arm_setup_vectors, NULL, &el2_regs); + + /* Add memory to the vmem allocator (checking there is space) */ + if (vmm_base > (L2_SIZE + PAGE_SIZE)) { + /* + * Ensure there is an L2 block before the vmm code to check + * for buffer overflows on earlier data. Include the PAGE_SIZE + * of the minimum we can allocate. + */ + vmm_base -= L2_SIZE + PAGE_SIZE; + vmm_base = rounddown2(vmm_base, L2_SIZE); + + /* + * Check there is memory before the vmm code to add. + * + * Reserve the L2 block at address 0 so NULL dereference will + * raise an exception. + */ + if (vmm_base > L2_SIZE) + vmem_add(el2_mem_alloc, L2_SIZE, vmm_base - L2_SIZE, + M_WAITOK); + } + + /* + * Add the memory after the stacks. There is most of an L2 block + * between the last stack and the first allocation so this should + * be safe without adding more padding. + */ + if (next_hyp_va < HYP_VM_MAX_ADDRESS - PAGE_SIZE) + vmem_add(el2_mem_alloc, next_hyp_va, + HYP_VM_MAX_ADDRESS - next_hyp_va, M_WAITOK); + + daif = intr_disable(); + cnthctl_el2 = vmm_call_hyp(HYP_READ_REGISTER, HYP_REG_CNTHCTL); + intr_restore(daif); + + vgic_init(); + vtimer_init(cnthctl_el2); + + return (0); +} + +int +vmmops_modcleanup(void) +{ + int cpu; + + smp_rendezvous(NULL, arm_teardown_vectors, NULL, NULL); + + CPU_FOREACH(cpu) { + vmmpmap_remove(stack_hyp_va[cpu], VMM_STACK_PAGES * PAGE_SIZE, + false); + } + + vmmpmap_remove(hyp_code_base, hyp_code_len, false); + + vtimer_cleanup(); + + vmmpmap_fini(); + + CPU_FOREACH(cpu) + free(stack[cpu], M_HYP); + + pmap_clean_stage2_tlbi = NULL; + pmap_stage2_invalidate_range = NULL; + pmap_stage2_invalidate_all = NULL; + + return (0); +} + +static vm_size_t +el2_hyp_size(struct vm *vm) +{ + return (round_page(sizeof(struct hyp) + + sizeof(struct hypctx *) * vm_get_maxcpus(vm))); +} + +static vm_size_t +el2_hypctx_size(void) +{ + return (round_page(sizeof(struct hypctx))); +} + +static vm_offset_t +el2_map_enter(vm_offset_t data, vm_size_t size, vm_prot_t prot) +{ + vmem_addr_t addr; + int err __diagused; + bool rv __diagused; + + err = vmem_alloc(el2_mem_alloc, size, M_NEXTFIT | M_WAITOK, &addr); + MPASS(err == 0); + rv = vmmpmap_enter(addr, size, vtophys(data), prot); + MPASS(rv); + + return (addr); +} + +void * +vmmops_init(struct vm *vm, pmap_t pmap) +{ + struct hyp *hyp; + vm_size_t size; + + size = el2_hyp_size(vm); + hyp = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO); + + hyp->vm = vm; + hyp->vgic_attached = false; + + vtimer_vminit(hyp); + vgic_vminit(hyp); + + hyp->el2_addr = el2_map_enter((vm_offset_t)hyp, size, + VM_PROT_READ | VM_PROT_WRITE); + + return (hyp); +} + +void * +vmmops_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid) +{ + struct hyp *hyp = vmi; + struct hypctx *hypctx; + vm_size_t size; + + size = el2_hypctx_size(); + hypctx = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO); + + KASSERT(vcpuid >= 0 && vcpuid < vm_get_maxcpus(hyp->vm), + ("%s: Invalid vcpuid %d", __func__, vcpuid)); + hyp->ctx[vcpuid] = hypctx; + + hypctx->hyp = hyp; + hypctx->vcpu = vcpu1; + + reset_vm_el01_regs(hypctx); + reset_vm_el2_regs(hypctx); + + vtimer_cpuinit(hypctx); + vgic_cpuinit(hypctx); + + hypctx->el2_addr = el2_map_enter((vm_offset_t)hypctx, 
size, + VM_PROT_READ | VM_PROT_WRITE); + + return (hypctx); +} + +static int +arm_vmm_pinit(pmap_t pmap) +{ + + pmap_pinit_stage(pmap, PM_STAGE2, vmm_pmap_levels); + return (1); +} + +struct vmspace * +vmmops_vmspace_alloc(vm_offset_t min, vm_offset_t max) +{ + return (vmspace_alloc(min, max, arm_vmm_pinit)); +} + +void +vmmops_vmspace_free(struct vmspace *vmspace) +{ + + pmap_remove_pages(vmspace_pmap(vmspace)); + vmspace_free(vmspace); +} + +static void +vmm_pmap_clean_stage2_tlbi(void) +{ + vmm_call_hyp(HYP_CLEAN_S2_TLBI); +} + +static void +vmm_pmap_invalidate_range(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, + bool final_only) +{ + MPASS(eva > sva); + vmm_call_hyp(HYP_S2_TLBI_RANGE, vttbr, sva, eva, final_only); +} + +static void +vmm_pmap_invalidate_all(uint64_t vttbr) +{ + vmm_call_hyp(HYP_S2_TLBI_ALL, vttbr); +} + +static inline void +arm64_print_hyp_regs(struct vm_exit *vme) +{ + printf("esr_el2: 0x%016lx\n", vme->u.hyp.esr_el2); + printf("far_el2: 0x%016lx\n", vme->u.hyp.far_el2); + printf("hpfar_el2: 0x%016lx\n", vme->u.hyp.hpfar_el2); + printf("elr_el2: 0x%016lx\n", vme->pc); +} + +static void +arm64_gen_inst_emul_data(struct hypctx *hypctx, uint32_t esr_iss, + struct vm_exit *vme_ret) +{ + struct vm_guest_paging *paging; + struct vie *vie; + uint32_t esr_sas, reg_num; + + /* + * Get the page address from HPFAR_EL2. + */ + vme_ret->u.inst_emul.gpa = + HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2); + /* Bits [11:0] are the same as bits [11:0] from the virtual address. */ + vme_ret->u.inst_emul.gpa += hypctx->exit_info.far_el2 & + FAR_EL2_HPFAR_PAGE_MASK; + + esr_sas = (esr_iss & ISS_DATA_SAS_MASK) >> ISS_DATA_SAS_SHIFT; + reg_num = (esr_iss & ISS_DATA_SRT_MASK) >> ISS_DATA_SRT_SHIFT; + + vie = &vme_ret->u.inst_emul.vie; + vie->access_size = 1 << esr_sas; + vie->sign_extend = (esr_iss & ISS_DATA_SSE) ? 1 : 0; + vie->dir = (esr_iss & ISS_DATA_WnR) ? VM_DIR_WRITE : VM_DIR_READ; + vie->reg = reg_num; + + paging = &vme_ret->u.inst_emul.paging; + paging->ttbr0_addr = hypctx->ttbr0_el1 & ~(TTBR_ASID_MASK | TTBR_CnP); + paging->ttbr1_addr = hypctx->ttbr1_el1 & ~(TTBR_ASID_MASK | TTBR_CnP); + paging->tcr_el1 = hypctx->tcr_el1; + paging->tcr2_el1 = hypctx->tcr2_el1; + paging->flags = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32); + if ((hypctx->sctlr_el1 & SCTLR_M) != 0) + paging->flags |= VM_GP_MMU_ENABLED; +} + +static void +arm64_gen_reg_emul_data(uint32_t esr_iss, struct vm_exit *vme_ret) +{ + uint32_t reg_num; + struct vre *vre; + + /* u.hyp member will be replaced by u.reg_emul */ + vre = &vme_ret->u.reg_emul.vre; + + vre->inst_syndrome = esr_iss; + /* ARMv8 Architecture Manual, p. D7-2273: 1 means read */ + vre->dir = (esr_iss & ISS_MSR_DIR) ? 
VM_DIR_READ : VM_DIR_WRITE; + reg_num = ISS_MSR_Rt(esr_iss); + vre->reg = reg_num; +} + +void +raise_data_insn_abort(struct hypctx *hypctx, uint64_t far, bool dabort, int fsc) +{ + uint64_t esr; + + if ((hypctx->tf.tf_spsr & PSR_M_MASK) == PSR_M_EL0t) + esr = EXCP_INSN_ABORT_L << ESR_ELx_EC_SHIFT; + else + esr = EXCP_INSN_ABORT << ESR_ELx_EC_SHIFT; + /* Set the bit that changes from insn -> data abort */ + if (dabort) + esr |= EXCP_DATA_ABORT_L << ESR_ELx_EC_SHIFT; + /* Set the IL bit if set by hardware */ + esr |= hypctx->tf.tf_esr & ESR_ELx_IL; + + vmmops_exception(hypctx, esr | fsc, far); +} + +static int +handle_el1_sync_excp(struct hypctx *hypctx, struct vm_exit *vme_ret, + pmap_t pmap) +{ + uint64_t gpa; + uint32_t esr_ec, esr_iss; + + esr_ec = ESR_ELx_EXCEPTION(hypctx->tf.tf_esr); + esr_iss = hypctx->tf.tf_esr & ESR_ELx_ISS_MASK; + + switch (esr_ec) { + case EXCP_UNKNOWN: + vmm_stat_incr(hypctx->vcpu, VMEXIT_UNKNOWN, 1); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + case EXCP_TRAP_WFI_WFE: + if ((hypctx->tf.tf_esr & 0x3) == 0) { /* WFI */ + vmm_stat_incr(hypctx->vcpu, VMEXIT_WFI, 1); + vme_ret->exitcode = VM_EXITCODE_WFI; + } else { + vmm_stat_incr(hypctx->vcpu, VMEXIT_WFE, 1); + vme_ret->exitcode = VM_EXITCODE_HYP; + } + break; + case EXCP_HVC: + vmm_stat_incr(hypctx->vcpu, VMEXIT_HVC, 1); + vme_ret->exitcode = VM_EXITCODE_HVC; + break; + case EXCP_MSR: + vmm_stat_incr(hypctx->vcpu, VMEXIT_MSR, 1); + arm64_gen_reg_emul_data(esr_iss, vme_ret); + vme_ret->exitcode = VM_EXITCODE_REG_EMUL; + break; + + case EXCP_INSN_ABORT_L: + case EXCP_DATA_ABORT_L: + vmm_stat_incr(hypctx->vcpu, esr_ec == EXCP_DATA_ABORT_L ? + VMEXIT_DATA_ABORT : VMEXIT_INSN_ABORT, 1); + switch (hypctx->tf.tf_esr & ISS_DATA_DFSC_MASK) { + case ISS_DATA_DFSC_TF_L0: + case ISS_DATA_DFSC_TF_L1: + case ISS_DATA_DFSC_TF_L2: + case ISS_DATA_DFSC_TF_L3: + case ISS_DATA_DFSC_AFF_L1: + case ISS_DATA_DFSC_AFF_L2: + case ISS_DATA_DFSC_AFF_L3: + case ISS_DATA_DFSC_PF_L1: + case ISS_DATA_DFSC_PF_L2: + case ISS_DATA_DFSC_PF_L3: + gpa = HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2); + /* Check the IPA is valid */ + if (gpa >= (1ul << vmm_max_ipa_bits)) { + raise_data_insn_abort(hypctx, + hypctx->exit_info.far_el2, + esr_ec == EXCP_DATA_ABORT_L, + ISS_DATA_DFSC_ASF_L0); + vme_ret->inst_length = 0; + return (HANDLED); + } + + if (vm_mem_allocated(hypctx->vcpu, gpa)) { + vme_ret->exitcode = VM_EXITCODE_PAGING; + vme_ret->inst_length = 0; + vme_ret->u.paging.esr = hypctx->tf.tf_esr; + vme_ret->u.paging.gpa = gpa; + } else if (esr_ec == EXCP_INSN_ABORT_L) { + /* + * Raise an external abort. Device memory is + * not executable + */ + raise_data_insn_abort(hypctx, + hypctx->exit_info.far_el2, false, + ISS_DATA_DFSC_EXT); + vme_ret->inst_length = 0; + return (HANDLED); + } else { + arm64_gen_inst_emul_data(hypctx, esr_iss, + vme_ret); + vme_ret->exitcode = VM_EXITCODE_INST_EMUL; + } + break; + default: + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + } + + break; + + default: + vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_SYNC, 1); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + } + + /* We don't do any instruction emulation here */ + return (UNHANDLED); +} + +static int +arm64_handle_world_switch(struct hypctx *hypctx, int excp_type, + struct vm_exit *vme, pmap_t pmap) +{ + int handled; + + switch (excp_type) { + case EXCP_TYPE_EL1_SYNC: + /* The exit code will be set by handle_el1_sync_excp(). 
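+ * Returning HANDLED resumes the guest directly; UNHANDLED breaks + * out of the run loop so vm_run() can process the exit.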
*/ + handled = handle_el1_sync_excp(hypctx, vme, pmap); + break; + + case EXCP_TYPE_EL1_IRQ: + case EXCP_TYPE_EL1_FIQ: + /* The host kernel will handle IRQs and FIQs. */ + vmm_stat_incr(hypctx->vcpu, + excp_type == EXCP_TYPE_EL1_IRQ ? VMEXIT_IRQ : VMEXIT_FIQ,1); + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + + case EXCP_TYPE_EL1_ERROR: + case EXCP_TYPE_EL2_SYNC: + case EXCP_TYPE_EL2_IRQ: + case EXCP_TYPE_EL2_FIQ: + case EXCP_TYPE_EL2_ERROR: + vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_EL2, 1); + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + + default: + vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED, 1); + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + } + + return (handled); +} + +static void +ptp_release(void **cookie) +{ + if (*cookie != NULL) { + vm_gpa_release(*cookie); + *cookie = NULL; + } +} + +static void * +ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) +{ + void *ptr; + + ptp_release(cookie); + ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie); + return (ptr); +} + +/* log2 of the number of bytes in a page table entry */ +#define PTE_SHIFT 3 +int +vmmops_gla2gpa(void *vcpui, struct vm_guest_paging *paging, uint64_t gla, + int prot, uint64_t *gpa, int *is_fault) +{ + struct hypctx *hypctx; + void *cookie; + uint64_t mask, *ptep, pte, pte_addr; + int address_bits, granule_shift, ia_bits, levels, pte_shift, tsz; + bool is_el0; + + /* Check if the MMU is off */ + if ((paging->flags & VM_GP_MMU_ENABLED) == 0) { + *is_fault = 0; + *gpa = gla; + return (0); + } + + is_el0 = (paging->flags & PSR_M_MASK) == PSR_M_EL0t; + + if (ADDR_IS_KERNEL(gla)) { + /* If address translation is disabled raise an exception */ + if ((paging->tcr_el1 & TCR_EPD1) != 0) { + *is_fault = 1; + return (0); + } + if (is_el0 && (paging->tcr_el1 & TCR_E0PD1) != 0) { + *is_fault = 1; + return (0); + } + pte_addr = paging->ttbr1_addr; + tsz = (paging->tcr_el1 & TCR_T1SZ_MASK) >> TCR_T1SZ_SHIFT; + /* Clear the top byte if TBI is on */ + if ((paging->tcr_el1 & TCR_TBI1) != 0) + gla |= (0xfful << 56); + switch (paging->tcr_el1 & TCR_TG1_MASK) { + case TCR_TG1_4K: + granule_shift = PAGE_SHIFT_4K; + break; + case TCR_TG1_16K: + granule_shift = PAGE_SHIFT_16K; + break; + case TCR_TG1_64K: + granule_shift = PAGE_SHIFT_64K; + break; + default: + *is_fault = 1; + return (EINVAL); + } + } else { + /* If address translation is disabled raise an exception */ + if ((paging->tcr_el1 & TCR_EPD0) != 0) { + *is_fault = 1; + return (0); + } + if (is_el0 && (paging->tcr_el1 & TCR_E0PD0) != 0) { + *is_fault = 1; + return (0); + } + pte_addr = paging->ttbr0_addr; + tsz = (paging->tcr_el1 & TCR_T0SZ_MASK) >> TCR_T0SZ_SHIFT; + /* Clear the top byte if TBI is on */ + if ((paging->tcr_el1 & TCR_TBI0) != 0) + gla &= ~(0xfful << 56); + switch (paging->tcr_el1 & TCR_TG0_MASK) { + case TCR_TG0_4K: + granule_shift = PAGE_SHIFT_4K; + break; + case TCR_TG0_16K: + granule_shift = PAGE_SHIFT_16K; + break; + case TCR_TG0_64K: + granule_shift = PAGE_SHIFT_64K; + break; + default: + *is_fault = 1; + return (EINVAL); + } + } + + /* + * TODO: Support FEAT_TTST for smaller tsz values and FEAT_LPA2 + * for larger values. 
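+ * (FEAT_TTST raises the upper limit on tsz and FEAT_LPA2 lowers the + * lower limit; see the tsz range checks below.)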
+ */ + switch (granule_shift) { + case PAGE_SHIFT_4K: + case PAGE_SHIFT_16K: + /* + * See "Table D8-11 4KB granule, determining stage 1 initial + * lookup level" and "Table D8-21 16KB granule, determining + * stage 1 initial lookup level" from the "Arm Architecture + * Reference Manual for A-Profile architecture" revision I.a + * for the minimum and maximum values. + * + * TODO: Support less than 16 when FEAT_LPA2 is implemented + * and TCR_EL1.DS == 1 + * TODO: Support more than 39 when FEAT_TTST is implemented + */ + if (tsz < 16 || tsz > 39) { + *is_fault = 1; + return (EINVAL); + } + break; + case PAGE_SHIFT_64K: + /* TODO: Support 64k granule. It will probably work, but is untested */ + default: + *is_fault = 1; + return (EINVAL); + } + + /* + * Calculate the input address bits. These are 64 bit in an address + * with the top tsz bits being all 0 or all 1. + */ + ia_bits = 64 - tsz; + + /* + * Calculate the number of address bits used in the page table + * calculation. This is ia_bits minus the bottom granule_shift + * bits that are passed to the output address. + */ + address_bits = ia_bits - granule_shift; + + /* + * Calculate the number of levels. Each level uses + * granule_shift - PTE_SHIFT bits of the input address. + * This is because the table is 1 << granule_shift and each + * entry is 1 << PTE_SHIFT bytes. + */ + levels = howmany(address_bits, granule_shift - PTE_SHIFT); + + /* Mask of the upper unused bits in the virtual address */ + gla &= (1ul << ia_bits) - 1; + hypctx = (struct hypctx *)vcpui; + cookie = NULL; + /* TODO: Check if the level supports block descriptors */ + for (;levels > 0; levels--) { + int idx; + + pte_shift = (levels - 1) * (granule_shift - PTE_SHIFT) + + granule_shift; + idx = (gla >> pte_shift) & + ((1ul << (granule_shift - PTE_SHIFT)) - 1); + while (idx > PAGE_SIZE / sizeof(pte)) { + idx -= PAGE_SIZE / sizeof(pte); + pte_addr += PAGE_SIZE; + } + + ptep = ptp_hold(hypctx->vcpu, pte_addr, PAGE_SIZE, &cookie); + if (ptep == NULL) + goto error; + pte = ptep[idx]; + + /* Calculate the level we are looking at */ + switch (levels) { + default: + goto fault; + /* TODO: Level -1 when FEAT_LPA2 is implemented */ + case 4: /* Level 0 */ + if ((pte & ATTR_DESCR_MASK) != L0_TABLE) + goto fault; + /* FALLTHROUGH */ + case 3: /* Level 1 */ + case 2: /* Level 2 */ + switch (pte & ATTR_DESCR_MASK) { + /* Use L1 macro as all levels are the same */ + case L1_TABLE: + /* Check if EL0 can access this address space */ + if (is_el0 && + (pte & TATTR_AP_TABLE_NO_EL0) != 0) + goto fault; + /* Check if the address space is writable */ + if ((prot & PROT_WRITE) != 0 && + (pte & TATTR_AP_TABLE_RO) != 0) + goto fault; + if ((prot & PROT_EXEC) != 0) { + /* Check the table exec attribute */ + if ((is_el0 && + (pte & TATTR_UXN_TABLE) != 0) || + (!is_el0 && + (pte & TATTR_PXN_TABLE) != 0)) + goto fault; + } + pte_addr = pte & ~ATTR_MASK; + break; + case L1_BLOCK: + goto done; + default: + goto fault; + } + break; + case 1: /* Level 3 */ + if ((pte & ATTR_DESCR_MASK) == L3_PAGE) + goto done; + goto fault; + } + } + +done: + /* Check if EL0 has access to the block/page */ + if (is_el0 && (pte & ATTR_S1_AP(ATTR_S1_AP_USER)) == 0) + goto fault; + if ((prot & PROT_WRITE) != 0 && (pte & ATTR_S1_AP_RW_BIT) != 0) + goto fault; + if ((prot & PROT_EXEC) != 0) { + if ((is_el0 && (pte & ATTR_S1_UXN) != 0) || + (!is_el0 && (pte & ATTR_S1_PXN) != 0)) + goto fault; + } + mask = (1ul << pte_shift) - 1; + *gpa = (pte & ~ATTR_MASK) | (gla & mask); + *is_fault = 0; + ptp_release(&cookie); + return 
(0); + +error: + ptp_release(&cookie); + return (EFAULT); +fault: + *is_fault = 1; + ptp_release(&cookie); + return (0); +} + +int +vmmops_run(void *vcpui, register_t pc, pmap_t pmap, struct vm_eventinfo *evinfo) +{ + uint64_t excp_type; + int handled; + register_t daif; + struct hyp *hyp; + struct hypctx *hypctx; + struct vcpu *vcpu; + struct vm_exit *vme; + int mode; + + hypctx = (struct hypctx *)vcpui; + hyp = hypctx->hyp; + vcpu = hypctx->vcpu; + vme = vm_exitinfo(vcpu); + + hypctx->tf.tf_elr = (uint64_t)pc; + + for (;;) { + if (hypctx->has_exception) { + hypctx->has_exception = false; + hypctx->elr_el1 = hypctx->tf.tf_elr; + + mode = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32); + + if (mode == PSR_M_EL1t) { + hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x0; + } else if (mode == PSR_M_EL1h) { + hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x200; + } else if ((mode & PSR_M_32) == PSR_M_64) { + /* 64-bit EL0 */ + hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x400; + } else { + /* 32-bit EL0 */ + hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x600; + } + + /* Set the new spsr */ + hypctx->spsr_el1 = hypctx->tf.tf_spsr; + + /* Set the new cpsr */ + hypctx->tf.tf_spsr = hypctx->spsr_el1 & PSR_FLAGS; + hypctx->tf.tf_spsr |= PSR_DAIF | PSR_M_EL1h; + + /* + * Update fields that may change on exception entry + * based on how sctlr_el1 is configured. + */ + if ((hypctx->sctlr_el1 & SCTLR_SPAN) != 0) + hypctx->tf.tf_spsr |= PSR_PAN; + if ((hypctx->sctlr_el1 & SCTLR_DSSBS) == 0) + hypctx->tf.tf_spsr &= ~PSR_SSBS; + else + hypctx->tf.tf_spsr |= PSR_SSBS; + } + + daif = intr_disable(); + + /* Check if the vcpu is suspended */ + if (vcpu_suspended(evinfo)) { + intr_restore(daif); + vm_exit_suspended(vcpu, pc); + break; + } + + if (vcpu_debugged(vcpu)) { + intr_restore(daif); + vm_exit_debug(vcpu, pc); + break; + } + + /* Activate the stage2 pmap so the vmid is valid */ + pmap_activate_vm(pmap); + hyp->vttbr_el2 = pmap_to_ttbr0(pmap); + + /* + * TODO: What happens if a timer interrupt is asserted exactly + * here, but for the previous VM? + */ + arm64_set_active_vcpu(hypctx); + vgic_flush_hwstate(hypctx); + + /* Call into EL2 to switch to the guest */ + excp_type = vmm_call_hyp(HYP_ENTER_GUEST, + hyp->el2_addr, hypctx->el2_addr); + + vgic_sync_hwstate(hypctx); + vtimer_sync_hwstate(hypctx); + + /* + * Deactivate the stage2 pmap. vmm_pmap_clean_stage2_tlbi + * depends on this, meaning we must activate the VM pmap again + * before re-entering the guest. + */ + PCPU_SET(curvmpmap, NULL); + intr_restore(daif); + + vmm_stat_incr(vcpu, VMEXIT_COUNT, 1); + if (excp_type == EXCP_TYPE_MAINT_IRQ) + continue; + + vme->pc = hypctx->tf.tf_elr; + vme->inst_length = INSN_SIZE; + vme->u.hyp.exception_nr = excp_type; + vme->u.hyp.esr_el2 = hypctx->tf.tf_esr; + vme->u.hyp.far_el2 = hypctx->exit_info.far_el2; + vme->u.hyp.hpfar_el2 = hypctx->exit_info.hpfar_el2; + + handled = arm64_handle_world_switch(hypctx, excp_type, vme, + pmap); + if (handled == UNHANDLED) + /* Exit loop to emulate instruction. */ + break; + else + /* Resume guest execution from the next instruction. 
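+ * inst_length is zero when the guest must re-execute from the + * same address, for example after an exception has been injected.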
*/ + hypctx->tf.tf_elr += vme->inst_length; + } + + return (0); +} + +static void +arm_pcpu_vmcleanup(void *arg) +{ + struct hyp *hyp; + int i, maxcpus; + + hyp = arg; + maxcpus = vm_get_maxcpus(hyp->vm); + for (i = 0; i < maxcpus; i++) { + if (arm64_get_active_vcpu() == hyp->ctx[i]) { + arm64_set_active_vcpu(NULL); + break; + } + } +} + +void +vmmops_vcpu_cleanup(void *vcpui) +{ + struct hypctx *hypctx = vcpui; + + vtimer_cpucleanup(hypctx); + vgic_cpucleanup(hypctx); + + vmmpmap_remove(hypctx->el2_addr, el2_hypctx_size(), true); + + free(hypctx, M_HYP); +} + +void +vmmops_cleanup(void *vmi) +{ + struct hyp *hyp = vmi; + + vtimer_vmcleanup(hyp); + vgic_vmcleanup(hyp); + + smp_rendezvous(NULL, arm_pcpu_vmcleanup, NULL, hyp); + + vmmpmap_remove(hyp->el2_addr, el2_hyp_size(hyp->vm), true); + + free(hyp, M_HYP); +} + +/* + * Return register value. Registers have different sizes and an explicit cast + * must be made to ensure proper conversion. + */ +static uint64_t * +hypctx_regptr(struct hypctx *hypctx, int reg) +{ + switch (reg) { + case VM_REG_GUEST_X0 ... VM_REG_GUEST_X29: + return (&hypctx->tf.tf_x[reg]); + case VM_REG_GUEST_LR: + return (&hypctx->tf.tf_lr); + case VM_REG_GUEST_SP: + return (&hypctx->tf.tf_sp); + case VM_REG_GUEST_CPSR: + return (&hypctx->tf.tf_spsr); + case VM_REG_GUEST_PC: + return (&hypctx->tf.tf_elr); + case VM_REG_GUEST_SCTLR_EL1: + return (&hypctx->sctlr_el1); + case VM_REG_GUEST_TTBR0_EL1: + return (&hypctx->ttbr0_el1); + case VM_REG_GUEST_TTBR1_EL1: + return (&hypctx->ttbr1_el1); + case VM_REG_GUEST_TCR_EL1: + return (&hypctx->tcr_el1); + case VM_REG_GUEST_TCR2_EL1: + return (&hypctx->tcr2_el1); + default: + break; + } + return (NULL); +} + +int +vmmops_getreg(void *vcpui, int reg, uint64_t *retval) +{ + uint64_t *regp; + int running, hostcpu; + struct hypctx *hypctx = vcpui; + + running = vcpu_is_running(hypctx->vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("arm_getreg: %s%d is running", vm_name(hypctx->hyp->vm), + vcpu_vcpuid(hypctx->vcpu)); + + regp = hypctx_regptr(hypctx, reg); + if (regp == NULL) + return (EINVAL); + + *retval = *regp; + return (0); +} + +int +vmmops_setreg(void *vcpui, int reg, uint64_t val) +{ + uint64_t *regp; + struct hypctx *hypctx = vcpui; + int running, hostcpu; + + running = vcpu_is_running(hypctx->vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("arm_setreg: %s%d is running", vm_name(hypctx->hyp->vm), + vcpu_vcpuid(hypctx->vcpu)); + + regp = hypctx_regptr(hypctx, reg); + if (regp == NULL) + return (EINVAL); + + *regp = val; + return (0); +} + +int +vmmops_exception(void *vcpui, uint64_t esr, uint64_t far) +{ + struct hypctx *hypctx = vcpui; + int running, hostcpu; + + running = vcpu_is_running(hypctx->vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("%s: %s%d is running", __func__, vm_name(hypctx->hyp->vm), + vcpu_vcpuid(hypctx->vcpu)); + + hypctx->far_el1 = far; + hypctx->esr_el1 = esr; + hypctx->has_exception = true; + + return (0); +} + +int +vmmops_getcap(void *vcpui, int num, int *retval) +{ + int ret; + + ret = ENOENT; + + switch (num) { + case VM_CAP_UNRESTRICTED_GUEST: + *retval = 1; + ret = 0; + break; + default: + break; + } + + return (ret); +} + +int +vmmops_setcap(void *vcpui, int num, int val) +{ + + return (ENOENT); +} diff --git a/sys/arm64/vmm/vmm_call.S b/sys/arm64/vmm/vmm_call.S new file mode 100644 index 000000000000..fc28e3f173eb --- /dev/null +++ b/sys/arm64/vmm/vmm_call.S @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2017 Alexandru 
Elisei <alexandru.elisei@gmail.com> + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include <machine/asm.h> + + .text + +ENTRY(vmm_call_hyp) + hvc #0 + ret +END(vmm_call_hyp) diff --git a/sys/arm64/vmm/vmm_dev.c b/sys/arm64/vmm/vmm_dev.c new file mode 100644 index 000000000000..9f405384f2b3 --- /dev/null +++ b/sys/arm64/vmm/vmm_dev.c @@ -0,0 +1,1054 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/jail.h> +#include <sys/queue.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/malloc.h> +#include <sys/conf.h> +#include <sys/sysctl.h> +#include <sys/libkern.h> +#include <sys/ioccom.h> +#include <sys/mman.h> +#include <sys/uio.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> + +#include <machine/machdep.h> +#include <machine/vmparam.h> +#include <machine/vmm.h> +#include <machine/vmm_dev.h> + +#include "vmm_stat.h" + +#include "io/vgic.h" + +struct devmem_softc { + int segid; + char *name; + struct cdev *cdev; + struct vmmdev_softc *sc; + SLIST_ENTRY(devmem_softc) link; +}; + +struct vmmdev_softc { + struct vm *vm; /* vm instance cookie */ + struct cdev *cdev; + struct ucred *ucred; + SLIST_ENTRY(vmmdev_softc) link; + SLIST_HEAD(, devmem_softc) devmem; + int flags; +}; +#define VSC_LINKED 0x01 + +static SLIST_HEAD(, vmmdev_softc) head; + +static unsigned pr_allow_flag; +static struct mtx vmmdev_mtx; +MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF); + +static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); + +SYSCTL_DECL(_hw_vmm); + +static int vmm_priv_check(struct ucred *ucred); +static int devmem_create_cdev(const char *vmname, int id, char *devmem); +static void devmem_destroy(void *arg); + +static int +vmm_priv_check(struct ucred *ucred) +{ + + if (jailed(ucred) && + !(ucred->cr_prison->pr_allow & pr_allow_flag)) + return (EPERM); + + return (0); +} + +static int +vcpu_lock_one(struct vcpu *vcpu) +{ + int error; + + error = vcpu_set_state(vcpu, VCPU_FROZEN, true); + return (error); +} + +static void +vcpu_unlock_one(struct vcpu *vcpu) +{ + enum vcpu_state state; + + state = vcpu_get_state(vcpu, NULL); + if (state != VCPU_FROZEN) { + panic("vcpu %s(%d) has invalid state %d", + vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state); + } + + vcpu_set_state(vcpu, VCPU_IDLE, false); +} + +static int +vcpu_lock_all(struct vmmdev_softc *sc) +{ + struct vcpu *vcpu; + int error; + uint16_t i, j, maxcpus; + + error = 0; + vm_slock_vcpus(sc->vm); + maxcpus = vm_get_maxcpus(sc->vm); + for (i = 0; i < maxcpus; i++) { + vcpu = vm_vcpu(sc->vm, i); + if (vcpu == NULL) + continue; + error = vcpu_lock_one(vcpu); + if (error) + break; + } + + if (error) { + for (j = 0; j < i; j++) { + vcpu = vm_vcpu(sc->vm, j); + if (vcpu == NULL) + continue; + vcpu_unlock_one(vcpu); + } + vm_unlock_vcpus(sc->vm); + } + + return (error); +} + +static void +vcpu_unlock_all(struct vmmdev_softc *sc) +{ + struct vcpu *vcpu; + uint16_t i, maxcpus; + + maxcpus = vm_get_maxcpus(sc->vm); + for (i = 0; i < maxcpus; i++) { + vcpu = vm_vcpu(sc->vm, i); + if (vcpu == NULL) + continue; + vcpu_unlock_one(vcpu); + } + vm_unlock_vcpus(sc->vm); +} + +static struct vmmdev_softc * +vmmdev_lookup(const char *name) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (strcmp(name, vm_name(sc->vm)) == 0) + break; + } + + if (sc == NULL) + return (NULL); + + if (cr_cansee(curthread->td_ucred, sc->ucred)) + return (NULL); + + return (sc); +} + +static struct vmmdev_softc * +vmmdev_lookup2(struct cdev *cdev) +{ + + return (cdev->si_drv1); +} + +static int +vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) +{ + int error, off, c, prot; + vm_paddr_t gpa, maxaddr; + void *hpa, *cookie; + struct vmmdev_softc *sc; + + error = 
vmm_priv_check(curthread->td_ucred); + if (error) + return (error); + + sc = vmmdev_lookup2(cdev); + if (sc == NULL) + return (ENXIO); + + /* + * Get a read lock on the guest memory map. + */ + vm_slock_memsegs(sc->vm); + + prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); + maxaddr = vmm_sysmem_maxaddr(sc->vm); + while (uio->uio_resid > 0 && error == 0) { + gpa = uio->uio_offset; + off = gpa & PAGE_MASK; + c = min(uio->uio_resid, PAGE_SIZE - off); + + /* + * The VM has a hole in its physical memory map. If we want to + * use 'dd' to inspect memory beyond the hole we need to + * provide bogus data for memory that lies in the hole. + * + * Since this device does not support lseek(2), dd(1) will + * read(2) blocks of data to simulate the lseek(2). + */ + hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie); + if (hpa == NULL) { + if (uio->uio_rw == UIO_READ && gpa < maxaddr) + error = uiomove(__DECONST(void *, zero_region), + c, uio); + else + error = EFAULT; + } else { + error = uiomove(hpa, c, uio); + vm_gpa_release(cookie); + } + } + vm_unlock_memsegs(sc->vm); + return (error); +} + +static int +get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg) +{ + struct devmem_softc *dsc; + int error; + bool sysmem; + + error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); + if (error || mseg->len == 0) + return (error); + + if (!sysmem) { + SLIST_FOREACH(dsc, &sc->devmem, link) { + if (dsc->segid == mseg->segid) + break; + } + KASSERT(dsc != NULL, ("%s: devmem segment %d not found", + __func__, mseg->segid)); + error = copystr(dsc->name, mseg->name, sizeof(mseg->name), + NULL); + } else { + bzero(mseg->name, sizeof(mseg->name)); + } + + return (error); +} + +static int +alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg) +{ + char *name; + int error; + bool sysmem; + + error = 0; + name = NULL; + sysmem = true; + + /* + * The allocation is lengthened by 1 to hold a terminating NUL. It'll + * be stripped off when devfs processes the full string. 
+ */ + if (VM_MEMSEG_NAME(mseg)) { + sysmem = false; + name = malloc(sizeof(mseg->name), M_VMMDEV, M_WAITOK); + error = copystr(mseg->name, name, sizeof(mseg->name), NULL); + if (error) + goto done; + } + + error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); + if (error) + goto done; + + if (VM_MEMSEG_NAME(mseg)) { + error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name); + if (error) + vm_free_memseg(sc->vm, mseg->segid); + else + name = NULL; /* freed when 'cdev' is destroyed */ + } +done: + free(name, M_VMMDEV); + return (error); +} + +static int +vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, + uint64_t *regval) +{ + int error, i; + + error = 0; + for (i = 0; i < count; i++) { + error = vm_get_register(vcpu, regnum[i], ®val[i]); + if (error) + break; + } + return (error); +} + +static int +vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, + uint64_t *regval) +{ + int error, i; + + error = 0; + for (i = 0; i < count; i++) { + error = vm_set_register(vcpu, regnum[i], regval[i]); + if (error) + break; + } + return (error); +} + +static int +vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, + struct thread *td) +{ + int error, vcpuid, size; + cpuset_t *cpuset; + struct vmmdev_softc *sc; + struct vcpu *vcpu; + struct vm_register *vmreg; + struct vm_register_set *vmregset; + struct vm_run *vmrun; + struct vm_vgic_version *vgv; + struct vm_vgic_descr *vgic; + struct vm_cpuset *vm_cpuset; + struct vm_irq *vi; + struct vm_capability *vmcap; + struct vm_stats *vmstats; + struct vm_stat_desc *statdesc; + struct vm_suspend *vmsuspend; + struct vm_exception *vmexc; + struct vm_gla2gpa *gg; + struct vm_memmap *mm; + struct vm_munmap *mu; + struct vm_msi *vmsi; + struct vm_cpu_topology *topology; + uint64_t *regvals; + int *regnums; + enum { NONE, SINGLE, ALL } vcpus_locked; + bool memsegs_locked; + + error = vmm_priv_check(curthread->td_ucred); + if (error) + return (error); + + sc = vmmdev_lookup2(cdev); + if (sc == NULL) + return (ENXIO); + + error = 0; + vcpuid = -1; + vcpu = NULL; + vcpus_locked = NONE; + memsegs_locked = false; + + /* + * Some VMM ioctls can operate only on vcpus that are not running. + */ + switch (cmd) { + case VM_RUN: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_REGISTER_SET: + case VM_SET_REGISTER_SET: + case VM_INJECT_EXCEPTION: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_GLA2GPA_NOFAULT: + case VM_ACTIVATE_CPU: + /* + * ioctls that can operate only on vcpus that are not running. + */ + vcpuid = *(int *)data; + vcpu = vm_alloc_vcpu(sc->vm, vcpuid); + if (vcpu == NULL) { + error = EINVAL; + goto done; + } + error = vcpu_lock_one(vcpu); + if (error) + goto done; + vcpus_locked = SINGLE; + break; + + case VM_ALLOC_MEMSEG: + case VM_MMAP_MEMSEG: + case VM_MUNMAP_MEMSEG: + case VM_REINIT: + case VM_ATTACH_VGIC: + /* + * ioctls that modify the memory map must lock memory + * segments exclusively. + */ + vm_xlock_memsegs(sc->vm); + memsegs_locked = true; + + /* + * ioctls that operate on the entire virtual machine must + * prevent all vcpus from running. + */ + error = vcpu_lock_all(sc); + if (error) + goto done; + vcpus_locked = ALL; + break; + case VM_GET_MEMSEG: + case VM_MMAP_GETNEXT: + /* + * Lock the memory map while it is being inspected. + */ + vm_slock_memsegs(sc->vm); + memsegs_locked = true; + break; + + case VM_STATS: + /* + * These do not need the vCPU locked but do operate on + * a specific vCPU. 
+ */ + vcpuid = *(int *)data; + vcpu = vm_alloc_vcpu(sc->vm, vcpuid); + if (vcpu == NULL) { + error = EINVAL; + goto done; + } + break; + + case VM_SUSPEND_CPU: + case VM_RESUME_CPU: + /* + * These can either operate on all CPUs via a vcpuid of + * -1 or on a specific vCPU. + */ + vcpuid = *(int *)data; + if (vcpuid == -1) + break; + vcpu = vm_alloc_vcpu(sc->vm, vcpuid); + if (vcpu == NULL) { + error = EINVAL; + goto done; + } + break; + + case VM_ASSERT_IRQ: + vi = (struct vm_irq *)data; + error = vm_assert_irq(sc->vm, vi->irq); + break; + case VM_DEASSERT_IRQ: + vi = (struct vm_irq *)data; + error = vm_deassert_irq(sc->vm, vi->irq); + break; + default: + break; + } + + switch (cmd) { + case VM_RUN: { + struct vm_exit *vme; + + vmrun = (struct vm_run *)data; + vme = vm_exitinfo(vcpu); + + error = vm_run(vcpu); + if (error != 0) + break; + + error = copyout(vme, vmrun->vm_exit, sizeof(*vme)); + if (error != 0) + break; + break; + } + case VM_SUSPEND: + vmsuspend = (struct vm_suspend *)data; + error = vm_suspend(sc->vm, vmsuspend->how); + break; + case VM_REINIT: + error = vm_reinit(sc->vm); + break; + case VM_STAT_DESC: { + statdesc = (struct vm_stat_desc *)data; + error = vmm_stat_desc_copy(statdesc->index, + statdesc->desc, sizeof(statdesc->desc)); + break; + } + case VM_STATS: { + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); + vmstats = (struct vm_stats *)data; + getmicrotime(&vmstats->tv); + error = vmm_stat_copy(vcpu, vmstats->index, + nitems(vmstats->statbuf), + &vmstats->num_entries, vmstats->statbuf); + break; + } + case VM_MMAP_GETNEXT: + mm = (struct vm_memmap *)data; + error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, + &mm->segoff, &mm->len, &mm->prot, &mm->flags); + break; + case VM_MMAP_MEMSEG: + mm = (struct vm_memmap *)data; + error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, + mm->len, mm->prot, mm->flags); + break; + case VM_MUNMAP_MEMSEG: + mu = (struct vm_munmap *)data; + error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len); + break; + case VM_ALLOC_MEMSEG: + error = alloc_memseg(sc, (struct vm_memseg *)data); + break; + case VM_GET_MEMSEG: + error = get_memseg(sc, (struct vm_memseg *)data); + break; + case VM_GET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval); + break; + case VM_SET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval); + break; + case VM_GET_REGISTER_SET: + vmregset = (struct vm_register_set *)data; + if (vmregset->count > VM_REG_LAST) { + error = EINVAL; + break; + } + regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, + M_WAITOK); + regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, + M_WAITOK); + error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * + vmregset->count); + if (error == 0) + error = vm_get_register_set(vcpu, vmregset->count, + regnums, regvals); + if (error == 0) + error = copyout(regvals, vmregset->regvals, + sizeof(regvals[0]) * vmregset->count); + free(regvals, M_VMMDEV); + free(regnums, M_VMMDEV); + break; + case VM_SET_REGISTER_SET: + vmregset = (struct vm_register_set *)data; + if (vmregset->count > VM_REG_LAST) { + error = EINVAL; + break; + } + regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, + M_WAITOK); + regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, + M_WAITOK); + error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * + vmregset->count); + if (error == 0) + error = copyin(vmregset->regvals, regvals, + 
sizeof(regvals[0]) * vmregset->count); + if (error == 0) + error = vm_set_register_set(vcpu, vmregset->count, + regnums, regvals); + free(regvals, M_VMMDEV); + free(regnums, M_VMMDEV); + break; + case VM_GET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_get_capability(vcpu, + vmcap->captype, + &vmcap->capval); + break; + case VM_SET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_set_capability(vcpu, + vmcap->captype, + vmcap->capval); + break; + case VM_INJECT_EXCEPTION: + vmexc = (struct vm_exception *)data; + error = vm_inject_exception(vcpu, vmexc->esr, vmexc->far); + break; + case VM_GLA2GPA_NOFAULT: + gg = (struct vm_gla2gpa *)data; + error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla, + gg->prot, &gg->gpa, &gg->fault); + KASSERT(error == 0 || error == EFAULT, + ("%s: vm_gla2gpa unknown error %d", __func__, error)); + break; + case VM_ACTIVATE_CPU: + error = vm_activate_cpu(vcpu); + break; + case VM_GET_CPUS: + error = 0; + vm_cpuset = (struct vm_cpuset *)data; + size = vm_cpuset->cpusetsize; + if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) { + error = ERANGE; + break; + } + cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO); + if (vm_cpuset->which == VM_ACTIVE_CPUS) + *cpuset = vm_active_cpus(sc->vm); + else if (vm_cpuset->which == VM_SUSPENDED_CPUS) + *cpuset = vm_suspended_cpus(sc->vm); + else if (vm_cpuset->which == VM_DEBUG_CPUS) + *cpuset = vm_debug_cpus(sc->vm); + else + error = EINVAL; + if (error == 0) + error = copyout(cpuset, vm_cpuset->cpus, size); + free(cpuset, M_TEMP); + break; + case VM_SUSPEND_CPU: + error = vm_suspend_cpu(sc->vm, vcpu); + break; + case VM_RESUME_CPU: + error = vm_resume_cpu(sc->vm, vcpu); + break; + case VM_GET_VGIC_VERSION: + vgv = (struct vm_vgic_version *)data; + /* TODO: Query the vgic driver for this */ + vgv->version = 3; + vgv->flags = 0; + error = 0; + break; + case VM_ATTACH_VGIC: + vgic = (struct vm_vgic_descr *)data; + error = vm_attach_vgic(sc->vm, vgic); + break; + case VM_RAISE_MSI: + vmsi = (struct vm_msi *)data; + error = vm_raise_msi(sc->vm, vmsi->msg, vmsi->addr, vmsi->bus, + vmsi->slot, vmsi->func); + break; + case VM_SET_TOPOLOGY: + topology = (struct vm_cpu_topology *)data; + error = vm_set_topology(sc->vm, topology->sockets, + topology->cores, topology->threads, topology->maxcpus); + break; + case VM_GET_TOPOLOGY: + topology = (struct vm_cpu_topology *)data; + vm_get_topology(sc->vm, &topology->sockets, &topology->cores, + &topology->threads, &topology->maxcpus); + error = 0; + break; + default: + error = ENOTTY; + break; + } + +done: + if (vcpus_locked == SINGLE) + vcpu_unlock_one(vcpu); + else if (vcpus_locked == ALL) + vcpu_unlock_all(sc); + if (memsegs_locked) + vm_unlock_memsegs(sc->vm); + + /* + * Make sure that no handler returns a kernel-internal + * error value to userspace. 
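+ * ERESTART is permitted as the syscall layer will restart the + * ioctl rather than return it to userspace.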
+ */ + KASSERT(error == ERESTART || error >= 0, + ("vmmdev_ioctl: invalid error return %d", error)); + return (error); +} + +static int +vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, + struct vm_object **objp, int nprot) +{ + struct vmmdev_softc *sc; + vm_paddr_t gpa; + size_t len; + vm_ooffset_t segoff, first, last; + int error, found, segid; + bool sysmem; + + error = vmm_priv_check(curthread->td_ucred); + if (error) + return (error); + + first = *offset; + last = first + mapsize; + if ((nprot & PROT_EXEC) || first < 0 || first >= last) + return (EINVAL); + + sc = vmmdev_lookup2(cdev); + if (sc == NULL) { + /* virtual machine is in the process of being created */ + return (EINVAL); + } + + /* + * Get a read lock on the guest memory map. + */ + vm_slock_memsegs(sc->vm); + + gpa = 0; + found = 0; + while (!found) { + error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, + NULL, NULL); + if (error) + break; + + if (first >= gpa && last <= gpa + len) + found = 1; + else + gpa += len; + } + + if (found) { + error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); + KASSERT(error == 0 && *objp != NULL, + ("%s: invalid memory segment %d", __func__, segid)); + if (sysmem) { + vm_object_reference(*objp); + *offset = segoff + (first - gpa); + } else { + error = EINVAL; + } + } + vm_unlock_memsegs(sc->vm); + return (error); +} + +static void +vmmdev_destroy(void *arg) +{ + struct vmmdev_softc *sc = arg; + struct devmem_softc *dsc; + int error __diagused; + + error = vcpu_lock_all(sc); + KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); + vm_unlock_vcpus(sc->vm); + + while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { + KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); + SLIST_REMOVE_HEAD(&sc->devmem, link); + free(dsc->name, M_VMMDEV); + free(dsc, M_VMMDEV); + } + + if (sc->cdev != NULL) + destroy_dev(sc->cdev); + + if (sc->vm != NULL) + vm_destroy(sc->vm); + + if (sc->ucred != NULL) + crfree(sc->ucred); + + if ((sc->flags & VSC_LINKED) != 0) { + mtx_lock(&vmmdev_mtx); + SLIST_REMOVE(&head, sc, vmmdev_softc, link); + mtx_unlock(&vmmdev_mtx); + } + + free(sc, M_VMMDEV); +} + +static int +sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) +{ + struct devmem_softc *dsc; + struct vmmdev_softc *sc; + struct cdev *cdev; + char *buf; + int error, buflen; + + error = vmm_priv_check(req->td->td_ucred); + if (error) + return (error); + + buflen = VM_MAX_NAMELEN + 1; + buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); + strlcpy(buf, "beavis", buflen); + error = sysctl_handle_string(oidp, buf, buflen, req); + if (error != 0 || req->newptr == NULL) + goto out; + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + if (sc == NULL || sc->cdev == NULL) { + mtx_unlock(&vmmdev_mtx); + error = EINVAL; + goto out; + } + + /* + * Setting 'sc->cdev' to NULL is used to indicate that the VM + * is scheduled for destruction. + */ + cdev = sc->cdev; + sc->cdev = NULL; + mtx_unlock(&vmmdev_mtx); + + /* + * Destroy all cdevs: + * + * - any new operations on the 'cdev' will return an error (ENXIO). 
+ * + * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' + */ + SLIST_FOREACH(dsc, &sc->devmem, link) { + KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); + destroy_dev(dsc->cdev); + devmem_destroy(dsc); + } + destroy_dev(cdev); + vmmdev_destroy(sc); + error = 0; + +out: + free(buf, M_VMMDEV); + return (error); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, + NULL, 0, sysctl_vmm_destroy, "A", + NULL); + +static struct cdevsw vmmdevsw = { + .d_name = "vmmdev", + .d_version = D_VERSION, + .d_ioctl = vmmdev_ioctl, + .d_mmap_single = vmmdev_mmap_single, + .d_read = vmmdev_rw, + .d_write = vmmdev_rw, +}; + +static int +sysctl_vmm_create(SYSCTL_HANDLER_ARGS) +{ + struct vm *vm; + struct cdev *cdev; + struct vmmdev_softc *sc, *sc2; + char *buf; + int error, buflen; + + error = vmm_priv_check(req->td->td_ucred); + if (error) + return (error); + + buflen = VM_MAX_NAMELEN + 1; + buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); + strlcpy(buf, "beavis", buflen); + error = sysctl_handle_string(oidp, buf, buflen, req); + if (error != 0 || req->newptr == NULL) + goto out; + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + mtx_unlock(&vmmdev_mtx); + if (sc != NULL) { + error = EEXIST; + goto out; + } + + error = vm_create(buf, &vm); + if (error != 0) + goto out; + + sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); + sc->ucred = crhold(curthread->td_ucred); + sc->vm = vm; + SLIST_INIT(&sc->devmem); + + /* + * Lookup the name again just in case somebody sneaked in when we + * dropped the lock. + */ + mtx_lock(&vmmdev_mtx); + sc2 = vmmdev_lookup(buf); + if (sc2 == NULL) { + SLIST_INSERT_HEAD(&head, sc, link); + sc->flags |= VSC_LINKED; + } + mtx_unlock(&vmmdev_mtx); + + if (sc2 != NULL) { + vmmdev_destroy(sc); + error = EEXIST; + goto out; + } + + error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred, + UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); + if (error != 0) { + vmmdev_destroy(sc); + goto out; + } + + mtx_lock(&vmmdev_mtx); + sc->cdev = cdev; + sc->cdev->si_drv1 = sc; + mtx_unlock(&vmmdev_mtx); + +out: + free(buf, M_VMMDEV); + return (error); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, create, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, + NULL, 0, sysctl_vmm_create, "A", + NULL); + +void +vmmdev_init(void) +{ + pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, + "Allow use of vmm in a jail."); +} + +int +vmmdev_cleanup(void) +{ + int error; + + if (SLIST_EMPTY(&head)) + error = 0; + else + error = EBUSY; + + return (error); +} + +static int +devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, + struct vm_object **objp, int nprot) +{ + struct devmem_softc *dsc; + vm_ooffset_t first, last; + size_t seglen; + int error; + bool sysmem; + + dsc = cdev->si_drv1; + if (dsc == NULL) { + /* 'cdev' has been created but is not ready for use */ + return (ENXIO); + } + + first = *offset; + last = *offset + len; + if ((nprot & PROT_EXEC) || first < 0 || first >= last) + return (EINVAL); + + vm_slock_memsegs(dsc->sc->vm); + + error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); + KASSERT(error == 0 && !sysmem && *objp != NULL, + ("%s: invalid devmem segment %d", __func__, dsc->segid)); + + if (seglen >= last) + vm_object_reference(*objp); + else + error = 0; + vm_unlock_memsegs(dsc->sc->vm); + return (error); +} + +static struct cdevsw devmemsw = { + .d_name = "devmem", + .d_version = D_VERSION, + .d_mmap_single = 
devmem_mmap_single, +}; + +static int +devmem_create_cdev(const char *vmname, int segid, char *devname) +{ + struct devmem_softc *dsc; + struct vmmdev_softc *sc; + struct cdev *cdev; + int error; + + error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL, + UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname); + if (error) + return (error); + + dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(vmname); + KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname)); + if (sc->cdev == NULL) { + /* virtual machine is being created or destroyed */ + mtx_unlock(&vmmdev_mtx); + free(dsc, M_VMMDEV); + destroy_dev_sched_cb(cdev, NULL, 0); + return (ENODEV); + } + + dsc->segid = segid; + dsc->name = devname; + dsc->cdev = cdev; + dsc->sc = sc; + SLIST_INSERT_HEAD(&sc->devmem, dsc, link); + mtx_unlock(&vmmdev_mtx); + + /* The 'cdev' is ready for use after 'si_drv1' is initialized */ + cdev->si_drv1 = dsc; + return (0); +} + +static void +devmem_destroy(void *arg) +{ + struct devmem_softc *dsc = arg; + + KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__)); + dsc->cdev = NULL; + dsc->sc = NULL; +} diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c new file mode 100644 index 000000000000..9ff250e798e7 --- /dev/null +++ b/sys/arm64/vmm/vmm_hyp.c @@ -0,0 +1,735 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Andrew Turner + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +#include <sys/types.h> +#include <sys/proc.h> + +#include <machine/armreg.h> + +#include "arm64.h" +#include "hyp.h" + +struct hypctx; + +uint64_t vmm_hyp_enter(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, + uint64_t, uint64_t, uint64_t); +uint64_t vmm_enter_guest(struct hypctx *); + +static void +vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest) +{ + uint64_t dfr0; + + /* Store the guest VFP registers */ + if (guest) { + /* Store the timer registers */ + hypctx->vtimer_cpu.cntkctl_el1 = READ_SPECIALREG(cntkctl_el1); + hypctx->vtimer_cpu.virt_timer.cntx_cval_el0 = + READ_SPECIALREG(cntv_cval_el0); + hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0 = + READ_SPECIALREG(cntv_ctl_el0); + + /* Store the GICv3 registers */ + hypctx->vgic_v3_regs.ich_eisr_el2 = + READ_SPECIALREG(ich_eisr_el2); + hypctx->vgic_v3_regs.ich_elrsr_el2 = + READ_SPECIALREG(ich_elrsr_el2); + hypctx->vgic_v3_regs.ich_hcr_el2 = + READ_SPECIALREG(ich_hcr_el2); + hypctx->vgic_v3_regs.ich_misr_el2 = + READ_SPECIALREG(ich_misr_el2); + hypctx->vgic_v3_regs.ich_vmcr_el2 = + READ_SPECIALREG(ich_vmcr_el2); + switch (hypctx->vgic_v3_regs.ich_lr_num - 1) { +#define STORE_LR(x) \ + case x: \ + hypctx->vgic_v3_regs.ich_lr_el2[x] = \ + READ_SPECIALREG(ich_lr ## x ##_el2) + STORE_LR(15); + STORE_LR(14); + STORE_LR(13); + STORE_LR(12); + STORE_LR(11); + STORE_LR(10); + STORE_LR(9); + STORE_LR(8); + STORE_LR(7); + STORE_LR(6); + STORE_LR(5); + STORE_LR(4); + STORE_LR(3); + STORE_LR(2); + STORE_LR(1); + default: + STORE_LR(0); +#undef STORE_LR + } + + switch (hypctx->vgic_v3_regs.ich_apr_num - 1) { +#define STORE_APR(x) \ + case x: \ + hypctx->vgic_v3_regs.ich_ap0r_el2[x] = \ + READ_SPECIALREG(ich_ap0r ## x ##_el2); \ + hypctx->vgic_v3_regs.ich_ap1r_el2[x] = \ + READ_SPECIALREG(ich_ap1r ## x ##_el2) + STORE_APR(3); + STORE_APR(2); + STORE_APR(1); + default: + STORE_APR(0); +#undef STORE_APR + } + } + + dfr0 = READ_SPECIALREG(id_aa64dfr0_el1); + switch (ID_AA64DFR0_BRPs_VAL(dfr0) - 1) { +#define STORE_DBG_BRP(x) \ + case x: \ + hypctx->dbgbcr_el1[x] = \ + READ_SPECIALREG(dbgbcr ## x ## _el1); \ + hypctx->dbgbvr_el1[x] = \ + READ_SPECIALREG(dbgbvr ## x ## _el1) + STORE_DBG_BRP(15); + STORE_DBG_BRP(14); + STORE_DBG_BRP(13); + STORE_DBG_BRP(12); + STORE_DBG_BRP(11); + STORE_DBG_BRP(10); + STORE_DBG_BRP(9); + STORE_DBG_BRP(8); + STORE_DBG_BRP(7); + STORE_DBG_BRP(6); + STORE_DBG_BRP(5); + STORE_DBG_BRP(4); + STORE_DBG_BRP(3); + STORE_DBG_BRP(2); + STORE_DBG_BRP(1); + default: + STORE_DBG_BRP(0); +#undef STORE_DBG_BRP + } + + switch (ID_AA64DFR0_WRPs_VAL(dfr0) - 1) { +#define STORE_DBG_WRP(x) \ + case x: \ + hypctx->dbgwcr_el1[x] = \ + READ_SPECIALREG(dbgwcr ## x ## _el1); \ + hypctx->dbgwvr_el1[x] = \ + READ_SPECIALREG(dbgwvr ## x ## _el1) + STORE_DBG_WRP(15); + STORE_DBG_WRP(14); + STORE_DBG_WRP(13); + STORE_DBG_WRP(12); + STORE_DBG_WRP(11); + STORE_DBG_WRP(10); + STORE_DBG_WRP(9); + STORE_DBG_WRP(8); + STORE_DBG_WRP(7); + STORE_DBG_WRP(6); + STORE_DBG_WRP(5); + STORE_DBG_WRP(4); + STORE_DBG_WRP(3); + STORE_DBG_WRP(2); + STORE_DBG_WRP(1); + default: + STORE_DBG_WRP(0); +#undef STORE_DBG_WRP + } + + /* Store the PMU registers */ + hypctx->pmcr_el0 = READ_SPECIALREG(pmcr_el0); + hypctx->pmccntr_el0 = READ_SPECIALREG(pmccntr_el0); + hypctx->pmccfiltr_el0 = READ_SPECIALREG(pmccfiltr_el0); + hypctx->pmcntenset_el0 = READ_SPECIALREG(pmcntenset_el0); + hypctx->pmintenset_el1 = READ_SPECIALREG(pmintenset_el1); + hypctx->pmovsset_el0 = READ_SPECIALREG(pmovsset_el0); + hypctx->pmuserenr_el0 = 
READ_SPECIALREG(pmuserenr_el0);
+	switch ((hypctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT) {
+#define	STORE_PMU(x)						\
+	case (x + 1):						\
+		hypctx->pmevcntr_el0[x] =			\
+		    READ_SPECIALREG(pmevcntr ## x ## _el0);	\
+		hypctx->pmevtyper_el0[x] =			\
+		    READ_SPECIALREG(pmevtyper ## x ## _el0)
+	STORE_PMU(30);
+	STORE_PMU(29);
+	STORE_PMU(28);
+	STORE_PMU(27);
+	STORE_PMU(26);
+	STORE_PMU(25);
+	STORE_PMU(24);
+	STORE_PMU(23);
+	STORE_PMU(22);
+	STORE_PMU(21);
+	STORE_PMU(20);
+	STORE_PMU(19);
+	STORE_PMU(18);
+	STORE_PMU(17);
+	STORE_PMU(16);
+	STORE_PMU(15);
+	STORE_PMU(14);
+	STORE_PMU(13);
+	STORE_PMU(12);
+	STORE_PMU(11);
+	STORE_PMU(10);
+	STORE_PMU(9);
+	STORE_PMU(8);
+	STORE_PMU(7);
+	STORE_PMU(6);
+	STORE_PMU(5);
+	STORE_PMU(4);
+	STORE_PMU(3);
+	STORE_PMU(2);
+	STORE_PMU(1);
+	STORE_PMU(0);
+	default:	/* N == 0 when only PMCCNTR_EL0 is available */
+		break;
+#undef STORE_PMU
+	}
+
+	/* Store the special registers to the trapframe */
+	hypctx->tf.tf_sp = READ_SPECIALREG(sp_el1);
+	hypctx->tf.tf_elr = READ_SPECIALREG(elr_el2);
+	hypctx->tf.tf_spsr = READ_SPECIALREG(spsr_el2);
+	if (guest) {
+		hypctx->tf.tf_esr = READ_SPECIALREG(esr_el2);
+	}
+
+	/* Store the guest special registers */
+	hypctx->elr_el1 = READ_SPECIALREG(elr_el1);
+	hypctx->sp_el0 = READ_SPECIALREG(sp_el0);
+	hypctx->tpidr_el0 = READ_SPECIALREG(tpidr_el0);
+	hypctx->tpidrro_el0 = READ_SPECIALREG(tpidrro_el0);
+	hypctx->tpidr_el1 = READ_SPECIALREG(tpidr_el1);
+	hypctx->vbar_el1 = READ_SPECIALREG(vbar_el1);
+
+	hypctx->actlr_el1 = READ_SPECIALREG(actlr_el1);
+	hypctx->afsr0_el1 = READ_SPECIALREG(afsr0_el1);
+	hypctx->afsr1_el1 = READ_SPECIALREG(afsr1_el1);
+	hypctx->amair_el1 = READ_SPECIALREG(amair_el1);
+	hypctx->contextidr_el1 = READ_SPECIALREG(contextidr_el1);
+	hypctx->cpacr_el1 = READ_SPECIALREG(cpacr_el1);
+	hypctx->csselr_el1 = READ_SPECIALREG(csselr_el1);
+	hypctx->esr_el1 = READ_SPECIALREG(esr_el1);
+	hypctx->far_el1 = READ_SPECIALREG(far_el1);
+	hypctx->mair_el1 = READ_SPECIALREG(mair_el1);
+	hypctx->mdccint_el1 = READ_SPECIALREG(mdccint_el1);
+	hypctx->mdscr_el1 = READ_SPECIALREG(mdscr_el1);
+	hypctx->par_el1 = READ_SPECIALREG(par_el1);
+	hypctx->sctlr_el1 = READ_SPECIALREG(sctlr_el1);
+	hypctx->spsr_el1 = READ_SPECIALREG(spsr_el1);
+	hypctx->tcr_el1 = READ_SPECIALREG(tcr_el1);
+	/* TODO: Support when this is not res0 */
+	hypctx->tcr2_el1 = 0;
+	hypctx->ttbr0_el1 = READ_SPECIALREG(ttbr0_el1);
+	hypctx->ttbr1_el1 = READ_SPECIALREG(ttbr1_el1);
+
+	hypctx->cptr_el2 = READ_SPECIALREG(cptr_el2);
+	hypctx->hcr_el2 = READ_SPECIALREG(hcr_el2);
+	hypctx->vpidr_el2 = READ_SPECIALREG(vpidr_el2);
+	hypctx->vmpidr_el2 = READ_SPECIALREG(vmpidr_el2);
+}
+
+static void
+vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest)
+{
+	uint64_t dfr0;
+
+	/* Restore the special registers */
+	WRITE_SPECIALREG(elr_el1, hypctx->elr_el1);
+	WRITE_SPECIALREG(sp_el0, hypctx->sp_el0);
+	WRITE_SPECIALREG(tpidr_el0, hypctx->tpidr_el0);
+	WRITE_SPECIALREG(tpidrro_el0, hypctx->tpidrro_el0);
+	WRITE_SPECIALREG(tpidr_el1, hypctx->tpidr_el1);
+	WRITE_SPECIALREG(vbar_el1, hypctx->vbar_el1);
+
+	WRITE_SPECIALREG(actlr_el1, hypctx->actlr_el1);
+	WRITE_SPECIALREG(afsr0_el1, hypctx->afsr0_el1);
+	WRITE_SPECIALREG(afsr1_el1, hypctx->afsr1_el1);
+	WRITE_SPECIALREG(amair_el1, hypctx->amair_el1);
+	WRITE_SPECIALREG(contextidr_el1, hypctx->contextidr_el1);
+	WRITE_SPECIALREG(cpacr_el1, hypctx->cpacr_el1);
+	WRITE_SPECIALREG(csselr_el1, hypctx->csselr_el1);
+	WRITE_SPECIALREG(esr_el1, hypctx->esr_el1);
+	WRITE_SPECIALREG(far_el1, hypctx->far_el1);
+
WRITE_SPECIALREG(mdccint_el1, hypctx->mdccint_el1); + WRITE_SPECIALREG(mdscr_el1, hypctx->mdscr_el1); + WRITE_SPECIALREG(mair_el1, hypctx->mair_el1); + WRITE_SPECIALREG(par_el1, hypctx->par_el1); + WRITE_SPECIALREG(sctlr_el1, hypctx->sctlr_el1); + WRITE_SPECIALREG(tcr_el1, hypctx->tcr_el1); + /* TODO: tcr2_el1 */ + WRITE_SPECIALREG(ttbr0_el1, hypctx->ttbr0_el1); + WRITE_SPECIALREG(ttbr1_el1, hypctx->ttbr1_el1); + WRITE_SPECIALREG(spsr_el1, hypctx->spsr_el1); + + WRITE_SPECIALREG(cptr_el2, hypctx->cptr_el2); + WRITE_SPECIALREG(hcr_el2, hypctx->hcr_el2); + WRITE_SPECIALREG(vpidr_el2, hypctx->vpidr_el2); + WRITE_SPECIALREG(vmpidr_el2, hypctx->vmpidr_el2); + + /* Load the special regs from the trapframe */ + WRITE_SPECIALREG(sp_el1, hypctx->tf.tf_sp); + WRITE_SPECIALREG(elr_el2, hypctx->tf.tf_elr); + WRITE_SPECIALREG(spsr_el2, hypctx->tf.tf_spsr); + + /* Restore the PMU registers */ + WRITE_SPECIALREG(pmcr_el0, hypctx->pmcr_el0); + WRITE_SPECIALREG(pmccntr_el0, hypctx->pmccntr_el0); + WRITE_SPECIALREG(pmccfiltr_el0, hypctx->pmccfiltr_el0); + /* Clear all events/interrupts then enable them */ + WRITE_SPECIALREG(pmcntenclr_el0, 0xfffffffful); + WRITE_SPECIALREG(pmcntenset_el0, hypctx->pmcntenset_el0); + WRITE_SPECIALREG(pmintenclr_el1, 0xfffffffful); + WRITE_SPECIALREG(pmintenset_el1, hypctx->pmintenset_el1); + WRITE_SPECIALREG(pmovsclr_el0, 0xfffffffful); + WRITE_SPECIALREG(pmovsset_el0, hypctx->pmovsset_el0); + + switch ((hypctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT) { +#define LOAD_PMU(x) \ + case (x + 1): \ + WRITE_SPECIALREG(pmevcntr ## x ## _el0, \ + hypctx->pmevcntr_el0[x]); \ + WRITE_SPECIALREG(pmevtyper ## x ## _el0, \ + hypctx->pmevtyper_el0[x]) + LOAD_PMU(30); + LOAD_PMU(29); + LOAD_PMU(28); + LOAD_PMU(27); + LOAD_PMU(26); + LOAD_PMU(25); + LOAD_PMU(24); + LOAD_PMU(23); + LOAD_PMU(22); + LOAD_PMU(21); + LOAD_PMU(20); + LOAD_PMU(19); + LOAD_PMU(18); + LOAD_PMU(17); + LOAD_PMU(16); + LOAD_PMU(15); + LOAD_PMU(14); + LOAD_PMU(13); + LOAD_PMU(12); + LOAD_PMU(11); + LOAD_PMU(10); + LOAD_PMU(9); + LOAD_PMU(8); + LOAD_PMU(7); + LOAD_PMU(6); + LOAD_PMU(5); + LOAD_PMU(4); + LOAD_PMU(3); + LOAD_PMU(2); + LOAD_PMU(1); + LOAD_PMU(0); + default: /* N == 0 when only PMCCNTR_EL0 is available */ + break; +#undef LOAD_PMU + } + + dfr0 = READ_SPECIALREG(id_aa64dfr0_el1); + switch (ID_AA64DFR0_BRPs_VAL(dfr0) - 1) { +#define LOAD_DBG_BRP(x) \ + case x: \ + WRITE_SPECIALREG(dbgbcr ## x ## _el1, \ + hypctx->dbgbcr_el1[x]); \ + WRITE_SPECIALREG(dbgbvr ## x ## _el1, \ + hypctx->dbgbvr_el1[x]) + LOAD_DBG_BRP(15); + LOAD_DBG_BRP(14); + LOAD_DBG_BRP(13); + LOAD_DBG_BRP(12); + LOAD_DBG_BRP(11); + LOAD_DBG_BRP(10); + LOAD_DBG_BRP(9); + LOAD_DBG_BRP(8); + LOAD_DBG_BRP(7); + LOAD_DBG_BRP(6); + LOAD_DBG_BRP(5); + LOAD_DBG_BRP(4); + LOAD_DBG_BRP(3); + LOAD_DBG_BRP(2); + LOAD_DBG_BRP(1); + default: + LOAD_DBG_BRP(0); +#undef LOAD_DBG_BRP + } + + switch (ID_AA64DFR0_WRPs_VAL(dfr0) - 1) { +#define LOAD_DBG_WRP(x) \ + case x: \ + WRITE_SPECIALREG(dbgwcr ## x ## _el1, \ + hypctx->dbgwcr_el1[x]); \ + WRITE_SPECIALREG(dbgwvr ## x ## _el1, \ + hypctx->dbgwvr_el1[x]) + LOAD_DBG_WRP(15); + LOAD_DBG_WRP(14); + LOAD_DBG_WRP(13); + LOAD_DBG_WRP(12); + LOAD_DBG_WRP(11); + LOAD_DBG_WRP(10); + LOAD_DBG_WRP(9); + LOAD_DBG_WRP(8); + LOAD_DBG_WRP(7); + LOAD_DBG_WRP(6); + LOAD_DBG_WRP(5); + LOAD_DBG_WRP(4); + LOAD_DBG_WRP(3); + LOAD_DBG_WRP(2); + LOAD_DBG_WRP(1); + default: + LOAD_DBG_WRP(0); +#undef LOAD_DBG_WRP + } + + if (guest) { + /* Load the timer registers */ + WRITE_SPECIALREG(cntkctl_el1, hypctx->vtimer_cpu.cntkctl_el1); + 
WRITE_SPECIALREG(cntv_cval_el0, + hypctx->vtimer_cpu.virt_timer.cntx_cval_el0); + WRITE_SPECIALREG(cntv_ctl_el0, + hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0); + WRITE_SPECIALREG(cnthctl_el2, hyp->vtimer.cnthctl_el2); + WRITE_SPECIALREG(cntvoff_el2, hyp->vtimer.cntvoff_el2); + + /* Load the GICv3 registers */ + WRITE_SPECIALREG(ich_hcr_el2, hypctx->vgic_v3_regs.ich_hcr_el2); + WRITE_SPECIALREG(ich_vmcr_el2, + hypctx->vgic_v3_regs.ich_vmcr_el2); + switch (hypctx->vgic_v3_regs.ich_lr_num - 1) { +#define LOAD_LR(x) \ + case x: \ + WRITE_SPECIALREG(ich_lr ## x ##_el2, \ + hypctx->vgic_v3_regs.ich_lr_el2[x]) + LOAD_LR(15); + LOAD_LR(14); + LOAD_LR(13); + LOAD_LR(12); + LOAD_LR(11); + LOAD_LR(10); + LOAD_LR(9); + LOAD_LR(8); + LOAD_LR(7); + LOAD_LR(6); + LOAD_LR(5); + LOAD_LR(4); + LOAD_LR(3); + LOAD_LR(2); + LOAD_LR(1); + default: + LOAD_LR(0); +#undef LOAD_LR + } + + switch (hypctx->vgic_v3_regs.ich_apr_num - 1) { +#define LOAD_APR(x) \ + case x: \ + WRITE_SPECIALREG(ich_ap0r ## x ##_el2, \ + hypctx->vgic_v3_regs.ich_ap0r_el2[x]); \ + WRITE_SPECIALREG(ich_ap1r ## x ##_el2, \ + hypctx->vgic_v3_regs.ich_ap1r_el2[x]) + LOAD_APR(3); + LOAD_APR(2); + LOAD_APR(1); + default: + LOAD_APR(0); +#undef LOAD_APR + } + } +} + +static uint64_t +vmm_hyp_call_guest(struct hyp *hyp, struct hypctx *hypctx) +{ + struct hypctx host_hypctx; + uint64_t cntvoff_el2; + uint64_t ich_hcr_el2, ich_vmcr_el2, cnthctl_el2, cntkctl_el1; + uint64_t ret; + uint64_t s1e1r, hpfar_el2; + bool hpfar_valid; + + vmm_hyp_reg_store(&host_hypctx, NULL, false); + + /* Save the host special registers */ + cnthctl_el2 = READ_SPECIALREG(cnthctl_el2); + cntkctl_el1 = READ_SPECIALREG(cntkctl_el1); + cntvoff_el2 = READ_SPECIALREG(cntvoff_el2); + + ich_hcr_el2 = READ_SPECIALREG(ich_hcr_el2); + ich_vmcr_el2 = READ_SPECIALREG(ich_vmcr_el2); + + vmm_hyp_reg_restore(hypctx, hyp, true); + + /* Load the common hypervisor registers */ + WRITE_SPECIALREG(vttbr_el2, hyp->vttbr_el2); + + host_hypctx.mdcr_el2 = READ_SPECIALREG(mdcr_el2); + WRITE_SPECIALREG(mdcr_el2, hypctx->mdcr_el2); + + /* Call into the guest */ + ret = vmm_enter_guest(hypctx); + + WRITE_SPECIALREG(mdcr_el2, host_hypctx.mdcr_el2); + isb(); + + /* Store the exit info */ + hypctx->exit_info.far_el2 = READ_SPECIALREG(far_el2); + vmm_hyp_reg_store(hypctx, hyp, true); + + hpfar_valid = true; + if (ret == EXCP_TYPE_EL1_SYNC) { + switch (ESR_ELx_EXCEPTION(hypctx->tf.tf_esr)) { + case EXCP_INSN_ABORT_L: + case EXCP_DATA_ABORT_L: + /* + * The hpfar_el2 register is valid for: + * - Translation and Access faults. + * - Translation, Access, and permission faults on + * the translation table walk on the stage 1 tables. + * - A stage 2 Address size fault. + * + * As we only need it in the first 2 cases we can just + * exclude it on permission faults that are not from + * the stage 1 table walk. + * + * TODO: Add a case for Arm erratum 834220. + */ + if ((hypctx->tf.tf_esr & ISS_DATA_S1PTW) != 0) + break; + switch (hypctx->tf.tf_esr & ISS_DATA_DFSC_MASK) { + case ISS_DATA_DFSC_PF_L1: + case ISS_DATA_DFSC_PF_L2: + case ISS_DATA_DFSC_PF_L3: + hpfar_valid = false; + break; + } + break; + } + } + if (hpfar_valid) { + hypctx->exit_info.hpfar_el2 = READ_SPECIALREG(hpfar_el2); + } else { + /* + * TODO: There is a risk the at instruction could cause an + * exception here. We should handle it & return a failure. 
+ */ + s1e1r = + arm64_address_translate_s1e1r(hypctx->exit_info.far_el2); + if (PAR_SUCCESS(s1e1r)) { + hpfar_el2 = (s1e1r & PAR_PA_MASK) >> PAR_PA_SHIFT; + hpfar_el2 <<= HPFAR_EL2_FIPA_SHIFT; + hypctx->exit_info.hpfar_el2 = hpfar_el2; + } else { + ret = EXCP_TYPE_REENTER; + } + } + + vmm_hyp_reg_restore(&host_hypctx, NULL, false); + + /* Restore the host special registers */ + WRITE_SPECIALREG(ich_hcr_el2, ich_hcr_el2); + WRITE_SPECIALREG(ich_vmcr_el2, ich_vmcr_el2); + + WRITE_SPECIALREG(cnthctl_el2, cnthctl_el2); + WRITE_SPECIALREG(cntkctl_el1, cntkctl_el1); + WRITE_SPECIALREG(cntvoff_el2, cntvoff_el2); + + return (ret); +} + +static uint64_t +vmm_hyp_read_reg(uint64_t reg) +{ + switch (reg) { + case HYP_REG_ICH_VTR: + return (READ_SPECIALREG(ich_vtr_el2)); + case HYP_REG_CNTHCTL: + return (READ_SPECIALREG(cnthctl_el2)); + } + + return (0); +} + +static int +vmm_clean_s2_tlbi(void) +{ + dsb(ishst); + __asm __volatile("tlbi alle1is"); + dsb(ish); + + return (0); +} + +static int +vm_s2_tlbi_range(uint64_t vttbr, vm_offset_t sva, vm_size_t eva, + bool final_only) +{ + uint64_t end, r, start; + uint64_t host_vttbr; + +#define TLBI_VA_SHIFT 12 +#define TLBI_VA_MASK ((1ul << 44) - 1) +#define TLBI_VA(addr) (((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK) +#define TLBI_VA_L3_INCR (L3_SIZE >> TLBI_VA_SHIFT) + + /* Switch to the guest vttbr */ + /* TODO: Handle Cortex-A57/A72 erratum 131936 */ + host_vttbr = READ_SPECIALREG(vttbr_el2); + WRITE_SPECIALREG(vttbr_el2, vttbr); + isb(); + + /* + * The CPU can cache the stage 1 + 2 combination so we need to ensure + * the stage 2 is invalidated first, then when this has completed we + * invalidate the stage 1 TLB. As we don't know which stage 1 virtual + * addresses point at the stage 2 IPA we need to invalidate the entire + * stage 1 TLB. + */ + + start = TLBI_VA(sva); + end = TLBI_VA(eva); + for (r = start; r < end; r += TLBI_VA_L3_INCR) { + /* Invalidate the stage 2 TLB entry */ + if (final_only) + __asm __volatile("tlbi ipas2le1is, %0" : : "r"(r)); + else + __asm __volatile("tlbi ipas2e1is, %0" : : "r"(r)); + } + /* Ensure the entry has been invalidated */ + dsb(ish); + /* Invalidate the stage 1 TLB. 
*/
+	__asm __volatile("tlbi vmalle1is");
+	dsb(ish);
+	isb();
+
+	/* Switch back to the host vttbr */
+	WRITE_SPECIALREG(vttbr_el2, host_vttbr);
+	isb();
+
+	return (0);
+}
+
+static int
+vm_s2_tlbi_all(uint64_t vttbr)
+{
+	uint64_t host_vttbr;
+
+	/* Switch to the guest vttbr */
+	/* TODO: Handle Cortex-A57/A72 erratum 131936 */
+	host_vttbr = READ_SPECIALREG(vttbr_el2);
+	WRITE_SPECIALREG(vttbr_el2, vttbr);
+	isb();
+
+	__asm __volatile("tlbi vmalls12e1is");
+	dsb(ish);
+	isb();
+
+	/* Switch back to the host vttbr */
+	WRITE_SPECIALREG(vttbr_el2, host_vttbr);
+	isb();
+
+	return (0);
+}
+
+static int
+vmm_dc_civac(uint64_t start, uint64_t len)
+{
+	size_t line_size, end;
+	uint64_t ctr;
+
+	ctr = READ_SPECIALREG(ctr_el0);
+	line_size = sizeof(int) << CTR_DLINE_SIZE(ctr);
+	end = start + len;
+	dsb(ishst);
+	/* Clean and Invalidate the D-cache */
+	for (; start < end; start += line_size)
+		__asm __volatile("dc civac, %0" :: "r" (start) : "memory");
+	dsb(ish);
+	return (0);
+}
+
+static int
+vmm_el2_tlbi(uint64_t type, uint64_t start, uint64_t len)
+{
+	uint64_t end, r;
+
+	dsb(ishst);
+	switch (type) {
+	default:
+	case HYP_EL2_TLBI_ALL:
+		__asm __volatile("tlbi alle2" ::: "memory");
+		break;
+	case HYP_EL2_TLBI_VA:
+		end = TLBI_VA(start + len);
+		start = TLBI_VA(start);
+		for (r = start; r < end; r += TLBI_VA_L3_INCR) {
+			__asm __volatile("tlbi vae2is, %0" :: "r"(r));
+		}
+		break;
+	}
+	dsb(ish);
+
+	return (0);
+}
+
+uint64_t
+vmm_hyp_enter(uint64_t handle, uint64_t x1, uint64_t x2, uint64_t x3,
+    uint64_t x4, uint64_t x5, uint64_t x6, uint64_t x7)
+{
+	uint64_t ret;
+
+	switch (handle) {
+	case HYP_ENTER_GUEST:
+		do {
+			ret = vmm_hyp_call_guest((struct hyp *)x1,
+			    (struct hypctx *)x2);
+		} while (ret == EXCP_TYPE_REENTER);
+		return (ret);
+	case HYP_READ_REGISTER:
+		return (vmm_hyp_read_reg(x1));
+	case HYP_CLEAN_S2_TLBI:
+		return (vmm_clean_s2_tlbi());
+	case HYP_DC_CIVAC:
+		return (vmm_dc_civac(x1, x2));
+	case HYP_EL2_TLBI:
+		return (vmm_el2_tlbi(x1, x2, x3));
+	case HYP_S2_TLBI_RANGE:
+		return (vm_s2_tlbi_range(x1, x2, x3, x4));
+	case HYP_S2_TLBI_ALL:
+		return (vm_s2_tlbi_all(x1));
+	case HYP_CLEANUP:	/* Handled in vmm_hyp_exception.S */
+	default:
+		break;
+	}
+
+	return (0);
+}
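A minimal sketch of how the host side drives the handles dispatched in vmm_hyp_enter() above, assuming the vmm_call_hyp() wrapper used later in vmm_mmu.c simply forwards the handle and its arguments to EL2; the function name example_hyp_calls and the el2_hyp, el2_hypctx, gpa and len parameters are illustrative placeholders, not identifiers from this change:

	/*
	 * Illustrative sketch only. vmm_call_hyp()'s exact prototype is not
	 * shown in this file; the call shapes mirror the switch above.
	 */
	static void
	example_hyp_calls(uint64_t el2_hyp, uint64_t el2_hypctx, uint64_t gpa,
	    uint64_t len)
	{
		uint64_t excp, ich_vtr;

		/* Run a vcpu; EXCP_TYPE_REENTER retries are absorbed at EL2 */
		excp = vmm_call_hyp(HYP_ENTER_GUEST, el2_hyp, el2_hypctx);

		/* Read an EL2-only register, e.g. ICH_VTR_EL2 */
		ich_vtr = vmm_call_hyp(HYP_READ_REGISTER, HYP_REG_ICH_VTR);

		/* Clean and invalidate a range of memory from the D-cache */
		vmm_call_hyp(HYP_DC_CIVAC, gpa, len);

		(void)excp;
		(void)ich_vtr;
	}

The returned value for HYP_ENTER_GUEST is one of the EXCP_TYPE_* codes produced by the exception handlers in vmm_hyp_exception.S below.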
diff --git a/sys/arm64/vmm/vmm_hyp_el2.S b/sys/arm64/vmm/vmm_hyp_el2.S
new file mode 100644
index 000000000000..7b49d3144dff
--- /dev/null
+++ b/sys/arm64/vmm/vmm_hyp_el2.S
@@ -0,0 +1,39 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Andrew Turner
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/param.h>
+
+	.section .rodata
+	.align	PAGE_SHIFT
+	.globl	vmm_hyp_code
+vmm_hyp_code:
+	.incbin	"vmm_hyp_blob.bin"
+	.globl	vmm_hyp_code_end
+vmm_hyp_code_end:
diff --git a/sys/arm64/vmm/vmm_hyp_exception.S b/sys/arm64/vmm/vmm_hyp_exception.S
new file mode 100644
index 000000000000..77cb8cfd6cd7
--- /dev/null
+++ b/sys/arm64/vmm/vmm_hyp_exception.S
@@ -0,0 +1,384 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (C) 2017 Alexandru Elisei <alexandru.elisei@gmail.com>
+ * Copyright (c) 2021 Andrew Turner
+ *
+ * This software was developed by Alexandru Elisei under sponsorship
+ * from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + + +#include <machine/asm.h> +#include <machine/hypervisor.h> + +#include "assym.inc" +#include "hyp.h" + +.macro save_host_registers + /* TODO: Only store callee saved registers */ + sub sp, sp, #(32 * 8) + str x30, [sp, #(30 * 8)] + stp x28, x29, [sp, #(28 * 8)] + stp x26, x27, [sp, #(26 * 8)] + stp x24, x25, [sp, #(24 * 8)] + stp x22, x23, [sp, #(22 * 8)] + stp x20, x21, [sp, #(20 * 8)] + stp x18, x19, [sp, #(18 * 8)] + stp x16, x17, [sp, #(16 * 8)] + stp x14, x15, [sp, #(14 * 8)] + stp x12, x13, [sp, #(12 * 8)] + stp x10, x11, [sp, #(10 * 8)] + stp x8, x9, [sp, #(8 * 8)] + stp x6, x7, [sp, #(6 * 8)] + stp x4, x5, [sp, #(4 * 8)] + stp x2, x3, [sp, #(2 * 8)] + stp x0, x1, [sp, #(0 * 8)] +.endm + +.macro restore_host_registers + /* TODO: Only restore callee saved registers */ + ldp x0, x1, [sp, #(0 * 8)] + ldp x2, x3, [sp, #(2 * 8)] + ldp x4, x5, [sp, #(4 * 8)] + ldp x6, x7, [sp, #(6 * 8)] + ldp x8, x9, [sp, #(8 * 8)] + ldp x10, x11, [sp, #(10 * 8)] + ldp x12, x13, [sp, #(12 * 8)] + ldp x14, x15, [sp, #(14 * 8)] + ldp x16, x17, [sp, #(16 * 8)] + ldp x18, x19, [sp, #(18 * 8)] + ldp x20, x21, [sp, #(20 * 8)] + ldp x22, x23, [sp, #(22 * 8)] + ldp x24, x25, [sp, #(24 * 8)] + ldp x26, x27, [sp, #(26 * 8)] + ldp x28, x29, [sp, #(28 * 8)] + ldr x30, [sp, #(30 * 8)] + add sp, sp, #(32 * 8) +.endm + +.macro save_guest_registers + /* Back up x0 so we can use it as a temporary register */ + stp x0, x1, [sp, #-(2 * 8)]! + + /* Restore the hypctx pointer */ + mrs x0, tpidr_el2 + + stp x2, x3, [x0, #(TF_X + 2 * 8)] + stp x4, x5, [x0, #(TF_X + 4 * 8)] + stp x6, x7, [x0, #(TF_X + 6 * 8)] + stp x8, x9, [x0, #(TF_X + 8 * 8)] + stp x10, x11, [x0, #(TF_X + 10 * 8)] + stp x12, x13, [x0, #(TF_X + 12 * 8)] + stp x14, x15, [x0, #(TF_X + 14 * 8)] + stp x16, x17, [x0, #(TF_X + 16 * 8)] + stp x18, x19, [x0, #(TF_X + 18 * 8)] + stp x20, x21, [x0, #(TF_X + 20 * 8)] + stp x22, x23, [x0, #(TF_X + 22 * 8)] + stp x24, x25, [x0, #(TF_X + 24 * 8)] + stp x26, x27, [x0, #(TF_X + 26 * 8)] + stp x28, x29, [x0, #(TF_X + 28 * 8)] + + str lr, [x0, #(TF_LR)] + + /* Restore the saved x0 & x1 and save them */ + ldp x2, x3, [sp], #(2 * 8) + stp x2, x3, [x0, #(TF_X + 0 * 8)] +.endm + +.macro restore_guest_registers + /* + * Copy the guest x0 and x1 to the stack so we can restore them + * after loading the other registers. + */ + ldp x2, x3, [x0, #(TF_X + 0 * 8)] + stp x2, x3, [sp, #-(2 * 8)]! 
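	/*
	 * Descriptive note: x0 still holds the hypctx pointer here, which is
	 * why the guest's x0/x1 are parked on the stack above and are only
	 * popped at the very end, after every other register has been loaded
	 * from the trapframe at [x0, #(TF_X + n * 8)].
	 */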
+ + ldr lr, [x0, #(TF_LR)] + + ldp x28, x29, [x0, #(TF_X + 28 * 8)] + ldp x26, x27, [x0, #(TF_X + 26 * 8)] + ldp x24, x25, [x0, #(TF_X + 24 * 8)] + ldp x22, x23, [x0, #(TF_X + 22 * 8)] + ldp x20, x21, [x0, #(TF_X + 20 * 8)] + ldp x18, x19, [x0, #(TF_X + 18 * 8)] + ldp x16, x17, [x0, #(TF_X + 16 * 8)] + ldp x14, x15, [x0, #(TF_X + 14 * 8)] + ldp x12, x13, [x0, #(TF_X + 12 * 8)] + ldp x10, x11, [x0, #(TF_X + 10 * 8)] + ldp x8, x9, [x0, #(TF_X + 8 * 8)] + ldp x6, x7, [x0, #(TF_X + 6 * 8)] + ldp x4, x5, [x0, #(TF_X + 4 * 8)] + ldp x2, x3, [x0, #(TF_X + 2 * 8)] + + ldp x0, x1, [sp], #(2 * 8) +.endm + +.macro vempty + .align 7 + 1: b 1b +.endm + +.macro vector name + .align 7 + b handle_\name +.endm + + .section ".vmm_vectors","ax" + .align 11 +hyp_init_vectors: + vempty /* Synchronous EL2t */ + vempty /* IRQ EL2t */ + vempty /* FIQ EL2t */ + vempty /* Error EL2t */ + + vempty /* Synchronous EL2h */ + vempty /* IRQ EL2h */ + vempty /* FIQ EL2h */ + vempty /* Error EL2h */ + + vector hyp_init /* Synchronous 64-bit EL1 */ + vempty /* IRQ 64-bit EL1 */ + vempty /* FIQ 64-bit EL1 */ + vempty /* Error 64-bit EL1 */ + + vempty /* Synchronous 32-bit EL1 */ + vempty /* IRQ 32-bit EL1 */ + vempty /* FIQ 32-bit EL1 */ + vempty /* Error 32-bit EL1 */ + + .text + .align 11 +hyp_vectors: + vempty /* Synchronous EL2t */ + vempty /* IRQ EL2t */ + vempty /* FIQ EL2t */ + vempty /* Error EL2t */ + + vector el2_el2h_sync /* Synchronous EL2h */ + vector el2_el2h_irq /* IRQ EL2h */ + vector el2_el2h_fiq /* FIQ EL2h */ + vector el2_el2h_error /* Error EL2h */ + + vector el2_el1_sync64 /* Synchronous 64-bit EL1 */ + vector el2_el1_irq64 /* IRQ 64-bit EL1 */ + vector el2_el1_fiq64 /* FIQ 64-bit EL1 */ + vector el2_el1_error64 /* Error 64-bit EL1 */ + + vempty /* Synchronous 32-bit EL1 */ + vempty /* IRQ 32-bit EL1 */ + vempty /* FIQ 32-bit EL1 */ + vempty /* Error 32-bit EL1 */ + +/* + * Initialize the hypervisor mode with a new exception vector table, translation + * table and stack. + * + * Expecting: + * x0 - translation tables physical address + * x1 - stack top virtual address + * x2 - TCR_EL2 value + * x3 - SCTLR_EL2 value + * x4 - VTCR_EL2 value + */ +LENTRY(handle_hyp_init) + /* Install the new exception vectors */ + adrp x6, hyp_vectors + add x6, x6, :lo12:hyp_vectors + msr vbar_el2, x6 + /* Set the stack top address */ + mov sp, x1 + /* Use the host VTTBR_EL2 to tell the host and the guests apart */ + mov x9, #VTTBR_HOST + msr vttbr_el2, x9 + /* Load the base address for the translation tables */ + msr ttbr0_el2, x0 + /* Invalidate the TLB */ + tlbi alle2 + /* Use the same memory attributes as EL1 */ + mrs x9, mair_el1 + msr mair_el2, x9 + /* Configure address translation */ + msr tcr_el2, x2 + isb + /* Set the system control register for EL2 */ + msr sctlr_el2, x3 + /* Set the Stage 2 translation control register */ + msr vtcr_el2, x4 + /* Return success */ + mov x0, #0 + /* MMU is up and running */ + ERET +LEND(handle_hyp_init) + +.macro do_world_switch_to_host + save_guest_registers + restore_host_registers + + /* Restore host VTTBR */ + mov x9, #VTTBR_HOST + msr vttbr_el2, x9 +.endm + + +.macro handle_el2_excp type + /* Save registers before modifying so we can restore them */ + str x9, [sp, #-16]! 
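	/*
	 * Descriptive note: the 16-byte pre-decrement keeps SP 16-byte
	 * aligned, as the architecture requires for SP-relative accesses when
	 * stack alignment checking is enabled. The test that follows relies
	 * on handle_hyp_init having installed VTTBR_HOST in VTTBR_EL2 for the
	 * host; any other value means the exception was taken while a guest
	 * (running with its own stage 2 VTTBR) was active.
	 */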
+ + /* Test if the exception happened when the host was running */ + mrs x9, vttbr_el2 + cmp x9, #VTTBR_HOST + beq 1f + + /* We got the exception while the guest was running */ + ldr x9, [sp], #16 + do_world_switch_to_host + mov x0, \type + ret + +1: + /* We got the exception while the host was running */ + ldr x9, [sp], #16 + mov x0, \type + ERET +.endm + + +LENTRY(handle_el2_el2h_sync) + handle_el2_excp #EXCP_TYPE_EL2_SYNC +LEND(handle_el2_el2h_sync) + +LENTRY(handle_el2_el2h_irq) + handle_el2_excp #EXCP_TYPE_EL2_IRQ +LEND(handle_el2_el2h_irq) + +LENTRY(handle_el2_el2h_fiq) + handle_el2_excp #EXCP_TYPE_EL2_FIQ +LEND(handle_el2_el2h_fiq) + +LENTRY(handle_el2_el2h_error) + handle_el2_excp #EXCP_TYPE_EL2_ERROR +LEND(handle_el2_el2h_error) + + +LENTRY(handle_el2_el1_sync64) + /* Save registers before modifying so we can restore them */ + str x9, [sp, #-16]! + + /* Check for host hypervisor call */ + mrs x9, vttbr_el2 + cmp x9, #VTTBR_HOST + ldr x9, [sp], #16 /* Restore the temp register */ + bne 1f + + /* + * Called from the host + */ + + /* Check if this is a cleanup call and handle in a controlled state */ + cmp x0, #(HYP_CLEANUP) + b.eq vmm_cleanup + + str lr, [sp, #-16]! + bl vmm_hyp_enter + ldr lr, [sp], #16 + ERET + +1: /* Guest exception taken to EL2 */ + do_world_switch_to_host + mov x0, #EXCP_TYPE_EL1_SYNC + ret +LEND(handle_el2_el1_sync64) + +/* + * We only trap IRQ, FIQ and SError exceptions when a guest is running. Do a + * world switch to host to handle these exceptions. + */ + +LENTRY(handle_el2_el1_irq64) + do_world_switch_to_host + str x9, [sp, #-16]! + mrs x9, ich_misr_el2 + cmp x9, xzr + beq 1f + mov x0, #EXCP_TYPE_MAINT_IRQ + b 2f +1: + mov x0, #EXCP_TYPE_EL1_IRQ +2: + ldr x9, [sp], #16 + ret +LEND(handle_el2_el1_irq) + +LENTRY(handle_el2_el1_fiq64) + do_world_switch_to_host + mov x0, #EXCP_TYPE_EL1_FIQ + ret +LEND(handle_el2_el1_fiq64) + +LENTRY(handle_el2_el1_error64) + do_world_switch_to_host + mov x0, #EXCP_TYPE_EL1_ERROR + ret +LEND(handle_el2_el1_error64) + + +/* + * Usage: + * uint64_t vmm_enter_guest(struct hypctx *hypctx) + * + * Expecting: + * x0 - hypctx address + */ +ENTRY(vmm_enter_guest) + /* Save hypctx address */ + msr tpidr_el2, x0 + + save_host_registers + restore_guest_registers + + /* Enter guest */ + ERET +END(vmm_enter_guest) + +/* + * Usage: + * void vmm_cleanup(uint64_t handle, void *hyp_stub_vectors) + * + * Expecting: + * x1 - physical address of hyp_stub_vectors + */ +LENTRY(vmm_cleanup) + /* Restore the stub vectors */ + msr vbar_el2, x1 + + /* Disable the MMU */ + dsb sy + mrs x2, sctlr_el2 + bic x2, x2, #SCTLR_EL2_M + msr sctlr_el2, x2 + isb + + ERET +LEND(vmm_cleanup) diff --git a/sys/arm64/vmm/vmm_instruction_emul.c b/sys/arm64/vmm/vmm_instruction_emul.c new file mode 100644 index 000000000000..3d3326d6eda5 --- /dev/null +++ b/sys/arm64/vmm/vmm_instruction_emul.c @@ -0,0 +1,102 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef _KERNEL +#include <sys/param.h> +#include <sys/pcpu.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <vm/vm.h> + +#include <machine/machdep.h> +#include <machine/vmm.h> +#else +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/_iovec.h> + +#include <machine/vmm.h> + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <vmmapi.h> +#endif + +#include <machine/vmm_instruction_emul.h> + +int +vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging __unused, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + uint64_t val; + int error; + + if (vie->dir == VM_DIR_READ) { + error = memread(vcpu, gpa, &val, vie->access_size, memarg); + if (error) + goto out; + error = vm_set_register(vcpu, vie->reg, val); + } else { + error = vm_get_register(vcpu, vie->reg, &val); + if (error) + goto out; + /* Mask any unneeded bits from the register */ + if (vie->access_size < 8) + val &= (1ul << (vie->access_size * 8)) - 1; + error = memwrite(vcpu, gpa, val, vie->access_size, memarg); + } + +out: + return (error); +} + +int +vmm_emulate_register(struct vcpu *vcpu, struct vre *vre, reg_read_t regread, + reg_write_t regwrite, void *regarg) +{ + uint64_t val; + int error; + + if (vre->dir == VM_DIR_READ) { + error = regread(vcpu, &val, regarg); + if (error) + goto out; + error = vm_set_register(vcpu, vre->reg, val); + } else { + error = vm_get_register(vcpu, vre->reg, &val); + if (error) + goto out; + error = regwrite(vcpu, val, regarg); + } + +out: + return (error); +} diff --git a/sys/arm64/vmm/vmm_ktr.h b/sys/arm64/vmm/vmm_ktr.h new file mode 100644 index 000000000000..965f440ae874 --- /dev/null +++ b/sys/arm64/vmm/vmm_ktr.h @@ -0,0 +1,69 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include <sys/ktr.h> +#include <sys/pcpu.h> + +#ifndef KTR_VMM +#define KTR_VMM KTR_GEN +#endif + +#define VCPU_CTR0(vm, vcpuid, format) \ +CTR2(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid)) + +#define VCPU_CTR1(vm, vcpuid, format, p1) \ +CTR3(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1)) + +#define VCPU_CTR2(vm, vcpuid, format, p1, p2) \ +CTR4(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2)) + +#define VCPU_CTR3(vm, vcpuid, format, p1, p2, p3) \ +CTR5(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2), (p3)) + +#define VCPU_CTR4(vm, vcpuid, format, p1, p2, p3, p4) \ +CTR6(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), \ + (p1), (p2), (p3), (p4)) + +#define VM_CTR0(vm, format) \ +CTR1(KTR_VMM, "vm %s: " format, vm_name((vm))) + +#define VM_CTR1(vm, format, p1) \ +CTR2(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1)) + +#define VM_CTR2(vm, format, p1, p2) \ +CTR3(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2)) + +#define VM_CTR3(vm, format, p1, p2, p3) \ +CTR4(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3)) + +#define VM_CTR4(vm, format, p1, p2, p3, p4) \ +CTR5(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3), (p4)) +#endif diff --git a/sys/arm64/vmm/vmm_mmu.c b/sys/arm64/vmm/vmm_mmu.c new file mode 100644 index 000000000000..1f2d248a743b --- /dev/null +++ b/sys/arm64/vmm/vmm_mmu.c @@ -0,0 +1,430 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2017 Alexandru Elisei <alexandru.elisei@gmail.com> + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> + +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_page.h> +#include <vm/vm_param.h> +#include <vm/vm_phys.h> + +#include <machine/atomic.h> +#include <machine/machdep.h> +#include <machine/vm.h> +#include <machine/vmm.h> +#include <machine/vmparam.h> + +#include "mmu.h" +#include "arm64.h" + +static struct mtx vmmpmap_mtx; +static pt_entry_t *l0; +static vm_paddr_t l0_paddr; + +bool +vmmpmap_init(void) +{ + vm_page_t m; + + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + return (false); + + l0_paddr = VM_PAGE_TO_PHYS(m); + l0 = (pd_entry_t *)PHYS_TO_DMAP(l0_paddr); + + mtx_init(&vmmpmap_mtx, "vmm pmap", NULL, MTX_DEF); + + return (true); +} + +static void +vmmpmap_release_l3(pd_entry_t l2e) +{ + pt_entry_t *l3 __diagused; + vm_page_t m; + int i; + + l3 = (pd_entry_t *)PHYS_TO_DMAP(l2e & ~ATTR_MASK); + for (i = 0; i < Ln_ENTRIES; i++) { + KASSERT(l3[i] == 0, ("%s: l3 still mapped: %p %lx", __func__, + &l3[i], l3[i])); + } + + m = PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK); + vm_page_unwire_noq(m); + vm_page_free(m); +} + +static void +vmmpmap_release_l2(pd_entry_t l1e) +{ + pt_entry_t *l2; + vm_page_t m; + int i; + + l2 = (pd_entry_t *)PHYS_TO_DMAP(l1e & ~ATTR_MASK); + for (i = 0; i < Ln_ENTRIES; i++) { + if (l2[i] != 0) { + vmmpmap_release_l3(l2[i]); + } + } + + m = PHYS_TO_VM_PAGE(l1e & ~ATTR_MASK); + vm_page_unwire_noq(m); + vm_page_free(m); +} + +static void +vmmpmap_release_l1(pd_entry_t l0e) +{ + pt_entry_t *l1; + vm_page_t m; + int i; + + l1 = (pd_entry_t *)PHYS_TO_DMAP(l0e & ~ATTR_MASK); + for (i = 0; i < Ln_ENTRIES; i++) { + if (l1[i] != 0) { + vmmpmap_release_l2(l1[i]); + } + } + + m = PHYS_TO_VM_PAGE(l0e & ~ATTR_MASK); + vm_page_unwire_noq(m); + vm_page_free(m); +} + +void +vmmpmap_fini(void) +{ + vm_page_t m; + int i; + + /* Remove the remaining entries */ + for (i = 0; i < L0_ENTRIES; i++) { + if (l0[i] != 0) { + vmmpmap_release_l1(l0[i]); + } + } + + m = PHYS_TO_VM_PAGE(l0_paddr); + vm_page_unwire_noq(m); + vm_page_free(m); + + mtx_destroy(&vmmpmap_mtx); +} + +uint64_t +vmmpmap_to_ttbr0(void) +{ + + return (l0_paddr); +} + +/* Returns a pointer to the level 1 table, allocating if needed. 
*/ +static pt_entry_t * +vmmpmap_l1_table(vm_offset_t va) +{ + pt_entry_t new_l0e, l0e, *l1; + vm_page_t m; + int rv; + + m = NULL; +again: + l0e = atomic_load_64(&l0[pmap_l0_index(va)]); + if ((l0e & ATTR_DESCR_VALID) == 0) { + /* Allocate a page for the level 1 table */ + if (m == NULL) { + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + return (NULL); + } + + new_l0e = VM_PAGE_TO_PHYS(m) | L0_TABLE; + + mtx_lock(&vmmpmap_mtx); + rv = atomic_cmpset_64(&l0[pmap_l0_index(va)], l0e, new_l0e); + mtx_unlock(&vmmpmap_mtx); + /* We may have raced another thread, try again */ + if (rv == 0) + goto again; + + /* The cmpset succeeded */ + l0e = new_l0e; + } else if (m != NULL) { + /* We allocated a page that wasn't used */ + vm_page_unwire_noq(m); + vm_page_free_zero(m); + } + + l1 = (pd_entry_t *)PHYS_TO_DMAP(l0e & ~ATTR_MASK); + return (l1); +} + +static pt_entry_t * +vmmpmap_l2_table(vm_offset_t va) +{ + pt_entry_t new_l1e, l1e, *l1, *l2; + vm_page_t m; + int rv; + + l1 = vmmpmap_l1_table(va); + if (l1 == NULL) + return (NULL); + + m = NULL; +again: + l1e = atomic_load_64(&l1[pmap_l1_index(va)]); + if ((l1e & ATTR_DESCR_VALID) == 0) { + /* Allocate a page for the level 2 table */ + if (m == NULL) { + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + return (NULL); + } + + new_l1e = VM_PAGE_TO_PHYS(m) | L1_TABLE; + + mtx_lock(&vmmpmap_mtx); + rv = atomic_cmpset_64(&l1[pmap_l1_index(va)], l1e, new_l1e); + mtx_unlock(&vmmpmap_mtx); + /* We may have raced another thread, try again */ + if (rv == 0) + goto again; + + /* The cmpset succeeded */ + l1e = new_l1e; + } else if (m != NULL) { + /* We allocated a page that wasn't used */ + vm_page_unwire_noq(m); + vm_page_free_zero(m); + } + + l2 = (pd_entry_t *)PHYS_TO_DMAP(l1e & ~ATTR_MASK); + return (l2); +} + +static pd_entry_t * +vmmpmap_l3_table(vm_offset_t va) +{ + pt_entry_t new_l2e, l2e, *l2, *l3; + vm_page_t m; + int rv; + + l2 = vmmpmap_l2_table(va); + if (l2 == NULL) + return (NULL); + + m = NULL; +again: + l2e = atomic_load_64(&l2[pmap_l2_index(va)]); + if ((l2e & ATTR_DESCR_VALID) == 0) { + /* Allocate a page for the level 3 table */ + if (m == NULL) { + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + return (NULL); + } + + new_l2e = VM_PAGE_TO_PHYS(m) | L2_TABLE; + + mtx_lock(&vmmpmap_mtx); + rv = atomic_cmpset_64(&l2[pmap_l2_index(va)], l2e, new_l2e); + mtx_unlock(&vmmpmap_mtx); + /* We may have raced another thread, try again */ + if (rv == 0) + goto again; + + /* The cmpset succeeded */ + l2e = new_l2e; + } else if (m != NULL) { + /* We allocated a page that wasn't used */ + vm_page_unwire_noq(m); + vm_page_free_zero(m); + } + + l3 = (pt_entry_t *)PHYS_TO_DMAP(l2e & ~ATTR_MASK); + return (l3); +} + +/* + * Creates an EL2 entry in the hyp_pmap. Similar to pmap_kenter. + */ +bool +vmmpmap_enter(vm_offset_t va, vm_size_t size, vm_paddr_t pa, vm_prot_t prot) +{ + pd_entry_t l3e, *l3; + + KASSERT((pa & L3_OFFSET) == 0, + ("%s: Invalid physical address", __func__)); + KASSERT((va & L3_OFFSET) == 0, + ("%s: Invalid virtual address", __func__)); + KASSERT((size & PAGE_MASK) == 0, + ("%s: Mapping is not page-sized", __func__)); + + l3e = ATTR_DEFAULT | L3_PAGE; + /* This bit is res1 at EL2 */ + l3e |= ATTR_S1_AP(ATTR_S1_AP_USER); + /* Only normal memory is used at EL2 */ + l3e |= ATTR_S1_IDX(VM_MEMATTR_DEFAULT); + + if ((prot & VM_PROT_EXECUTE) == 0) { + /* PXN is res0 at EL2. 
UXN is XN */ + l3e |= ATTR_S1_UXN; + } + if ((prot & VM_PROT_WRITE) == 0) { + l3e |= ATTR_S1_AP(ATTR_S1_AP_RO); + } + + while (size > 0) { + l3 = vmmpmap_l3_table(va); + if (l3 == NULL) + return (false); + +#ifdef INVARIANTS + /* + * Ensure no other threads can write to l3 between the KASSERT + * and store. + */ + mtx_lock(&vmmpmap_mtx); +#endif + KASSERT(atomic_load_64(&l3[pmap_l3_index(va)]) == 0, + ("%s: VA already mapped", __func__)); + + atomic_store_64(&l3[pmap_l3_index(va)], l3e | pa); +#ifdef INVARIANTS + mtx_unlock(&vmmpmap_mtx); +#endif + + size -= PAGE_SIZE; + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + + return (true); +} + +void +vmmpmap_remove(vm_offset_t va, vm_size_t size, bool invalidate) +{ + pt_entry_t l0e, *l1, l1e, *l2, l2e; + pd_entry_t *l3, l3e, **l3_list; + vm_offset_t eva, va_next, sva; + size_t i; + + KASSERT((va & L3_OFFSET) == 0, + ("%s: Invalid virtual address", __func__)); + KASSERT((size & PAGE_MASK) == 0, + ("%s: Mapping is not page-sized", __func__)); + + if (invalidate) { + l3_list = malloc((size / PAGE_SIZE) * sizeof(l3_list[0]), + M_TEMP, M_WAITOK | M_ZERO); + } + + sva = va; + eva = va + size; + mtx_lock(&vmmpmap_mtx); + for (i = 0; va < eva; va = va_next) { + l0e = atomic_load_64(&l0[pmap_l0_index(va)]); + if (l0e == 0) { + va_next = (va + L0_SIZE) & ~L0_OFFSET; + if (va_next < va) + va_next = eva; + continue; + } + MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE); + + l1 = (pd_entry_t *)PHYS_TO_DMAP(l0e & ~ATTR_MASK); + l1e = atomic_load_64(&l1[pmap_l1_index(va)]); + if (l1e == 0) { + va_next = (va + L1_SIZE) & ~L1_OFFSET; + if (va_next < va) + va_next = eva; + continue; + } + MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE); + + l2 = (pd_entry_t *)PHYS_TO_DMAP(l1e & ~ATTR_MASK); + l2e = atomic_load_64(&l2[pmap_l2_index(va)]); + if (l2e == 0) { + va_next = (va + L2_SIZE) & ~L2_OFFSET; + if (va_next < va) + va_next = eva; + continue; + } + MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE); + + l3 = (pd_entry_t *)PHYS_TO_DMAP(l2e & ~ATTR_MASK); + if (invalidate) { + l3e = atomic_load_64(&l3[pmap_l3_index(va)]); + MPASS(l3e != 0); + /* + * Mark memory as read-only so we can invalidate + * the cache. + */ + l3e &= ~ATTR_S1_AP_MASK; + l3e |= ATTR_S1_AP(ATTR_S1_AP_RO); + atomic_store_64(&l3[pmap_l3_index(va)], l3e); + + l3_list[i] = &l3[pmap_l3_index(va)]; + i++; + } else { + /* + * The caller is responsible for clearing the cache & + * handling the TLB + */ + atomic_store_64(&l3[pmap_l3_index(va)], 0); + } + + va_next = (va + L3_SIZE) & ~L3_OFFSET; + if (va_next < va) + va_next = eva; + } + mtx_unlock(&vmmpmap_mtx); + + if (invalidate) { + /* Invalidate the memory from the D-cache */ + vmm_call_hyp(HYP_DC_CIVAC, sva, size); + + for (i = 0; i < (size / PAGE_SIZE); i++) { + atomic_store_64(l3_list[i], 0); + } + + vmm_call_hyp(HYP_EL2_TLBI, HYP_EL2_TLBI_VA, sva, size); + + free(l3_list, M_TEMP); + } +} diff --git a/sys/arm64/vmm/vmm_reset.c b/sys/arm64/vmm/vmm_reset.c new file mode 100644 index 000000000000..a929a60c9474 --- /dev/null +++ b/sys/arm64/vmm/vmm_reset.c @@ -0,0 +1,177 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2018 Alexandru Elisei <alexandru.elisei@gmail.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> + +#include <machine/armreg.h> +#include <machine/cpu.h> +#include <machine/hypervisor.h> + +#include "arm64.h" +#include "reset.h" + +/* + * Make the architecturally UNKNOWN value 0. As a bonus, we don't have to + * manually set all those RES0 fields. + */ +#define ARCH_UNKNOWN 0 +#define set_arch_unknown(reg) (memset(&(reg), ARCH_UNKNOWN, sizeof(reg))) + +void +reset_vm_el01_regs(void *vcpu) +{ + struct hypctx *el2ctx; + + el2ctx = vcpu; + + set_arch_unknown(el2ctx->tf); + + set_arch_unknown(el2ctx->actlr_el1); + set_arch_unknown(el2ctx->afsr0_el1); + set_arch_unknown(el2ctx->afsr1_el1); + set_arch_unknown(el2ctx->amair_el1); + set_arch_unknown(el2ctx->contextidr_el1); + set_arch_unknown(el2ctx->cpacr_el1); + set_arch_unknown(el2ctx->csselr_el1); + set_arch_unknown(el2ctx->elr_el1); + set_arch_unknown(el2ctx->esr_el1); + set_arch_unknown(el2ctx->far_el1); + set_arch_unknown(el2ctx->mair_el1); + set_arch_unknown(el2ctx->mdccint_el1); + set_arch_unknown(el2ctx->mdscr_el1); + set_arch_unknown(el2ctx->par_el1); + + /* + * Guest starts with: + * ~SCTLR_M: MMU off + * ~SCTLR_C: data cache off + * SCTLR_CP15BEN: memory barrier instruction enable from EL0; RAO/WI + * ~SCTLR_I: instruction cache off + */ + el2ctx->sctlr_el1 = SCTLR_RES1; + el2ctx->sctlr_el1 &= ~SCTLR_M & ~SCTLR_C & ~SCTLR_I; + el2ctx->sctlr_el1 |= SCTLR_CP15BEN; + + set_arch_unknown(el2ctx->sp_el0); + set_arch_unknown(el2ctx->tcr_el1); + set_arch_unknown(el2ctx->tpidr_el0); + set_arch_unknown(el2ctx->tpidr_el1); + set_arch_unknown(el2ctx->tpidrro_el0); + set_arch_unknown(el2ctx->ttbr0_el1); + set_arch_unknown(el2ctx->ttbr1_el1); + set_arch_unknown(el2ctx->vbar_el1); + set_arch_unknown(el2ctx->spsr_el1); + + set_arch_unknown(el2ctx->dbgbcr_el1); + set_arch_unknown(el2ctx->dbgbvr_el1); + set_arch_unknown(el2ctx->dbgwcr_el1); + set_arch_unknown(el2ctx->dbgwvr_el1); + + el2ctx->pmcr_el0 = READ_SPECIALREG(pmcr_el0) & PMCR_N_MASK; + /* PMCR_LC is unknown when AArch32 is supported or RES1 otherwise */ + el2ctx->pmcr_el0 |= PMCR_LC; + set_arch_unknown(el2ctx->pmccntr_el0); + set_arch_unknown(el2ctx->pmccfiltr_el0); + set_arch_unknown(el2ctx->pmcntenset_el0); + set_arch_unknown(el2ctx->pmintenset_el1); + set_arch_unknown(el2ctx->pmovsset_el0); + set_arch_unknown(el2ctx->pmuserenr_el0); + memset(el2ctx->pmevcntr_el0, 0, sizeof(el2ctx->pmevcntr_el0)); + memset(el2ctx->pmevtyper_el0, 0, sizeof(el2ctx->pmevtyper_el0)); +} + +void 
+reset_vm_el2_regs(void *vcpu)
+{
+	struct hypctx *el2ctx;
+	uint64_t cpu_aff, vcpuid;
+
+	el2ctx = vcpu;
+	vcpuid = vcpu_vcpuid(el2ctx->vcpu);
+
+	/*
+	 * Set the Hypervisor Configuration Register:
+	 *
+	 * HCR_RW: use AArch64 for EL1
+	 * HCR_TID3: handle ID registers in the vmm to provide a common
+	 * set of features on all vcpus
+	 * HCR_TWI: Trap WFI to the hypervisor
+	 * HCR_BSU_IS: barrier instructions apply to the inner shareable
+	 * domain
+	 * HCR_FB: broadcast maintenance operations
+	 * HCR_AMO: route physical SError interrupts to EL2
+	 * HCR_IMO: route physical IRQ interrupts to EL2
+	 * HCR_FMO: route physical FIQ interrupts to EL2
+	 * HCR_SWIO: turn set/way invalidate into set/way clean and
+	 * invalidate
+	 * HCR_VM: use stage 2 translation
+	 */
+	el2ctx->hcr_el2 = HCR_RW | HCR_TID3 | HCR_TWI | HCR_BSU_IS | HCR_FB |
+	    HCR_AMO | HCR_IMO | HCR_FMO | HCR_SWIO | HCR_VM;
+
+	/* TODO: Trap all extensions we don't support */
+	el2ctx->mdcr_el2 = 0;
+	/* PMCR_EL0.N is read from MDCR_EL2.HPMN */
+	el2ctx->mdcr_el2 |= (el2ctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT;
+
+	el2ctx->vmpidr_el2 = VMPIDR_EL2_RES1;
+	/* The guest will detect a multi-core, single-threaded CPU */
+	el2ctx->vmpidr_el2 &= ~VMPIDR_EL2_U & ~VMPIDR_EL2_MT;
+	/*
+	 * Generate the guest MPIDR value. We only support 16 CPUs at affinity
+	 * level 0 to simplify the vgicv3 driver (see writing sgi1r_el1).
+	 */
+	cpu_aff = (vcpuid & 0xf) << MPIDR_AFF0_SHIFT |
+	    ((vcpuid >> 4) & 0xff) << MPIDR_AFF1_SHIFT |
+	    ((vcpuid >> 12) & 0xff) << MPIDR_AFF2_SHIFT |
+	    ((vcpuid >> 20) & 0xff) << MPIDR_AFF3_SHIFT;
+	el2ctx->vmpidr_el2 |= cpu_aff;
+
+	/* Use the same CPU identification information as the host */
+	el2ctx->vpidr_el2 = CPU_IMPL_TO_MIDR(CPU_IMPL_ARM);
+	el2ctx->vpidr_el2 |= CPU_VAR_TO_MIDR(0);
+	el2ctx->vpidr_el2 |= CPU_ARCH_TO_MIDR(0xf);
+	el2ctx->vpidr_el2 |= CPU_PART_TO_MIDR(CPU_PART_FOUNDATION);
+	el2ctx->vpidr_el2 |= CPU_REV_TO_MIDR(0);
+
+	/*
+	 * Don't trap accesses to CPACR_EL1, trace, SVE, Advanced SIMD
+	 * and floating point functionality to EL2.
+	 */
+	el2ctx->cptr_el2 = CPTR_RES1;
+	/*
+	 * Disable interrupts in the guest. The guest OS will re-enable
+	 * them.
+	 */
+	el2ctx->tf.tf_spsr = PSR_D | PSR_A | PSR_I | PSR_F;
+	/* Use the EL1 stack when taking exceptions to EL1 */
+	el2ctx->tf.tf_spsr |= PSR_M_EL1h;
+}
diff --git a/sys/arm64/vmm/vmm_stat.c b/sys/arm64/vmm/vmm_stat.c
new file mode 100644
index 000000000000..858ce980843a
--- /dev/null
+++ b/sys/arm64/vmm/vmm_stat.c
@@ -0,0 +1,165 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <machine/machdep.h> +#include <machine/vmm.h> +#include "vmm_stat.h" + +/* + * 'vst_num_elems' is the total number of addressable statistic elements + * 'vst_num_types' is the number of unique statistic types + * + * It is always true that 'vst_num_elems' is greater than or equal to + * 'vst_num_types'. This is because a stat type may represent more than + * one element (for e.g. VMM_STAT_ARRAY). + */ +static int vst_num_elems, vst_num_types; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +#define vst_size ((size_t)vst_num_elems * sizeof(uint64_t)) + +void +vmm_stat_register(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vst_num_elems + vst->nelems >= MAX_VMM_STAT_ELEMS) { + printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vst_num_elems; + vst_num_elems += vst->nelems; + + vsttab[vst_num_types++] = vst; +} + +int +vmm_stat_copy(struct vcpu *vcpu, int index, int count, int *num_stats, + uint64_t *buf) +{ + struct vmm_stat_type *vst; + uint64_t *stats; + int i, tocopy; + + if (index < 0 || count < 0) + return (EINVAL); + + if (index > vst_num_elems) + return (ENOENT); + + if (index == vst_num_elems) { + *num_stats = 0; + return (0); + } + + tocopy = min(vst_num_elems - index, count); + + /* Let stats functions update their counters */ + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (vst->func != NULL) + (*vst->func)(vcpu, vst); + } + + /* Copy over the stats */ + stats = vcpu_stats(vcpu); + memcpy(buf, stats + index, tocopy * sizeof(stats[0])); + *num_stats = tocopy; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + + return (malloc(vst_size, M_VMM_STAT, M_WAITOK)); +} + +void +vmm_stat_init(void *vp) +{ + + bzero(vp, vst_size); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +int +vmm_stat_desc_copy(int index, char *buf, int bufsize) +{ + int i; + struct vmm_stat_type *vst; + + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (index >= vst->index && index < vst->index + vst->nelems) { + if (vst->nelems > 1) { + snprintf(buf, bufsize, "%s[%d]", + vst->desc, index - vst->index); + } else { + strlcpy(buf, vst->desc, bufsize); + } + return (0); /* found it */ + } + } + + return (EINVAL); +} + +/* global statistics */ +VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); +VMM_STAT(VMEXIT_UNKNOWN, "number of vmexits for the unknown exception"); +VMM_STAT(VMEXIT_WFI, "number of times wfi was intercepted"); +VMM_STAT(VMEXIT_WFE, "number of times wfe was intercepted"); +VMM_STAT(VMEXIT_HVC, "number of times hvc was intercepted"); +VMM_STAT(VMEXIT_MSR, "number of times msr/mrs was intercepted"); +VMM_STAT(VMEXIT_DATA_ABORT, "number of vmexits for a 
data abort"); +VMM_STAT(VMEXIT_INSN_ABORT, "number of vmexits for an instruction abort"); +VMM_STAT(VMEXIT_UNHANDLED_SYNC, "number of vmexits for an unhandled synchronous exception"); +VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq"); +VMM_STAT(VMEXIT_FIQ, "number of vmexits for an interrupt"); +VMM_STAT(VMEXIT_UNHANDLED_EL2, "number of vmexits for an unhandled EL2 exception"); +VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception"); diff --git a/sys/arm64/vmm/vmm_stat.h b/sys/arm64/vmm/vmm_stat.h new file mode 100644 index 000000000000..b0a06ef79253 --- /dev/null +++ b/sys/arm64/vmm/vmm_stat.h @@ -0,0 +1,145 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +struct vm; + +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ + +enum vmm_stat_scope { + VMM_STAT_SCOPE_ANY, +}; + +struct vmm_stat_type; +typedef void (*vmm_stat_func_t)(struct vcpu *vcpu, + struct vmm_stat_type *stat); + +struct vmm_stat_type { + int index; /* position in the stats buffer */ + int nelems; /* standalone or array */ + const char *desc; /* description of statistic */ + vmm_stat_func_t func; + enum vmm_stat_scope scope; +}; + +void vmm_stat_register(void *arg); + +#define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \ + struct vmm_stat_type type[1] = { \ + { -1, nelems, desc, func, scope } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type) + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope) + +#define VMM_STAT_DECLARE(type) \ + extern struct vmm_stat_type type[1] + +#define VMM_STAT(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY) + +#define VMM_STAT_FUNC(type, desc, func) \ + VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY) + +#define VMM_STAT_ARRAY(type, nelems, desc) \ + VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) + +void *vmm_stat_alloc(void); +void vmm_stat_init(void *vp); +void vmm_stat_free(void *vp); + +int vmm_stat_copy(struct vcpu *vcpu, int index, int count, + int *num_stats, uint64_t *buf); +int vmm_stat_desc_copy(int index, char *buf, int buflen); + +static void __inline +vmm_stat_array_incr(struct vcpu *vcpu, struct vmm_stat_type *vst, int statidx, + uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] += x; +#endif +} + +static void __inline +vmm_stat_array_set(struct vcpu *vcpu, struct vmm_stat_type *vst, int statidx, + uint64_t val) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] = val; +#endif +} + +static void __inline +vmm_stat_incr(struct vcpu *vcpu, struct vmm_stat_type *vst, uint64_t x) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_incr(vcpu, vst, 0, x); +#endif +} + +static void __inline +vmm_stat_set(struct vcpu *vcpu, struct vmm_stat_type *vst, uint64_t val) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_set(vcpu, vst, 0, val); +#endif +} + +VMM_STAT_DECLARE(VMEXIT_COUNT); +VMM_STAT_DECLARE(VMEXIT_UNKNOWN); +VMM_STAT_DECLARE(VMEXIT_WFI); +VMM_STAT_DECLARE(VMEXIT_WFE); +VMM_STAT_DECLARE(VMEXIT_HVC); +VMM_STAT_DECLARE(VMEXIT_MSR); +VMM_STAT_DECLARE(VMEXIT_DATA_ABORT); +VMM_STAT_DECLARE(VMEXIT_INSN_ABORT); +VMM_STAT_DECLARE(VMEXIT_UNHANDLED_SYNC); +VMM_STAT_DECLARE(VMEXIT_IRQ); +VMM_STAT_DECLARE(VMEXIT_FIQ); +VMM_STAT_DECLARE(VMEXIT_UNHANDLED_EL2); +VMM_STAT_DECLARE(VMEXIT_UNHANDLED); +#endif diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 625684a91534..cc217b3ce78f 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -116,6 +116,39 @@ arm64/iommu/smmu_quirks.c optional iommu dev/iommu/busdma_iommu.c optional iommu dev/iommu/iommu_gas.c optional iommu +arm64/vmm/vmm.c optional vmm +arm64/vmm/vmm_dev.c optional vmm +arm64/vmm/vmm_instruction_emul.c optional vmm +arm64/vmm/vmm_stat.c optional vmm +arm64/vmm/vmm_arm64.c optional vmm +arm64/vmm/vmm_reset.c optional vmm +arm64/vmm/vmm_call.S optional vmm +arm64/vmm/vmm_hyp_exception.S optional vmm \ + compile-with "${NORMAL_C:N-fsanitize*:N-mbranch-protection*} -fpie" \ + no-obj 
+arm64/vmm/vmm_hyp.c optional vmm \ + compile-with "${NORMAL_C:N-fsanitize*:N-mbranch-protection*} -fpie" \ + no-obj +vmm_hyp_blob.elf.full optional vmm \ + dependency "vmm_hyp.o vmm_hyp_exception.o" \ + compile-with "${SYSTEM_LD_BASECMD} -o ${.TARGET} ${.ALLSRC} --defsym=text_start='0x0'" \ + no-obj no-implicit-rule +vmm_hyp_blob.elf optional vmm \ + dependency "vmm_hyp_blob.elf.full" \ + compile-with "${OBJCOPY} --strip-debug ${.ALLSRC} ${.TARGET}" \ + no-obj no-implicit-rule +vmm_hyp_blob.bin optional vmm \ + dependency vmm_hyp_blob.elf \ + compile-with "${OBJCOPY} --output-target=binary ${.ALLSRC} ${.TARGET}" \ + no-obj no-implicit-rule +arm64/vmm/vmm_hyp_el2.S optional vmm \ + dependency vmm_hyp_blob.bin +arm64/vmm/vmm_mmu.c optional vmm +arm64/vmm/io/vgic.c optional vmm +arm64/vmm/io/vgic_v3.c optional vmm +arm64/vmm/io/vgic_if.m optional vmm +arm64/vmm/io/vtimer.c optional vmm + crypto/armv8/armv8_crypto.c optional armv8crypto armv8_crypto_wrap.o optional armv8crypto \ dependency "$S/crypto/armv8/armv8_crypto_wrap.c" \ diff --git a/sys/conf/ldscript.arm64 b/sys/conf/ldscript.arm64 index 185b4485e07a..afb2687f0368 100644 --- a/sys/conf/ldscript.arm64 +++ b/sys/conf/ldscript.arm64 @@ -6,6 +6,7 @@ SECTIONS { /* Read-only sections, merged into text segment: */ . = text_start; /* This is set using --defsym= on the command line. */ + .vmm_vectors : { *(.vmm_vectors) } .text : { *(.text) @@ -16,6 +17,7 @@ SECTIONS } =0x9090 _etext = .; PROVIDE (etext = .); + .fini : { *(.fini) } =0x9090 .rodata : { *(.rodata*) *(.gnu.linkonce.r*) } .rodata1 : { *(.rodata1) } diff --git a/sys/conf/options.arm64 b/sys/conf/options.arm64 index ab4b01b2e38a..e36f856ecb04 100644 --- a/sys/conf/options.arm64 +++ b/sys/conf/options.arm64 @@ -19,6 +19,9 @@ EMUL_SWP opt_global.h # EFI Runtime services support EFIRT opt_efirt.h +# Bhyve +VMM opt_global.h + # SoC Support SOC_ALLWINNER_A64 opt_soc.h SOC_ALLWINNER_H5 opt_soc.h diff --git a/sys/modules/Makefile b/sys/modules/Makefile index 606ab4cb0536..dcd9e25b1cb3 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -841,7 +841,9 @@ _sgx= sgx _sgx_linux= sgx_linux _smartpqi= smartpqi _p2sb= p2sb +.endif +.if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" .if ${MK_BHYVE} != "no" || defined(ALL_MODULES) .if ${KERN_OPTS:MSMP} _vmm= vmm diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile index a950c5a82d13..a67797276bae 100644 --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -3,31 +3,79 @@ KMOD= vmm -SRCS= opt_acpi.h opt_bhyve_snapshot.h opt_ddb.h -SRCS+= device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h vnode_if.h -DPSRCS+= vmx_assym.h svm_assym.h -DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc +SRCS= opt_acpi.h opt_ddb.h device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h CFLAGS+= -DVMM_KEEP_STATS -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/io -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/intel -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/amd +CFLAGS+= -I${SRCTOP}/sys/${MACHINE}/vmm +CFLAGS+= -I${SRCTOP}/sys/${MACHINE}/vmm/io # generic vmm support -.PATH: ${SRCTOP}/sys/amd64/vmm +.PATH: ${SRCTOP}/sys/${MACHINE}/vmm SRCS+= vmm.c \ vmm_dev.c \ - vmm_host.c \ vmm_instruction_emul.c \ + vmm_stat.c + +.if ${MACHINE_CPUARCH} == "aarch64" +DPSRCS+= assym.inc + +# TODO: Add the new EL2 code +SRCS+= vmm_arm64.c \ + vmm_reset.c \ + vmm_call.S \ + vmm_mmu.c \ + vmm_hyp_el2.S + +.PATH: ${SRCTOP}/sys/${MACHINE}/vmm/io +SRCS+= vgic.c \ + vgic_if.h \ + vgic_if.c \ + vgic_v3.c \ + vtimer.c + +SRCS+= vmm_hyp_exception.S 
vmm_hyp.c + +CLEANFILES+= vmm_hyp_blob.elf.full +CLEANFILES+= vmm_hyp_blob.elf vmm_hyp_blob.bin + +vmm_hyp_exception.o: vmm_hyp_exception.S + ${CC} -c -x assembler-with-cpp -DLOCORE \ + ${CFLAGS:N-fsanitize*:N-mbranch-protection*} \ + ${.IMPSRC} -o ${.TARGET} -fpie + +vmm_hyp.o: vmm_hyp.c + ${CC} -c ${CFLAGS:N-fsanitize*:N-mbranch-protection*} \ + ${.IMPSRC} -o ${.TARGET} -fpie + +vmm_hyp_blob.elf.full: vmm_hyp_exception.o vmm_hyp.o + ${LD} -m ${LD_EMULATION} -Bdynamic -T ${SYSDIR}/conf/ldscript.arm64 \ + ${_LDFLAGS} --no-warn-mismatch --warn-common --export-dynamic \ + --dynamic-linker /red/herring -X -o ${.TARGET} ${.ALLSRC} \ + --defsym=text_start='0x0' + +vmm_hyp_blob.elf: vmm_hyp_blob.elf.full + ${OBJCOPY} --strip-debug ${.ALLSRC} ${.TARGET} + +vmm_hyp_blob.bin: vmm_hyp_blob.elf + ${OBJCOPY} --output-target=binary ${.ALLSRC} ${.TARGET} + +vmm_hyp_el2.o: vmm_hyp_blob.bin + +.elif ${MACHINE_CPUARCH} == "amd64" +DPSRCS+= vmx_assym.h svm_assym.h +DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc + +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/intel +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/amd + +SRCS+= vmm_host.c \ vmm_ioport.c \ vmm_lapic.c \ vmm_mem.c \ - vmm_stat.c \ vmm_util.c \ x86.c -.PATH: ${SRCTOP}/sys/amd64/vmm/io +.PATH: ${SRCTOP}/sys/${MACHINE}/vmm/io SRCS+= iommu.c \ ppt.c \ vatpic.c \ @@ -62,10 +110,11 @@ SRCS+= vmcb.c \ SRCS.BHYVE_SNAPSHOT= vmm_snapshot.c -CLEANFILES= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o +CLEANFILES+= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o OBJS_DEPEND_GUESS.vmx_support.o+= vmx_assym.h OBJS_DEPEND_GUESS.svm_support.o+= svm_assym.h +.endif vmx_assym.h: vmx_genassym.o sh ${SYSDIR}/kern/genassym.sh vmx_genassym.o > ${.TARGET} @@ -81,6 +130,9 @@ svm_support.o: ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ ${.IMPSRC} -o ${.TARGET} +hyp_genassym.o: offset.inc + ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} + vmx_genassym.o: offset.inc ${CC} -c ${CFLAGS:N-flto*:N-fno-common:N-fsanitize*:N-fno-sanitize*} \ -fcommon ${.IMPSRC} |
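
Usage note (not part of the commit): the vmm module Makefile above adds -DVMM_KEEP_STATS, which is what makes the inline helpers in vmm_stat.h actually update the per-vcpu buffer allocated by vmm_stat_alloc(); without that option they compile to nothing. The sketch below shows how a caller would declare a counter and bump existing ones from a trap handler, using only the macros and helpers introduced in this change. The stat VMEXIT_EXAMPLE and the function handle_wfx_trap are hypothetical names, used purely for illustration.

#include <sys/param.h>
#include <sys/systm.h>

#include <machine/vmm.h>

#include "vmm_stat.h"

/*
 * Hypothetical example stat. VMM_STAT() defines the vmm_stat_type and
 * emits a SYSINIT so that vmm_stat_register() assigns it a slot in the
 * per-vcpu stats buffer at boot or module load.
 */
VMM_STAT(VMEXIT_EXAMPLE, "number of example traps");

/* Hypothetical trap handler that bumps the WFI/WFE exit counters. */
static int
handle_wfx_trap(struct vcpu *vcpu, bool is_wfe)
{
	/* vmm_stat_incr() is a no-op unless VMM_KEEP_STATS is defined. */
	if (is_wfe)
		vmm_stat_incr(vcpu, VMEXIT_WFE, 1);
	else
		vmm_stat_incr(vcpu, VMEXIT_WFI, 1);
	vmm_stat_incr(vcpu, VMEXIT_EXAMPLE, 1);

	return (0);
}

The counters are later read back through vmm_stat_copy(), and vmm_stat_desc_copy() maps a flat index back to its description string, as implemented in vmm_stat.c above.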