aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDoug Rabson <dfr@FreeBSD.org>2009-03-11 15:30:12 +0000
committerDoug Rabson <dfr@FreeBSD.org>2009-03-11 15:30:12 +0000
commit126780243806a6b6cf345b9b548f68010e921aec (patch)
tree633dc40712503605071c68975d2c972638554000
parent802e54dc1f4ac7ebd3cc3f0091d2b825780a3c51 (diff)
downloadsrc-126780243806a6b6cf345b9b548f68010e921aec.tar.gz
src-126780243806a6b6cf345b9b548f68010e921aec.zip
Merge in support for Xen HVM on amd64 architecture.
Notes
Notes: svn path=/head/; revision=189699
-rw-r--r--sys/amd64/amd64/machdep.c8
-rw-r--r--sys/amd64/conf/XENHVM160
-rw-r--r--sys/amd64/include/pcpu.h21
-rw-r--r--sys/amd64/include/xen/hypercall.h415
-rw-r--r--sys/amd64/include/xen/synch_bitops.h129
-rw-r--r--sys/amd64/include/xen/xen-os.h296
-rw-r--r--sys/amd64/include/xen/xenfunc.h83
-rw-r--r--sys/amd64/include/xen/xenpmap.h227
-rw-r--r--sys/amd64/include/xen/xenvar.h122
-rw-r--r--sys/conf/files37
-rw-r--r--sys/conf/options.amd642
-rw-r--r--sys/dev/xen/balloon/balloon.c369
-rw-r--r--sys/dev/xen/blkfront/blkfront.c65
-rw-r--r--sys/dev/xen/console/console.c54
-rw-r--r--sys/dev/xen/console/xencons_ring.c23
-rw-r--r--sys/dev/xen/netfront/netfront.c315
-rw-r--r--sys/dev/xen/xenpci/evtchn.c418
-rw-r--r--sys/dev/xen/xenpci/machine_reboot.c80
-rw-r--r--sys/dev/xen/xenpci/xenpci.c399
-rw-r--r--sys/dev/xen/xenpci/xenpcivar.h44
-rw-r--r--sys/i386/include/xen/xenpmap.h6
-rw-r--r--sys/xen/evtchn/evtchn.c220
-rw-r--r--sys/xen/evtchn/evtchn_dev.c14
-rw-r--r--sys/xen/features.c6
-rw-r--r--sys/xen/features.h20
-rw-r--r--sys/xen/gnttab.c158
-rw-r--r--sys/xen/gnttab.h15
-rw-r--r--sys/xen/hypervisor.h17
-rw-r--r--sys/xen/interface/arch-x86/xen.h5
-rw-r--r--sys/xen/interface/hvm/params.h2
-rw-r--r--sys/xen/reboot.c262
-rw-r--r--sys/xen/xen_intr.h64
-rw-r--r--sys/xen/xenbus/xenbus_probe.c1
-rw-r--r--sys/xen/xenbus/xenbus_xs.c35
34 files changed, 3561 insertions, 531 deletions
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 6ba3820e7fbd..3a2b17026e02 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1494,6 +1494,14 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
if (env != NULL)
strlcpy(kernelname, env, sizeof(kernelname));
+#ifdef XENHVM
+ if (inw(0x10) == 0x49d2) {
+ if (bootverbose)
+ printf("Xen detected: disabling emulated block and network devices\n");
+ outw(0x10, 3);
+ }
+#endif
+
/* Location of kernel stack for locore */
return ((u_int64_t)thread0.td_pcb);
}
diff --git a/sys/amd64/conf/XENHVM b/sys/amd64/conf/XENHVM
new file mode 100644
index 000000000000..ed2f70f96d9c
--- /dev/null
+++ b/sys/amd64/conf/XENHVM
@@ -0,0 +1,160 @@
+#
+# XENHVM -- Xen HVM kernel configuration file for FreeBSD/amd64
+#
+# For more information on this file, please read the config(5) manual page,
+# and/or the handbook section on Kernel Configuration Files:
+#
+# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html
+#
+# The handbook is also available locally in /usr/share/doc/handbook
+# if you've installed the doc distribution, otherwise always see the
+# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the
+# latest information.
+#
+# An exhaustive list of options and more detailed explanations of the
+# device lines is also present in the ../../conf/NOTES and NOTES files.
+# If you are in doubt as to the purpose or necessity of a line, check first
+# in NOTES.
+#
+# $FreeBSD$
+
+cpu HAMMER
+ident GENERIC
+
+# To statically compile in device wiring instead of /boot/device.hints
+#hints "GENERIC.hints" # Default places to look for devices.
+
+# Use the following to compile in values accessible to the kernel
+# through getenv() (or kenv(1) in userland). The format of the file
+# is 'variable=value', see kenv(1)
+#
+# env "GENERIC.env"
+
+makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols
+makeoptions MODULES_OVERRIDE=""
+
+options SCHED_ULE # ULE scheduler
+options PREEMPTION # Enable kernel thread preemption
+options INET # InterNETworking
+options INET6 # IPv6 communications protocols
+options SCTP # Stream Control Transmission Protocol
+options FFS # Berkeley Fast Filesystem
+options SOFTUPDATES # Enable FFS soft updates support
+options UFS_ACL # Support for access control lists
+options UFS_DIRHASH # Improve performance on big directories
+options UFS_GJOURNAL # Enable gjournal-based UFS journaling
+options MD_ROOT # MD is a potential root device
+options NFSCLIENT # Network Filesystem Client
+options NFSSERVER # Network Filesystem Server
+options NFSLOCKD # Network Lock Manager
+options NFS_ROOT # NFS usable as /, requires NFSCLIENT
+options MSDOSFS # MSDOS Filesystem
+options CD9660 # ISO 9660 Filesystem
+options PROCFS # Process filesystem (requires PSEUDOFS)
+options PSEUDOFS # Pseudo-filesystem framework
+options GEOM_PART_GPT # GUID Partition Tables.
+options GEOM_LABEL # Provides labelization
+options COMPAT_43TTY # BSD 4.3 TTY compat (sgtty)
+options COMPAT_IA32 # Compatible with i386 binaries
+options COMPAT_FREEBSD4 # Compatible with FreeBSD4
+options COMPAT_FREEBSD5 # Compatible with FreeBSD5
+options COMPAT_FREEBSD6 # Compatible with FreeBSD6
+options COMPAT_FREEBSD7 # Compatible with FreeBSD7
+options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI
+options KTRACE # ktrace(1) support
+options STACK # stack(9) support
+options SYSVSHM # SYSV-style shared memory
+options SYSVMSG # SYSV-style message queues
+options SYSVSEM # SYSV-style semaphores
+options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions
+options KBD_INSTALL_CDEV # install a CDEV entry in /dev
+options STOP_NMI # Stop CPUS using NMI instead of IPI
+options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4)
+options AUDIT # Security event auditing
+#options KDTRACE_FRAME # Ensure frames are compiled in
+#options KDTRACE_HOOKS # Kernel DTrace hooks
+
+# Debugging for use in -current
+options KDB # Enable kernel debugger support.
+options DDB # Support DDB.
+options GDB # Support remote GDB.
+options INVARIANTS # Enable calls of extra sanity checking
+options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS
+options WITNESS # Enable checks to detect deadlocks and cycles
+options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed
+
+# Make an SMP-capable kernel by default
+options SMP # Symmetric MultiProcessor Kernel
+
+# CPU frequency control
+device cpufreq
+
+# Bus support.
+device acpi
+device pci
+
+# Floppy drives
+device fdc
+
+# Xen HVM support
+options XENHVM
+device xenpci
+
+# ATA and ATAPI devices
+device ata
+device atadisk # ATA disk drives
+device ataraid # ATA RAID drives
+device atapicd # ATAPI CDROM drives
+device atapifd # ATAPI floppy drives
+device atapist # ATAPI tape drives
+options ATA_STATIC_ID # Static device numbering
+
+# SCSI peripherals
+device scbus # SCSI bus (required for SCSI)
+device ch # SCSI media changers
+device da # Direct Access (disks)
+device sa # Sequential Access (tape etc)
+device cd # CD
+device pass # Passthrough device (direct SCSI access)
+device ses # SCSI Environmental Services (and SAF-TE)
+
+
+# atkbdc0 controls both the keyboard and the PS/2 mouse
+device atkbdc # AT keyboard controller
+device atkbd # AT keyboard
+device psm # PS/2 mouse
+
+device kbdmux # keyboard multiplexer
+
+device vga # VGA video card driver
+
+device splash # Splash screen and screen saver support
+
+# syscons is the default console driver, resembling an SCO console
+device sc
+
+device agp # support several AGP chipsets
+
+# Serial (COM) ports
+device uart # Generic UART driver
+
+# PCI Ethernet NICs that use the common MII bus controller code.
+# NOTE: Be sure to keep the 'device miibus' line in order to use these NICs!
+device miibus # MII bus support
+device re # RealTek 8139C+/8169/8169S/8110S
+
+# Pseudo devices.
+device loop # Network loopback
+device random # Entropy device
+device ether # Ethernet support
+device tun # Packet tunnel.
+device pty # BSD-style compatibility pseudo ttys
+device md # Memory "disks"
+device gif # IPv6 and IPv4 tunneling
+device faith # IPv6-to-IPv4 relaying (translation)
+device firmware # firmware assist module
+
+# The `bpf' device enables the Berkeley Packet Filter.
+# Be aware of the administrative consequences of enabling this!
+# Note that 'bpf' is required for DHCP.
+device bpf # Berkeley packet filter
diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h
index e9faf282c77a..23818ca81328 100644
--- a/sys/amd64/include/pcpu.h
+++ b/sys/amd64/include/pcpu.h
@@ -33,6 +33,24 @@
#error "sys/cdefs.h is a prerequisite for this file"
#endif
+#if defined(XEN) || defined(XENHVM)
+#ifndef NR_VIRQS
+#define NR_VIRQS 24
+#endif
+#ifndef NR_IPIS
+#define NR_IPIS 2
+#endif
+#endif
+
+#ifdef XENHVM
+#define PCPU_XEN_FIELDS \
+ ; \
+ unsigned int pc_last_processed_l1i; \
+ unsigned int pc_last_processed_l2i
+#else
+#define PCPU_XEN_FIELDS
+#endif
+
/*
* The SMP parts are setup in pmap.c and locore.s for the BSP, and
* mp_machdep.c sets up the data for the AP's to "see" when they awake.
@@ -49,7 +67,8 @@
register_t pc_scratch_rsp; /* User %rsp in syscall */ \
u_int pc_apic_id; \
u_int pc_acpi_id; /* ACPI CPU id */ \
- struct user_segment_descriptor *pc_gs32p
+ struct user_segment_descriptor *pc_gs32p \
+ PCPU_XEN_FIELDS
#ifdef _KERNEL
diff --git a/sys/amd64/include/xen/hypercall.h b/sys/amd64/include/xen/hypercall.h
new file mode 100644
index 000000000000..50fa376ff90b
--- /dev/null
+++ b/sys/amd64/include/xen/hypercall.h
@@ -0,0 +1,415 @@
+/******************************************************************************
+ * hypercall.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * 64-bit updates:
+ * Benjamin Liu <benjamin.liu@intel.com>
+ * Jun Nakajima <jun.nakajima@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __MACHINE_XEN_HYPERCALL_H__
+#define __MACHINE_XEN_HYPERCALL_H__
+
+#include <sys/systm.h>
+
+#ifndef __XEN_HYPERVISOR_H__
+# error "please don't include this file directly"
+#endif
+
+#define __STR(x) #x
+#define STR(x) __STR(x)
+#define ENOXENSYS 38
+#define CONFIG_XEN_COMPAT 0x030002
+#define __must_check
+
+#ifdef XEN
+#define HYPERCALL_STR(name) \
+ "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"
+#else
+#define HYPERCALL_STR(name) \
+ "mov $("STR(__HYPERVISOR_##name)" * 32),%%eax; "\
+ "add hypercall_stubs(%%rip),%%rax; " \
+ "call *%%rax"
+#endif
+
+#define _hypercall0(type, name) \
+({ \
+ type __res; \
+ __asm__ volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res) \
+ : \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall1(type, name, a1) \
+({ \
+ type __res; \
+ long __ign1; \
+ __asm__ volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1) \
+ : "1" ((long)(a1)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall2(type, name, a1, a2) \
+({ \
+ type __res; \
+ long __ign1, __ign2; \
+ __asm__ volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \
+ : "1" ((long)(a1)), "2" ((long)(a2)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall3(type, name, a1, a2, a3) \
+({ \
+ type __res; \
+ long __ign1, __ign2, __ign3; \
+ __asm__ volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
+ "=d" (__ign3) \
+ : "1" ((long)(a1)), "2" ((long)(a2)), \
+ "3" ((long)(a3)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall4(type, name, a1, a2, a3, a4) \
+({ \
+ type __res; \
+ long __ign1, __ign2, __ign3; \
+ register long __arg4 __asm__("r10") = (long)(a4); \
+ __asm__ volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
+ "=d" (__ign3), "+r" (__arg4) \
+ : "1" ((long)(a1)), "2" ((long)(a2)), \
+ "3" ((long)(a3)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
+({ \
+ type __res; \
+ long __ign1, __ign2, __ign3; \
+ register long __arg4 __asm__("r10") = (long)(a4); \
+ register long __arg5 __asm__("r8") = (long)(a5); \
+ __asm__ volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
+ "=d" (__ign3), "+r" (__arg4), "+r" (__arg5) \
+ : "1" ((long)(a1)), "2" ((long)(a2)), \
+ "3" ((long)(a3)) \
+ : "memory" ); \
+ __res; \
+})
+
+static inline int __must_check
+HYPERVISOR_set_trap_table(
+ const trap_info_t *table)
+{
+ return _hypercall1(int, set_trap_table, table);
+}
+
+static inline int __must_check
+HYPERVISOR_mmu_update(
+ mmu_update_t *req, unsigned int count, unsigned int *success_count,
+ domid_t domid)
+{
+ return _hypercall4(int, mmu_update, req, count, success_count, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_mmuext_op(
+ struct mmuext_op *op, unsigned int count, unsigned int *success_count,
+ domid_t domid)
+{
+ return _hypercall4(int, mmuext_op, op, count, success_count, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_set_gdt(
+ unsigned long *frame_list, unsigned int entries)
+{
+ return _hypercall2(int, set_gdt, frame_list, entries);
+}
+
+static inline int __must_check
+HYPERVISOR_stack_switch(
+ unsigned long ss, unsigned long esp)
+{
+ return _hypercall2(int, stack_switch, ss, esp);
+}
+
+static inline int __must_check
+HYPERVISOR_set_callbacks(
+ unsigned long event_address, unsigned long failsafe_address,
+ unsigned long syscall_address)
+{
+ return _hypercall3(int, set_callbacks,
+ event_address, failsafe_address, syscall_address);
+}
+
+static inline int
+HYPERVISOR_fpu_taskswitch(
+ int set)
+{
+ return _hypercall1(int, fpu_taskswitch, set);
+}
+
+static inline int __must_check
+HYPERVISOR_sched_op_compat(
+ int cmd, unsigned long arg)
+{
+ return _hypercall2(int, sched_op_compat, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_sched_op(
+ int cmd, void *arg)
+{
+ return _hypercall2(int, sched_op, cmd, arg);
+}
+
+static inline long __must_check
+HYPERVISOR_set_timer_op(
+ uint64_t timeout)
+{
+ return _hypercall1(long, set_timer_op, timeout);
+}
+
+static inline int __must_check
+HYPERVISOR_platform_op(
+ struct xen_platform_op *platform_op)
+{
+ platform_op->interface_version = XENPF_INTERFACE_VERSION;
+ return _hypercall1(int, platform_op, platform_op);
+}
+
+static inline int __must_check
+HYPERVISOR_set_debugreg(
+ unsigned int reg, unsigned long value)
+{
+ return _hypercall2(int, set_debugreg, reg, value);
+}
+
+static inline unsigned long __must_check
+HYPERVISOR_get_debugreg(
+ unsigned int reg)
+{
+ return _hypercall1(unsigned long, get_debugreg, reg);
+}
+
+static inline int __must_check
+HYPERVISOR_update_descriptor(
+ unsigned long ma, unsigned long word)
+{
+ return _hypercall2(int, update_descriptor, ma, word);
+}
+
+static inline int __must_check
+HYPERVISOR_memory_op(
+ unsigned int cmd, void *arg)
+{
+ return _hypercall2(int, memory_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_multicall(
+ multicall_entry_t *call_list, unsigned int nr_calls)
+{
+ return _hypercall2(int, multicall, call_list, nr_calls);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping(
+ unsigned long va, uint64_t new_val, unsigned long flags)
+{
+ return _hypercall3(int, update_va_mapping, va, new_val, flags);
+}
+
+static inline int __must_check
+HYPERVISOR_event_channel_op(
+ int cmd, void *arg)
+{
+ int rc = _hypercall2(int, event_channel_op, cmd, arg);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOXENSYS)) {
+ struct evtchn_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, event_channel_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+#endif
+
+ return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_xen_version(
+ int cmd, void *arg)
+{
+ return _hypercall2(int, xen_version, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_console_io(
+ int cmd, unsigned int count, char *str)
+{
+ return _hypercall3(int, console_io, cmd, count, str);
+}
+
+static inline int __must_check
+HYPERVISOR_physdev_op(
+ int cmd, void *arg)
+{
+ int rc = _hypercall2(int, physdev_op, cmd, arg);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOXENSYS)) {
+ struct physdev_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, physdev_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+#endif
+
+ return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_grant_table_op(
+ unsigned int cmd, void *uop, unsigned int count)
+{
+ return _hypercall3(int, grant_table_op, cmd, uop, count);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping_otherdomain(
+ unsigned long va, uint64_t new_val, unsigned long flags, domid_t domid)
+{
+ return _hypercall4(int, update_va_mapping_otherdomain, va,
+ new_val, flags, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_vm_assist(
+ unsigned int cmd, unsigned int type)
+{
+ return _hypercall2(int, vm_assist, cmd, type);
+}
+
+static inline int __must_check
+HYPERVISOR_vcpu_op(
+ int cmd, unsigned int vcpuid, void *extra_args)
+{
+ return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
+}
+
+static inline int __must_check
+HYPERVISOR_set_segment_base(
+ int reg, unsigned long value)
+{
+ return _hypercall2(int, set_segment_base, reg, value);
+}
+
+static inline int __must_check
+HYPERVISOR_suspend(
+ unsigned long srec)
+{
+ struct sched_shutdown sched_shutdown = {
+ .reason = SHUTDOWN_suspend
+ };
+
+ int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
+ &sched_shutdown, srec);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (rc == -ENOXENSYS)
+ rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
+ SHUTDOWN_suspend, srec);
+#endif
+
+ return rc;
+}
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+static inline int
+HYPERVISOR_nmi_op(
+ unsigned long op, void *arg)
+{
+ return _hypercall2(int, nmi_op, op, arg);
+}
+#endif
+
+#ifndef CONFIG_XEN
+static inline unsigned long __must_check
+HYPERVISOR_hvm_op(
+ int op, void *arg)
+{
+ return _hypercall2(unsigned long, hvm_op, op, arg);
+}
+#endif
+
+static inline int __must_check
+HYPERVISOR_callback_op(
+ int cmd, const void *arg)
+{
+ return _hypercall2(int, callback_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_xenoprof_op(
+ int op, void *arg)
+{
+ return _hypercall2(int, xenoprof_op, op, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_kexec_op(
+ unsigned long op, void *args)
+{
+ return _hypercall2(int, kexec_op, op, args);
+}
+
+#undef __must_check
+
+#endif /* __MACHINE_XEN_HYPERCALL_H__ */
diff --git a/sys/amd64/include/xen/synch_bitops.h b/sys/amd64/include/xen/synch_bitops.h
new file mode 100644
index 000000000000..746687aa91bd
--- /dev/null
+++ b/sys/amd64/include/xen/synch_bitops.h
@@ -0,0 +1,129 @@
+#ifndef __XEN_SYNCH_BITOPS_H__
+#define __XEN_SYNCH_BITOPS_H__
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ * Heavily modified to provide guaranteed strong synchronisation
+ * when communicating with Xen or other guest OSes running on other CPUs.
+ */
+
+
+#define ADDR (*(volatile long *) addr)
+
+static __inline__ void synch_set_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__ (
+ "lock btsl %1,%0"
+ : "=m" (ADDR) : "Ir" (nr) : "memory" );
+}
+
+static __inline__ void synch_clear_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__ (
+ "lock btrl %1,%0"
+ : "=m" (ADDR) : "Ir" (nr) : "memory" );
+}
+
+static __inline__ void synch_change_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__ (
+ "lock btcl %1,%0"
+ : "=m" (ADDR) : "Ir" (nr) : "memory" );
+}
+
+static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+ __asm__ __volatile__ (
+ "lock btsl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+ __asm__ __volatile__ (
+ "lock btrl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+
+ __asm__ __volatile__ (
+ "lock btcl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+struct __synch_xchg_dummy { unsigned long a[100]; };
+#define __synch_xg(x) ((volatile struct __synch_xchg_dummy *)(x))
+
+#define synch_cmpxchg(ptr, old, new) \
+((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
+ (unsigned long)(old), \
+ (unsigned long)(new), \
+ sizeof(*(ptr))))
+
+static inline unsigned long __synch_cmpxchg(volatile void *ptr,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ __asm__ __volatile__("lock; cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ __asm__ __volatile__("lock; cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ __asm__ __volatile__("lock; cmpxchgl %k1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+ case 8:
+ __asm__ __volatile__("lock; cmpxchgq %1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+static __inline__ int synch_const_test_bit(int nr, const volatile void * addr)
+{
+ return ((1UL << (nr & 31)) &
+ (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
+}
+
+static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+ __asm__ __volatile__ (
+ "btl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) );
+ return oldbit;
+}
+
+#define synch_test_bit(nr,addr) \
+(__builtin_constant_p(nr) ? \
+ synch_const_test_bit((nr),(addr)) : \
+ synch_var_test_bit((nr),(addr)))
+
+#endif /* __XEN_SYNCH_BITOPS_H__ */
diff --git a/sys/amd64/include/xen/xen-os.h b/sys/amd64/include/xen/xen-os.h
new file mode 100644
index 000000000000..163e7f2e0574
--- /dev/null
+++ b/sys/amd64/include/xen/xen-os.h
@@ -0,0 +1,296 @@
+/******************************************************************************
+ * os.h
+ *
+ * random collection of macros and definitions
+ */
+
+#ifndef _XEN_OS_H_
+#define _XEN_OS_H_
+
+#ifdef PAE
+#define CONFIG_X86_PAE
+#endif
+
+#if !defined(__XEN_INTERFACE_VERSION__)
+/*
+ * Can update to a more recent version when we implement
+ * the hypercall page
+ */
+#define __XEN_INTERFACE_VERSION__ 0x00030204
+#endif
+
+#include <xen/interface/xen.h>
+
+/* Force a proper event-channel callback from Xen. */
+void force_evtchn_callback(void);
+
+extern int gdtset;
+
+extern shared_info_t *HYPERVISOR_shared_info;
+
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static inline void rep_nop(void)
+{
+ __asm__ __volatile__ ( "rep;nop" : : : "memory" );
+}
+#define cpu_relax() rep_nop()
+
+/* crude memory allocator for memory allocation early in
+ * boot
+ */
+void *bootmem_alloc(unsigned int size);
+void bootmem_free(void *ptr, unsigned int size);
+
+
+/* Everything below this point is not included by assembler (.S) files. */
+#ifndef __ASSEMBLY__
+
+void printk(const char *fmt, ...);
+
+/* some function prototypes */
+void trap_init(void);
+
+#define likely(x) __builtin_expect((x),1)
+#define unlikely(x) __builtin_expect((x),0)
+
+#ifndef XENHVM
+
+/*
+ * STI/CLI equivalents. These basically set and clear the virtual
+ * event_enable flag in the shared_info structure. Note that when
+ * the enable bit is set, there may be pending events to be handled.
+ * We may therefore call into do_hypervisor_callback() directly.
+ */
+
+#define __cli() \
+do { \
+ vcpu_info_t *_vcpu; \
+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \
+ _vcpu->evtchn_upcall_mask = 1; \
+ barrier(); \
+} while (0)
+
+#define __sti() \
+do { \
+ vcpu_info_t *_vcpu; \
+ barrier(); \
+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \
+ _vcpu->evtchn_upcall_mask = 0; \
+ barrier(); /* unmask then check (avoid races) */ \
+ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
+ force_evtchn_callback(); \
+} while (0)
+
+#define __restore_flags(x) \
+do { \
+ vcpu_info_t *_vcpu; \
+ barrier(); \
+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \
+ if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
+ barrier(); /* unmask then check (avoid races) */ \
+ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
+ force_evtchn_callback(); \
+ } \
+} while (0)
+
+/*
+ * Add critical_{enter, exit}?
+ *
+ */
+#define __save_and_cli(x) \
+do { \
+ vcpu_info_t *_vcpu; \
+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \
+ (x) = _vcpu->evtchn_upcall_mask; \
+ _vcpu->evtchn_upcall_mask = 1; \
+ barrier(); \
+} while (0)
+
+
+#define cli() __cli()
+#define sti() __sti()
+#define save_flags(x) __save_flags(x)
+#define restore_flags(x) __restore_flags(x)
+#define save_and_cli(x) __save_and_cli(x)
+
+#define local_irq_save(x) __save_and_cli(x)
+#define local_irq_restore(x) __restore_flags(x)
+#define local_irq_disable() __cli()
+#define local_irq_enable() __sti()
+
+#define mtx_lock_irqsave(lock, x) {local_irq_save((x)); mtx_lock_spin((lock));}
+#define mtx_unlock_irqrestore(lock, x) {mtx_unlock_spin((lock)); local_irq_restore((x)); }
+#define spin_lock_irqsave mtx_lock_irqsave
+#define spin_unlock_irqrestore mtx_unlock_irqrestore
+
+#else
+#endif
+
+#ifndef mb
+#define mb() __asm__ __volatile__("mfence":::"memory")
+#endif
+#ifndef rmb
+#define rmb() __asm__ __volatile__("lfence":::"memory");
+#endif
+#ifndef wmb
+#define wmb() barrier()
+#endif
+#ifdef SMP
+#define smp_mb() mb()
+#define smp_rmb() rmb()
+#define smp_wmb() wmb()
+#define smp_read_barrier_depends() read_barrier_depends()
+#define set_mb(var, value) do { xchg(&var, value); } while (0)
+#else
+#define smp_mb() barrier()
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
+#define smp_read_barrier_depends() do { } while(0)
+#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#endif
+
+
+/* This is a barrier for the compiler only, NOT the processor! */
+#define barrier() __asm__ __volatile__("": : :"memory")
+
+#define LOCK_PREFIX ""
+#define LOCK ""
+#define ADDR (*(volatile long *) addr)
+/*
+ * Make sure gcc doesn't try to be clever and move things around
+ * on us. We need to use _exactly_ the address the user gave us,
+ * not some alias that contains the same information.
+ */
+typedef struct { volatile int counter; } atomic_t;
+
+
+
+#define xen_xchg(ptr,v) \
+ ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
+struct __xchg_dummy { unsigned long a[100]; };
+#define __xg(x) ((volatile struct __xchg_dummy *)(x))
+static __inline unsigned long __xchg(unsigned long x, volatile void * ptr,
+ int size)
+{
+ switch (size) {
+ case 1:
+ __asm__ __volatile__("xchgb %b0,%1"
+ :"=q" (x)
+ :"m" (*__xg(ptr)), "0" (x)
+ :"memory");
+ break;
+ case 2:
+ __asm__ __volatile__("xchgw %w0,%1"
+ :"=r" (x)
+ :"m" (*__xg(ptr)), "0" (x)
+ :"memory");
+ break;
+ case 4:
+ __asm__ __volatile__("xchgl %0,%1"
+ :"=r" (x)
+ :"m" (*__xg(ptr)), "0" (x)
+ :"memory");
+ break;
+ }
+ return x;
+}
+
+/**
+ * test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static __inline int test_and_clear_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+
+ __asm__ __volatile__( LOCK_PREFIX
+ "btrl %2,%1\n\tsbbl %0,%0"
+ :"=r" (oldbit),"=m" (ADDR)
+ :"Ir" (nr) : "memory");
+ return oldbit;
+}
+
+static __inline int constant_test_bit(int nr, const volatile void * addr)
+{
+ return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
+}
+
+static __inline int variable_test_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+
+ __asm__ __volatile__(
+ "btl %2,%1\n\tsbbl %0,%0"
+ :"=r" (oldbit)
+ :"m" (ADDR),"Ir" (nr));
+ return oldbit;
+}
+
+#define test_bit(nr,addr) \
+(__builtin_constant_p(nr) ? \
+ constant_test_bit((nr),(addr)) : \
+ variable_test_bit((nr),(addr)))
+
+
+/**
+ * set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered. See __set_bit()
+ * if you do not require the atomic guarantees.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static __inline__ void set_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__( LOCK_PREFIX
+ "btsl %1,%0"
+ :"=m" (ADDR)
+ :"Ir" (nr));
+}
+
+/**
+ * clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and may not be reordered. However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * in order to ensure changes are visible on other processors.
+ */
+static __inline__ void clear_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__( LOCK_PREFIX
+ "btrl %1,%0"
+ :"=m" (ADDR)
+ :"Ir" (nr));
+}
+
+/**
+ * atomic_inc - increment atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1. Note that the guaranteed
+ * useful range of an atomic_t is only 24 bits.
+ */
+static __inline__ void atomic_inc(atomic_t *v)
+{
+ __asm__ __volatile__(
+ LOCK "incl %0"
+ :"=m" (v->counter)
+ :"m" (v->counter));
+}
+
+
+#define rdtscll(val) \
+ __asm__ __volatile__("rdtsc" : "=A" (val))
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _XEN_OS_H_ */
diff --git a/sys/amd64/include/xen/xenfunc.h b/sys/amd64/include/xen/xenfunc.h
new file mode 100644
index 000000000000..b3a6672576cb
--- /dev/null
+++ b/sys/amd64/include/xen/xenfunc.h
@@ -0,0 +1,83 @@
+/*
+ *
+ * Copyright (c) 2004,2005 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _XEN_XENFUNC_H_
+#define _XEN_XENFUNC_H_
+
+#ifdef XENHVM
+#include <machine/xen/xenvar.h>
+#else
+#include <machine/xen/xenpmap.h>
+#include <machine/segments.h>
+#endif
+
+#define BKPT __asm__("int3");
+#define XPQ_CALL_DEPTH 5
+#define XPQ_CALL_COUNT 2
+#define PG_PRIV PG_AVAIL3
+typedef struct {
+ unsigned long pt_ref;
+ unsigned long pt_eip[XPQ_CALL_COUNT][XPQ_CALL_DEPTH];
+} pteinfo_t;
+
+extern pteinfo_t *pteinfo_list;
+#ifdef XENDEBUG_LOW
+#define __PRINTK(x) printk x
+#else
+#define __PRINTK(x)
+#endif
+
+char *xen_setbootenv(char *cmd_line);
+
+int xen_boothowto(char *envp);
+
+void _xen_machphys_update(vm_paddr_t, vm_paddr_t, char *file, int line);
+
+#ifdef INVARIANTS
+#define xen_machphys_update(a, b) _xen_machphys_update((a), (b), __FILE__, __LINE__)
+#else
+#define xen_machphys_update(a, b) _xen_machphys_update((a), (b), NULL, 0)
+#endif
+
+#ifndef XENHVM
+void xen_update_descriptor(union descriptor *, union descriptor *);
+#endif
+
+extern struct mtx balloon_lock;
+#if 0
+#define balloon_lock(__flags) mtx_lock_irqsave(&balloon_lock, __flags)
+#define balloon_unlock(__flags) mtx_unlock_irqrestore(&balloon_lock, __flags)
+#else
+#define balloon_lock(__flags) __flags = 1
+#define balloon_unlock(__flags) __flags = 0
+#endif
+
+
+
+#endif /* _XEN_XENFUNC_H_ */
diff --git a/sys/amd64/include/xen/xenpmap.h b/sys/amd64/include/xen/xenpmap.h
new file mode 100644
index 000000000000..d768dad5f311
--- /dev/null
+++ b/sys/amd64/include/xen/xenpmap.h
@@ -0,0 +1,227 @@
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * Copyright (c) 2004,2005 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _XEN_XENPMAP_H_
+#define _XEN_XENPMAP_H_
+
+#include <machine/xen/features.h>
+
+void _xen_queue_pt_update(vm_paddr_t, vm_paddr_t, char *, int);
+void xen_pt_switch(vm_paddr_t);
+void xen_set_ldt(vm_paddr_t, unsigned long);
+void xen_pgdpt_pin(vm_paddr_t);
+void xen_pgd_pin(vm_paddr_t);
+void xen_pgd_unpin(vm_paddr_t);
+void xen_pt_pin(vm_paddr_t);
+void xen_pt_unpin(vm_paddr_t);
+void xen_flush_queue(void);
+void xen_check_queue(void);
+#if 0
+void pmap_ref(pt_entry_t *pte, vm_paddr_t ma);
+#endif
+
+#ifdef INVARIANTS
+#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), __FILE__, __LINE__)
+#else
+#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), NULL, 0)
+#endif
+
+#ifdef PMAP_DEBUG
+#define PMAP_REF pmap_ref
+#define PMAP_DEC_REF_PAGE pmap_dec_ref_page
+#define PMAP_MARK_PRIV pmap_mark_privileged
+#define PMAP_MARK_UNPRIV pmap_mark_unprivileged
+#else
+#define PMAP_MARK_PRIV(a)
+#define PMAP_MARK_UNPRIV(a)
+#define PMAP_REF(a, b)
+#define PMAP_DEC_REF_PAGE(a)
+#endif
+
+#define ALWAYS_SYNC 0
+
+#ifdef PT_DEBUG
+#define PT_LOG() printk("WP PT_SET %s:%d\n", __FILE__, __LINE__)
+#else
+#define PT_LOG()
+#endif
+
+#define INVALID_P2M_ENTRY (~0UL)
+
+#define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? */
+
+#define SH_PD_SET_VA 1
+#define SH_PD_SET_VA_MA 2
+#define SH_PD_SET_VA_CLEAR 3
+
+struct pmap;
+void pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type);
+#ifdef notyet
+static vm_paddr_t
+vptetomachpte(vm_paddr_t *pte)
+{
+ vm_offset_t offset, ppte;
+ vm_paddr_t pgoffset, retval, *pdir_shadow_ptr;
+ int pgindex;
+
+ ppte = (vm_offset_t)pte;
+ pgoffset = (ppte & PAGE_MASK);
+ offset = ppte - (vm_offset_t)PTmap;
+ pgindex = ppte >> PDRSHIFT;
+
+ pdir_shadow_ptr = (vm_paddr_t *)PCPU_GET(pdir_shadow);
+ retval = (pdir_shadow_ptr[pgindex] & ~PAGE_MASK) + pgoffset;
+ return (retval);
+}
+#endif
+#define PT_GET(_ptp) \
+ (pmap_valid_entry(*(_ptp)) ? xpmap_mtop(*(_ptp)) : (0))
+
+#ifdef WRITABLE_PAGETABLES
+
+#define PT_SET_VA(_ptp,_npte,sync) do { \
+ PMAP_REF((_ptp), xpmap_ptom(_npte)); \
+ PT_LOG(); \
+ *(_ptp) = xpmap_ptom((_npte)); \
+} while (/*CONSTCOND*/0)
+#define PT_SET_VA_MA(_ptp,_npte,sync) do { \
+ PMAP_REF((_ptp), (_npte)); \
+ PT_LOG(); \
+ *(_ptp) = (_npte); \
+} while (/*CONSTCOND*/0)
+#define PT_CLEAR_VA(_ptp, sync) do { \
+ PMAP_REF((pt_entry_t *)(_ptp), 0); \
+ PT_LOG(); \
+ *(_ptp) = 0; \
+} while (/*CONSTCOND*/0)
+
+#define PD_SET_VA(_pmap, _ptp, _npte, sync) do { \
+ PMAP_REF((_ptp), xpmap_ptom(_npte)); \
+ pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA); \
+ if (sync || ALWAYS_SYNC) xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PD_SET_VA_MA(_pmap, _ptp, _npte, sync) do { \
+ PMAP_REF((_ptp), (_npte)); \
+ pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA_MA); \
+ if (sync || ALWAYS_SYNC) xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PD_CLEAR_VA(_pmap, _ptp, sync) do { \
+ PMAP_REF((pt_entry_t *)(_ptp), 0); \
+ pd_set((_pmap),(_ptp), 0, SH_PD_SET_VA_CLEAR); \
+ if (sync || ALWAYS_SYNC) xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+
+#else /* !WRITABLE_PAGETABLES */
+
+#define PT_SET_VA(_ptp,_npte,sync) do { \
+ PMAP_REF((_ptp), xpmap_ptom(_npte)); \
+ xen_queue_pt_update(vtomach(_ptp), \
+ xpmap_ptom(_npte)); \
+ if (sync || ALWAYS_SYNC) xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PT_SET_VA_MA(_ptp,_npte,sync) do { \
+ PMAP_REF((_ptp), (_npte)); \
+ xen_queue_pt_update(vtomach(_ptp), _npte); \
+ if (sync || ALWAYS_SYNC) xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PT_CLEAR_VA(_ptp, sync) do { \
+ PMAP_REF((pt_entry_t *)(_ptp), 0); \
+ xen_queue_pt_update(vtomach(_ptp), 0); \
+ if (sync || ALWAYS_SYNC) \
+ xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+
+#define PD_SET_VA(_pmap, _ptepindex,_npte,sync) do { \
+ PMAP_REF((_ptp), xpmap_ptom(_npte)); \
+ pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA); \
+ if (sync || ALWAYS_SYNC) xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PD_SET_VA_MA(_pmap, _ptepindex,_npte,sync) do { \
+ PMAP_REF((_ptp), (_npte)); \
+ pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA_MA); \
+ if (sync || ALWAYS_SYNC) xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PD_CLEAR_VA(_pmap, _ptepindex, sync) do { \
+ PMAP_REF((pt_entry_t *)(_ptp), 0); \
+ pd_set((_pmap),(_ptepindex), 0, SH_PD_SET_VA_CLEAR); \
+ if (sync || ALWAYS_SYNC) xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+
+#endif
+
+#define PT_SET_MA(_va, _ma) \
+do { \
+ PANIC_IF(HYPERVISOR_update_va_mapping(((unsigned long)(_va)),\
+ (_ma), \
+ UVMF_INVLPG| UVMF_ALL) < 0); \
+} while (/*CONSTCOND*/0)
+
+#define PT_UPDATES_FLUSH() do { \
+ xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+
+static __inline vm_paddr_t
+xpmap_mtop(vm_paddr_t mpa)
+{
+ vm_paddr_t tmp = (mpa & PG_FRAME);
+
+ return machtophys(tmp) | (mpa & ~PG_FRAME);
+}
+
+static __inline vm_paddr_t
+xpmap_ptom(vm_paddr_t ppa)
+{
+ vm_paddr_t tmp = (ppa & PG_FRAME);
+
+ return phystomach(tmp) | (ppa & ~PG_FRAME);
+}
+
+static __inline void
+set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+#ifdef notyet
+ PANIC_IF(max_mapnr && pfn >= max_mapnr);
+#endif
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+#ifdef notyet
+ PANIC_IF((pfn != mfn && mfn != INVALID_P2M_ENTRY));
+#endif
+ return;
+ }
+ xen_phys_machine[pfn] = mfn;
+}
+
+
+
+
+#endif /* _XEN_XENPMAP_H_ */
diff --git a/sys/amd64/include/xen/xenvar.h b/sys/amd64/include/xen/xenvar.h
new file mode 100644
index 000000000000..1433b76871ec
--- /dev/null
+++ b/sys/amd64/include/xen/xenvar.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2008 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * $FreeBSD$
+ */
+#ifndef XENVAR_H_
+#define XENVAR_H_
+#define XBOOTUP 0x1
+#define XPMAP 0x2
+extern int xendebug_flags;
+#ifndef NOXENDEBUG
+#define XENPRINTF printk
+#else
+#define XENPRINTF printf
+#endif
+#include <xen/features.h>
+
+#if 0
+#define TRACE_ENTER XENPRINTF("(file=%s, line=%d) entered %s\n", __FILE__, __LINE__, __FUNCTION__)
+#define TRACE_EXIT XENPRINTF("(file=%s, line=%d) exiting %s\n", __FILE__, __LINE__, __FUNCTION__)
+#define TRACE_DEBUG(argflags, _f, _a...) \
+if (xendebug_flags & argflags) XENPRINTF("(file=%s, line=%d) " _f "\n", __FILE__, __LINE__, ## _a);
+#else
+#define TRACE_ENTER
+#define TRACE_EXIT
+#define TRACE_DEBUG(argflags, _f, _a...)
+#endif
+
+#ifdef XENHVM
+
+static inline vm_paddr_t
+phystomach(vm_paddr_t pa)
+{
+
+ return (pa);
+}
+
+static inline vm_paddr_t
+machtophys(vm_paddr_t ma)
+{
+
+ return (ma);
+}
+
+#define vtomach(va) pmap_kextract((vm_offset_t) (va))
+#define PFNTOMFN(pa) (pa)
+#define MFNTOPFN(ma) (ma)
+
+#define set_phys_to_machine(pfn, mfn) ((void)0)
+#define phys_to_machine_mapping_valid(pfn) (TRUE)
+#define PT_UPDATES_FLUSH() ((void)0)
+
+#else
+
+extern xen_pfn_t *xen_phys_machine;
+
+
+extern xen_pfn_t *xen_machine_phys;
+/* Xen starts physical pages after the 4MB ISA hole -
+ * FreeBSD doesn't
+ */
+
+
+#undef ADD_ISA_HOLE /* XXX */
+
+#ifdef ADD_ISA_HOLE
+#define ISA_INDEX_OFFSET 1024
+#define ISA_PDR_OFFSET 1
+#else
+#define ISA_INDEX_OFFSET 0
+#define ISA_PDR_OFFSET 0
+#endif
+
+
+#define PFNTOMFN(i) (xen_phys_machine[(i)])
+#define MFNTOPFN(i) ((vm_paddr_t)xen_machine_phys[(i)])
+
+#define VTOP(x) ((((uintptr_t)(x))) - KERNBASE)
+#define PTOV(x) (((uintptr_t)(x)) + KERNBASE)
+
+#define VTOPFN(x) (VTOP(x) >> PAGE_SHIFT)
+#define PFNTOV(x) PTOV((vm_paddr_t)(x) << PAGE_SHIFT)
+
+#define VTOMFN(va) (vtomach(va) >> PAGE_SHIFT)
+#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
+
+#define phystomach(pa) (((vm_paddr_t)(PFNTOMFN((pa) >> PAGE_SHIFT))) << PAGE_SHIFT)
+#define machtophys(ma) (((vm_paddr_t)(MFNTOPFN((ma) >> PAGE_SHIFT))) << PAGE_SHIFT)
+
+#endif
+
+void xpq_init(void);
+
+int xen_create_contiguous_region(vm_page_t pages, int npages);
+
+void xen_destroy_contiguous_region(void * addr, int npages);
+
+#endif
diff --git a/sys/conf/files b/sys/conf/files
index f3e90aa77729..6a14ef9fbbf1 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2758,21 +2758,24 @@ gnu/fs/xfs/xfs_iomap.c optional xfs \
gnu/fs/xfs/xfs_behavior.c optional xfs \
compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
-xen/gnttab.c optional xen
-xen/features.c optional xen
-xen/evtchn/evtchn.c optional xen
-xen/evtchn/evtchn_dev.c optional xen
-xen/xenbus/xenbus_client.c optional xen
-xen/xenbus/xenbus_comms.c optional xen
-xen/xenbus/xenbus_dev.c optional xen
-xen/xenbus/xenbus_if.m optional xen
-xen/xenbus/xenbus_probe.c optional xen
-#xen/xenbus/xenbus_probe_backend.c optional xen
-xen/xenbus/xenbus_xs.c optional xen
-dev/xen/console/console.c optional xen
-dev/xen/console/xencons_ring.c optional xen
-dev/xen/blkfront/blkfront.c optional xen
-dev/xen/netfront/netfront.c optional xen
-#dev/xen/xenpci/xenpci.c optional xen
-#xen/xenbus/xenbus_newbus.c optional xenhvm
+xen/gnttab.c optional xen | xenhvm
+xen/features.c optional xen | xenhvm
+xen/evtchn/evtchn.c optional xen
+xen/evtchn/evtchn_dev.c optional xen | xenhvm
+xen/reboot.c optional xen
+xen/xenbus/xenbus_client.c optional xen | xenhvm
+xen/xenbus/xenbus_comms.c optional xen | xenhvm
+xen/xenbus/xenbus_dev.c optional xen | xenhvm
+xen/xenbus/xenbus_if.m optional xen | xenhvm
+xen/xenbus/xenbus_probe.c optional xen | xenhvm
+#xen/xenbus/xenbus_probe_backend.c optional xen
+xen/xenbus/xenbus_xs.c optional xen | xenhvm
+dev/xen/balloon/balloon.c optional xen | xenhvm
+dev/xen/console/console.c optional xen
+dev/xen/console/xencons_ring.c optional xen
+dev/xen/blkfront/blkfront.c optional xen | xenhvm
+dev/xen/netfront/netfront.c optional xen | xenhvm
+dev/xen/xenpci/xenpci.c optional xenpci
+dev/xen/xenpci/evtchn.c optional xenpci
+dev/xen/xenpci/machine_reboot.c optional xenpci
diff --git a/sys/conf/options.amd64 b/sys/conf/options.amd64
index 1e693632ef43..5247921eb8ac 100644
--- a/sys/conf/options.amd64
+++ b/sys/conf/options.amd64
@@ -57,3 +57,5 @@ KDTRACE_FRAME opt_kdtrace.h
# BPF just-in-time compiler
BPF_JITTER opt_bpf.h
+
+XENHVM opt_global.h
diff --git a/sys/dev/xen/balloon/balloon.c b/sys/dev/xen/balloon/balloon.c
index fa49196bc02e..c23433cbebcd 100644
--- a/sys/dev/xen/balloon/balloon.c
+++ b/sys/dev/xen/balloon/balloon.c
@@ -34,11 +34,24 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/sysctl.h>
-#include <machine/hypervisor-ifs.h>
-#include <machine/xen-os.h>
-#include <machine/xenbus.h>
+#include <machine/xen/xen-os.h>
+#include <machine/xen/xenfunc.h>
+#include <machine/xen/xenvar.h>
+#include <xen/hypervisor.h>
+#include <xen/xenbus/xenbusvar.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+MALLOC_DEFINE(M_BALLOON, "Balloon", "Xen Balloon Driver");
+
+struct mtx balloon_mutex;
/*
* Protects atomic reservation decrease/increase against concurrent increases.
@@ -46,23 +59,44 @@ __FBSDID("$FreeBSD$");
* balloon lists.
*/
struct mtx balloon_lock;
-#ifdef notyet
-
-/* We aim for 'current allocation' == 'target allocation'. */
-static unsigned long current_pages;
-static unsigned long target_pages;
-/* VM /proc information for memory */
-extern unsigned long totalram_pages;
+/* We increase/decrease in batches which fit in a page */
+static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
+#define ARRAY_SIZE(A) (sizeof(A) / sizeof(A[0]))
+
+struct balloon_stats {
+ /* We aim for 'current allocation' == 'target allocation'. */
+ unsigned long current_pages;
+ unsigned long target_pages;
+ /* We may hit the hard limit in Xen. If we do then we remember it. */
+ unsigned long hard_limit;
+ /*
+ * Drivers may alter the memory reservation independently, but they
+ * must inform the balloon driver so we avoid hitting the hard limit.
+ */
+ unsigned long driver_pages;
+ /* Number of pages in high- and low-memory balloons. */
+ unsigned long balloon_low;
+ unsigned long balloon_high;
+};
-/* We may hit the hard limit in Xen. If we do then we remember it. */
-static unsigned long hard_limit;
-
-/*
- * Drivers may alter the memory reservation independently, but they must
- * inform the balloon driver so that we can avoid hitting the hard limit.
- */
-static unsigned long driver_pages;
+static struct balloon_stats balloon_stats;
+#define bs balloon_stats
+
+SYSCTL_DECL(_dev_xen);
+SYSCTL_NODE(_dev_xen, OID_AUTO, balloon, CTLFLAG_RD, NULL, "Balloon");
+SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, current, CTLFLAG_RD,
+ &bs.current_pages, 0, "Current allocation");
+SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, target, CTLFLAG_RD,
+ &bs.target_pages, 0, "Target allocation");
+SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, driver_pages, CTLFLAG_RD,
+ &bs.driver_pages, 0, "Driver pages");
+SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, hard_limit, CTLFLAG_RD,
+ &bs.hard_limit, 0, "Xen hard limit");
+SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, low_mem, CTLFLAG_RD,
+ &bs.balloon_low, 0, "Low-mem balloon");
+SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, high_mem, CTLFLAG_RD,
+ &bs.balloon_high, 0, "High-mem balloon");
struct balloon_entry {
vm_page_t page;
@@ -72,9 +106,6 @@ struct balloon_entry {
/* List of ballooned pages, threaded through the mem_map array. */
static STAILQ_HEAD(,balloon_entry) ballooned_pages;
-static unsigned long balloon_low, balloon_high;
-
-
/* Main work function, always executed in process context. */
static void balloon_process(void *unused);
@@ -89,10 +120,10 @@ balloon_append(vm_page_t page)
{
struct balloon_entry *entry;
- entry = malloc(sizeof(struct balloon_entry), M_WAITOK);
-
+ entry = malloc(sizeof(struct balloon_entry), M_BALLOON, M_WAITOK);
+ entry->page = page;
STAILQ_INSERT_HEAD(&ballooned_pages, entry, list);
- balloon_low++;
+ bs.balloon_low++;
}
/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
@@ -111,13 +142,13 @@ balloon_retrieve(void)
page = entry->page;
free(entry, M_DEVBUF);
- balloon_low--;
+ bs.balloon_low--;
return page;
}
static void
-balloon_alarm(unsigned long unused)
+balloon_alarm(void *unused)
{
wakeup(balloon_process);
}
@@ -125,17 +156,56 @@ balloon_alarm(unsigned long unused)
static unsigned long
current_target(void)
{
- unsigned long target = min(target_pages, hard_limit);
- if (target > (current_pages + balloon_low + balloon_high))
- target = current_pages + balloon_low + balloon_high;
+ unsigned long target = min(bs.target_pages, bs.hard_limit);
+ if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
+ target = bs.current_pages + bs.balloon_low + bs.balloon_high;
return target;
}
+static unsigned long
+minimum_target(void)
+{
+#ifdef XENHVM
+#define max_pfn physmem
+#endif
+ unsigned long min_pages, curr_pages = current_target();
+
+#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
+	/* Simple continuous piecewise linear function:
+ * max MiB -> min MiB gradient
+ * 0 0
+ * 16 16
+ * 32 24
+ * 128 72 (1/2)
+ * 512 168 (1/4)
+ * 2048 360 (1/8)
+ * 8192 552 (1/32)
+ * 32768 1320
+ * 131072 4392
+ */
+ if (max_pfn < MB2PAGES(128))
+ min_pages = MB2PAGES(8) + (max_pfn >> 1);
+ else if (max_pfn < MB2PAGES(512))
+ min_pages = MB2PAGES(40) + (max_pfn >> 2);
+ else if (max_pfn < MB2PAGES(2048))
+ min_pages = MB2PAGES(104) + (max_pfn >> 3);
+ else
+ min_pages = MB2PAGES(296) + (max_pfn >> 5);
+#undef MB2PAGES
+
+ /* Don't enforce growth */
+ return min(min_pages, curr_pages);
+#ifndef CONFIG_XEN
+#undef max_pfn
+#endif
+}
+
static int
increase_reservation(unsigned long nr_pages)
{
- unsigned long *mfn_list, pfn, i, flags;
- struct page *page;
+ unsigned long pfn, i;
+ struct balloon_entry *entry;
+ vm_page_t page;
long rc;
struct xen_memory_reservation reservation = {
.address_bits = 0,
@@ -143,64 +213,81 @@ increase_reservation(unsigned long nr_pages)
.domid = DOMID_SELF
};
- if (nr_pages > (PAGE_SIZE / sizeof(unsigned long)))
- nr_pages = PAGE_SIZE / sizeof(unsigned long);
+ if (nr_pages > ARRAY_SIZE(frame_list))
+ nr_pages = ARRAY_SIZE(frame_list);
- mfn_list = (unsigned long *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
- if (mfn_list == NULL)
- return ENOMEM;
+ mtx_lock(&balloon_lock);
+ for (entry = STAILQ_FIRST(&ballooned_pages), i = 0;
+ i < nr_pages; i++, entry = STAILQ_NEXT(entry, list)) {
+ KASSERT(entry, ("ballooned_pages list corrupt"));
+ page = entry->page;
+ frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
+ }
- reservation.extent_start = mfn_list;
+ set_xen_guest_handle(reservation.extent_start, frame_list);
reservation.nr_extents = nr_pages;
rc = HYPERVISOR_memory_op(
- XENMEM_increase_reservation, &reservation);
+ XENMEM_populate_physmap, &reservation);
if (rc < nr_pages) {
- int ret;
- /* We hit the Xen hard limit: reprobe. */
- reservation.extent_start = mfn_list;
- reservation.nr_extents = rc;
- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
- &reservation);
- PANIC_IF(ret != rc);
- hard_limit = current_pages + rc - driver_pages;
+ if (rc > 0) {
+ int ret;
+
+ /* We hit the Xen hard limit: reprobe. */
+ reservation.nr_extents = rc;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ KASSERT(ret == rc, ("HYPERVISOR_memory_op failed"));
+ }
+ if (rc >= 0)
+ bs.hard_limit = (bs.current_pages + rc -
+ bs.driver_pages);
goto out;
}
for (i = 0; i < nr_pages; i++) {
page = balloon_retrieve();
- PANIC_IF(page == NULL);
+ KASSERT(page, ("balloon_retrieve failed"));
pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
- PANIC_IF(phys_to_machine_mapping_valid(pfn));
+ KASSERT((xen_feature(XENFEAT_auto_translated_physmap) ||
+ !phys_to_machine_mapping_valid(pfn)),
+ ("auto translated physmap but mapping is valid"));
+
+ set_phys_to_machine(pfn, frame_list[i]);
+
+#ifndef XENHVM
+ /* Link back into the page tables if not highmem. */
+ if (pfn < max_low_pfn) {
+ int ret;
+ ret = HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte_ma(frame_list[i], PAGE_KERNEL),
+ 0);
+ PASSING(ret == 0,
+ ("HYPERVISOR_update_va_mapping failed"));
+ }
+#endif
- /* Update P->M and M->P tables. */
- PFNTOMFN(pfn) = mfn_list[i];
- xen_machphys_update(mfn_list[i], pfn);
-
/* Relinquish the page back to the allocator. */
- ClearPageReserved(page);
- set_page_count(page, 1);
+ vm_page_unwire(page, 0);
vm_page_free(page);
}
- current_pages += nr_pages;
- totalram_pages = current_pages;
+ bs.current_pages += nr_pages;
+ //totalram_pages = bs.current_pages;
out:
- balloon_unlock(flags);
-
- free((mfn_list);
+ mtx_unlock(&balloon_lock);
return 0;
}
-static int
+static int
decrease_reservation(unsigned long nr_pages)
{
- unsigned long *mfn_list, pfn, i, flags;
- struct page *page;
- void *v;
+ unsigned long pfn, i;
+ vm_page_t page;
int need_sleep = 0;
int ret;
struct xen_memory_reservation reservation = {
@@ -209,48 +296,68 @@ decrease_reservation(unsigned long nr_pages)
.domid = DOMID_SELF
};
- if (nr_pages > (PAGE_SIZE / sizeof(unsigned long)))
- nr_pages = PAGE_SIZE / sizeof(unsigned long);
-
- mfn_list = (unsigned long *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
- if (mfn_list == NULL)
- return ENOMEM;
+ if (nr_pages > ARRAY_SIZE(frame_list))
+ nr_pages = ARRAY_SIZE(frame_list);
for (i = 0; i < nr_pages; i++) {
int color = 0;
if ((page = vm_page_alloc(NULL, color++,
- VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
- VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
+ VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+ VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
nr_pages = i;
need_sleep = 1;
break;
}
+
pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
- mfn_list[i] = PFNTOMFN(pfn);
+ frame_list[i] = PFNTOMFN(pfn);
+
+#if 0
+ if (!PageHighMem(page)) {
+ v = phys_to_virt(pfn << PAGE_SHIFT);
+ scrub_pages(v, 1);
+#ifdef CONFIG_XEN
+ ret = HYPERVISOR_update_va_mapping(
+ (unsigned long)v, __pte_ma(0), 0);
+ BUG_ON(ret);
+#endif
+ }
+#endif
+#ifdef CONFIG_XEN_SCRUB_PAGES
+ else {
+ v = kmap(page);
+ scrub_pages(v, 1);
+ kunmap(page);
+ }
+#endif
}
- balloon_lock(flags);
+#ifdef CONFIG_XEN
+ /* Ensure that ballooned highmem pages don't have kmaps. */
+ kmap_flush_unused();
+ flush_tlb_all();
+#endif
+
+ mtx_lock(&balloon_lock);
/* No more mappings: invalidate P2M and add to balloon. */
for (i = 0; i < nr_pages; i++) {
- pfn = MFNTOPFN(mfn_list[i]);
- PFNTOMFN(pfn) = INVALID_P2M_ENTRY;
+ pfn = MFNTOPFN(frame_list[i]);
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
balloon_append(PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT));
}
- reservation.extent_start = mfn_list;
+ set_xen_guest_handle(reservation.extent_start, frame_list);
reservation.nr_extents = nr_pages;
ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
- PANIC_IF(ret != nr_pages);
-
- current_pages -= nr_pages;
- totalram_pages = current_pages;
+ KASSERT(ret == nr_pages, ("HYPERVISOR_memory_op failed"));
- balloon_unlock(flags);
+ bs.current_pages -= nr_pages;
+ //totalram_pages = bs.current_pages;
- free(mfn_list, M_DEVBUF);
+ mtx_unlock(&balloon_lock);
- return need_sleep;
+ return (need_sleep);
}
/*
@@ -265,27 +372,24 @@ balloon_process(void *unused)
int need_sleep = 0;
long credit;
+ mtx_lock(&balloon_mutex);
for (;;) {
do {
- credit = current_target() - current_pages;
+ credit = current_target() - bs.current_pages;
if (credit > 0)
need_sleep = (increase_reservation(credit) != 0);
if (credit < 0)
need_sleep = (decrease_reservation(-credit) != 0);
-#ifndef CONFIG_PREEMPT
- if (need_resched())
- schedule();
-#endif
} while ((credit != 0) && !need_sleep);
/* Schedule more work if there is some still to be done. */
- if (current_target() != current_pages)
- timeout(balloon_alarm, NULL, ticks + HZ);
+ if (current_target() != bs.current_pages)
+ timeout(balloon_alarm, NULL, ticks + hz);
- msleep(balloon_process, balloon_lock, 0, "balloon", -1);
+ msleep(balloon_process, &balloon_mutex, 0, "balloon", -1);
}
-
+ mtx_unlock(&balloon_mutex);
}
/* Resets the Xen limit, sets new target, and kicks off processing. */
@@ -293,8 +397,8 @@ static void
set_new_target(unsigned long target)
{
/* No need for lock. Not read-modify-write updates. */
- hard_limit = ~0UL;
- target_pages = target;
+ bs.hard_limit = ~0UL;
+ bs.target_pages = max(target, minimum_target());
wakeup(balloon_process);
}
@@ -311,8 +415,9 @@ watch_target(struct xenbus_watch *watch,
unsigned long long new_target;
int err;
- err = xenbus_scanf(NULL, "memory", "target", "%llu", &new_target);
- if (err != 1) {
+ err = xenbus_scanf(XBT_NIL, "memory", "target", NULL,
+ "%llu", &new_target);
+ if (err) {
/* This is ok (for domain0 at least) - so just return */
return;
}
@@ -325,7 +430,7 @@ watch_target(struct xenbus_watch *watch,
}
static void
-balloon_init_watcher(void *)
+balloon_init_watcher(void *arg)
{
int err;
@@ -334,48 +439,60 @@ balloon_init_watcher(void *)
printf("Failed to set balloon watcher\n");
}
+SYSINIT(balloon_init_watcher, SI_SUB_PSEUDO, SI_ORDER_ANY,
+ balloon_init_watcher, NULL);
static void
-balloon_init(void *)
+balloon_init(void *arg)
{
- unsigned long pfn;
- struct page *page;
-
- IPRINTK("Initialising balloon driver.\n");
+#ifndef XENHVM
+ vm_page_t page;
+#endif
- if (xen_init() < 0)
- return -1;
+ if (!is_running_on_xen())
+ return;
- current_pages = min(xen_start_info->nr_pages, max_pfn);
- target_pages = current_pages;
- balloon_low = 0;
- balloon_high = 0;
- driver_pages = 0UL;
- hard_limit = ~0UL;
+ mtx_init(&balloon_lock, "balloon_lock", NULL, MTX_DEF);
+ mtx_init(&balloon_mutex, "balloon_mutex", NULL, MTX_DEF);
- init_timer(&balloon_timer);
- balloon_timer.data = 0;
- balloon_timer.function = balloon_alarm;
+#ifndef XENHVM
+ bs.current_pages = min(xen_start_info->nr_pages, max_pfn);
+#else
+ bs.current_pages = physmem;
+#endif
+ bs.target_pages = bs.current_pages;
+ bs.balloon_low = 0;
+ bs.balloon_high = 0;
+ bs.driver_pages = 0UL;
+ bs.hard_limit = ~0UL;
+
+ kproc_create(balloon_process, NULL, NULL, 0, 0, "balloon");
+// init_timer(&balloon_timer);
+// balloon_timer.data = 0;
+// balloon_timer.function = balloon_alarm;
+#ifndef XENHVM
/* Initialise the balloon with excess memory space. */
for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
page = PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT);
balloon_append(page);
}
+#endif
target_watch.callback = watch_target;
- return 0;
+ return;
}
+SYSINIT(balloon_init, SI_SUB_PSEUDO, SI_ORDER_ANY, balloon_init, NULL);
+
+void balloon_update_driver_allowance(long delta);
void
balloon_update_driver_allowance(long delta)
{
- unsigned long flags;
-
- balloon_lock(flags);
- driver_pages += delta;
- balloon_unlock(flags);
+ mtx_lock(&balloon_lock);
+ bs.driver_pages += delta;
+ mtx_unlock(&balloon_lock);
}
#if 0
@@ -393,17 +510,18 @@ static int dealloc_pte_fn(
set_pte_at(&init_mm, addr, pte, __pte_ma(0));
set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
- PANIC_IF(ret != 1);
+ KASSERT(ret == 1, ("HYPERVISOR_memory_op failed"));
return 0;
}
#endif
+
+#if 0
vm_page_t
balloon_alloc_empty_page_range(unsigned long nr_pages)
{
- unsigned long flags;
vm_page_t pages;
- int i;
+ int i, rc;
unsigned long *mfn_list;
struct xen_memory_reservation reservation = {
.address_bits = 0,
@@ -422,7 +540,9 @@ balloon_alloc_empty_page_range(unsigned long nr_pages)
PFNTOMFN(i) = INVALID_P2M_ENTRY;
reservation.extent_start = mfn_list;
reservation.nr_extents = nr_pages;
- PANIC_IF(HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) != nr_pages);
+ rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ KASSERT(rc == nr_pages, ("HYPERVISOR_memory_op failed"));
}
current_pages -= nr_pages;
@@ -435,12 +555,11 @@ balloon_alloc_empty_page_range(unsigned long nr_pages)
void
balloon_dealloc_empty_page_range(vm_page_t page, unsigned long nr_pages)
{
- unsigned long i, flags;
+ unsigned long i;
for (i = 0; i < nr_pages; i++)
balloon_append(page + i);
wakeup(balloon_process);
}
-
#endif
diff --git a/sys/dev/xen/blkfront/blkfront.c b/sys/dev/xen/blkfront/blkfront.c
index fdebc9d34b2c..01493a6a8595 100644
--- a/sys/dev/xen/blkfront/blkfront.c
+++ b/sys/dev/xen/blkfront/blkfront.c
@@ -40,17 +40,17 @@ __FBSDID("$FreeBSD$");
#include <machine/intr_machdep.h>
#include <machine/vmparam.h>
-#include <xen/hypervisor.h>
#include <machine/xen/xen-os.h>
+#include <machine/xen/xenfunc.h>
+#include <xen/hypervisor.h>
#include <xen/xen_intr.h>
#include <xen/evtchn.h>
+#include <xen/gnttab.h>
#include <xen/interface/grant_table.h>
#include <xen/interface/io/protocols.h>
#include <xen/xenbus/xenbusvar.h>
#include <geom/geom_disk.h>
-#include <machine/xen/xenfunc.h>
-#include <xen/gnttab.h>
#include <dev/xen/blkfront/block.h>
@@ -106,7 +106,7 @@ static char * blkif_status_name[] = {
#endif
#define WPRINTK(fmt, args...) printf("[XEN] " fmt, ##args)
#if 0
-#define DPRINTK(fmt, args...) printf("[XEN] %s:%d" fmt ".\n", __FUNCTION__, __LINE__,##args)
+#define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args)
#else
#define DPRINTK(fmt, args...)
#endif
@@ -138,7 +138,6 @@ pfn_to_mfn(vm_paddr_t pfn)
return (phystomach(pfn << PAGE_SHIFT) >> PAGE_SHIFT);
}
-
/*
* Translate Linux major/minor to an appropriate name and unit
* number. For HVM guests, this allows us to use the same drive names
@@ -323,17 +322,17 @@ blkfront_probe(device_t dev)
static int
blkfront_attach(device_t dev)
{
- int err, vdevice, i, unit;
+ int error, vdevice, i, unit;
struct blkfront_info *info;
const char *name;
/* FIXME: Use dynamic device id if this is not set. */
- err = xenbus_scanf(XBT_NIL, xenbus_get_node(dev),
+ error = xenbus_scanf(XBT_NIL, xenbus_get_node(dev),
"virtual-device", NULL, "%i", &vdevice);
- if (err) {
- xenbus_dev_fatal(dev, err, "reading virtual-device");
+ if (error) {
+ xenbus_dev_fatal(dev, error, "reading virtual-device");
printf("couldn't find virtual device");
- return (err);
+ return (error);
}
blkfront_vdevice_to_unit(vdevice, &unit, &name);
@@ -362,9 +361,22 @@ blkfront_attach(device_t dev)
/* Front end dir is a number, which is used as the id. */
info->handle = strtoul(strrchr(xenbus_get_node(dev),'/')+1, NULL, 0);
- err = talk_to_backend(dev, info);
- if (err)
- return (err);
+ error = talk_to_backend(dev, info);
+ if (error)
+ return (error);
+
+ return (0);
+}
+
+static int
+blkfront_suspend(device_t dev)
+{
+ struct blkfront_info *info = device_get_softc(dev);
+
+ /* Prevent new requests being issued until we fix things up. */
+ mtx_lock(&blkif_io_lock);
+ info->connected = BLKIF_STATE_SUSPENDED;
+ mtx_unlock(&blkif_io_lock);
return (0);
}
@@ -375,16 +387,14 @@ blkfront_resume(device_t dev)
struct blkfront_info *info = device_get_softc(dev);
int err;
- DPRINTK("blkfront_resume: %s\n", dev->nodename);
+ DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev));
blkif_free(info, 1);
-
err = talk_to_backend(dev, info);
-
if (info->connected == BLKIF_STATE_SUSPENDED && !err)
blkif_recover(info);
- return err;
+ return (err);
}
/* Common code used when first setting up, and when resuming. */
@@ -425,6 +435,7 @@ talk_to_backend(device_t dev, struct blkfront_info *info)
message = "writing protocol";
goto abort_transaction;
}
+
err = xenbus_transaction_end(xbt, 0);
if (err) {
if (err == EAGAIN)
@@ -462,8 +473,8 @@ setup_blkring(device_t dev, struct blkfront_info *info)
SHARED_RING_INIT(sring);
FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
- error = xenbus_grant_ring(dev, (vtomach(info->ring.sring) >> PAGE_SHIFT),
- &info->ring_ref);
+ error = xenbus_grant_ring(dev,
+ (vtomach(info->ring.sring) >> PAGE_SHIFT), &info->ring_ref);
if (error) {
free(sring, M_DEVBUF);
info->ring.sring = NULL;
@@ -471,11 +482,11 @@ setup_blkring(device_t dev, struct blkfront_info *info)
}
error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(dev),
- "xbd", (driver_intr_t *)blkif_int, info,
- INTR_TYPE_BIO | INTR_MPSAFE, &info->irq);
+ "xbd", (driver_intr_t *)blkif_int, info,
+ INTR_TYPE_BIO | INTR_MPSAFE, &info->irq);
if (error) {
xenbus_dev_fatal(dev, error,
- "bind_evtchn_to_irqhandler failed");
+ "bind_evtchn_to_irqhandler failed");
goto fail;
}
@@ -494,7 +505,7 @@ blkfront_backend_changed(device_t dev, XenbusState backend_state)
{
struct blkfront_info *info = device_get_softc(dev);
- DPRINTK("blkfront:backend_changed.\n");
+ DPRINTK("backend_state=%d\n", backend_state);
switch (backend_state) {
case XenbusStateUnknown:
@@ -707,7 +718,7 @@ blkif_open(struct disk *dp)
struct xb_softc *sc = (struct xb_softc *)dp->d_drv1;
if (sc == NULL) {
- printk("xb%d: not found", sc->xb_unit);
+ printf("xb%d: not found", sc->xb_unit);
return (ENXIO);
}
@@ -1019,9 +1030,11 @@ blkif_recover(struct blkfront_info *info)
blkif_request_t *req;
struct blk_shadow *copy;
+ if (!info->sc)
+ return;
+
/* Stage 1: Make a safe copy of the shadow state. */
copy = (struct blk_shadow *)malloc(sizeof(info->shadow), M_DEVBUF, M_NOWAIT|M_ZERO);
- PANIC_IF(copy == NULL);
memcpy(copy, info->shadow, sizeof(info->shadow));
/* Stage 2: Set up free list. */
@@ -1084,7 +1097,7 @@ static device_method_t blkfront_methods[] = {
DEVMETHOD(device_attach, blkfront_attach),
DEVMETHOD(device_detach, blkfront_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
- DEVMETHOD(device_suspend, bus_generic_suspend),
+ DEVMETHOD(device_suspend, blkfront_suspend),
DEVMETHOD(device_resume, blkfront_resume),
/* Xenbus interface */
diff --git a/sys/dev/xen/console/console.c b/sys/dev/xen/console/console.c
index a3d616a74856..0634dadada30 100644
--- a/sys/dev/xen/console/console.c
+++ b/sys/dev/xen/console/console.c
@@ -5,6 +5,7 @@ __FBSDID("$FreeBSD$");
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/consio.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/tty.h>
@@ -18,7 +19,7 @@ __FBSDID("$FreeBSD$");
#include <xen/hypervisor.h>
#include <xen/xen_intr.h>
#include <sys/cons.h>
-#include <sys/priv.h>
+#include <sys/kdb.h>
#include <sys/proc.h>
#include <dev/xen/console/xencons_ring.h>
@@ -125,12 +126,17 @@ xccngetc(struct consdev *dev)
return 0;
do {
if ((c = xccncheckc(dev)) == -1) {
- /* polling without sleeping in Xen doesn't work well.
- * Sleeping gives other things like clock a chance to
- * run
- */
- tsleep(&cn_mtx, PWAIT | PCATCH, "console sleep",
- XC_POLLTIME);
+#ifdef KDB
+ if (!kdb_active)
+#endif
+ /*
+ * Polling without sleeping in Xen
+ * doesn't work well. Sleeping gives
+ * other things like clock a chance to
+ * run
+ */
+ tsleep(&cn_mtx, PWAIT | PCATCH,
+ "console sleep", XC_POLLTIME);
}
} while(c == -1);
return c;
@@ -140,11 +146,13 @@ int
xccncheckc(struct consdev *dev)
{
int ret = (xc_mute ? 0 : -1);
- if (xencons_has_input())
- xencons_handle_input(NULL);
+
+ if (xencons_has_input())
+ xencons_handle_input(NULL);
CN_LOCK(cn_mtx);
if ((rp - rc)) {
+ if (kdb_active) printf("%s:%d\n", __func__, __LINE__);
/* we need to return only one char */
ret = (int)rbuf[RBUF_MASK(rc)];
rc++;
@@ -235,17 +243,16 @@ xc_attach(device_t dev)
if (xen_start_info->flags & SIF_INITDOMAIN) {
error = bind_virq_to_irqhandler(
- VIRQ_CONSOLE,
- 0,
- "console",
- NULL,
- xencons_priv_interrupt,
- sc, INTR_TYPE_TTY, NULL);
+ VIRQ_CONSOLE,
+ 0,
+ "console",
+ NULL,
+ xencons_priv_interrupt,
+ INTR_TYPE_TTY, NULL);
KASSERT(error >= 0, ("can't register console interrupt"));
}
-
/* register handler to flush console on shutdown */
if ((EVENTHANDLER_REGISTER(shutdown_post_sync, xc_shutdown,
NULL, SHUTDOWN_PRI_DEFAULT)) == NULL)
@@ -270,7 +277,11 @@ xencons_rx(char *buf, unsigned len)
int i;
struct tty *tp = xccons;
- if (xen_console_up) {
+ if (xen_console_up
+#ifdef DDB
+ && !kdb_active
+#endif
+ ) {
tty_lock(tp);
for (i = 0; i < len; i++)
ttydisc_rint(tp, buf[i], 0);
@@ -423,12 +434,3 @@ xcons_force_flush(void)
}
DRIVER_MODULE(xc, nexus, xc_driver, xc_devclass, 0, 0);
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 8
- * tab-width: 4
- * indent-tabs-mode: t
- * End:
- */
diff --git a/sys/dev/xen/console/xencons_ring.c b/sys/dev/xen/console/xencons_ring.c
index 596b5de48d36..fc9522e4f44b 100644
--- a/sys/dev/xen/console/xencons_ring.c
+++ b/sys/dev/xen/console/xencons_ring.c
@@ -13,19 +13,24 @@ __FBSDID("$FreeBSD$");
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/bus.h>
+#include <sys/cons.h>
+
#include <machine/stdarg.h>
#include <machine/xen/xen-os.h>
#include <xen/hypervisor.h>
#include <xen/xen_intr.h>
#include <sys/cons.h>
+#include <xen/xen_intr.h>
+#include <xen/evtchn.h>
+#include <xen/interface/io/console.h>
#include <dev/xen/console/xencons_ring.h>
#include <xen/evtchn.h>
#include <xen/interface/io/console.h>
-
#define console_evtchn console.domU.evtchn
+static unsigned int console_irq;
extern char *console_page;
extern struct mtx cn_mtx;
@@ -60,7 +65,8 @@ xencons_ring_send(const char *data, unsigned len)
sent = 0;
mb();
- PANIC_IF((prod - cons) > sizeof(intf->out));
+ KASSERT((prod - cons) <= sizeof(intf->out),
+ ("console send ring inconsistent"));
while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
@@ -119,15 +125,18 @@ xencons_ring_init(void)
return 0;
err = bind_caller_port_to_irqhandler(xen_start_info->console_evtchn,
- "xencons", xencons_handle_input, NULL,
- INTR_TYPE_MISC | INTR_MPSAFE, NULL);
+ "xencons", xencons_handle_input, NULL,
+ INTR_TYPE_MISC | INTR_MPSAFE, &console_irq);
if (err) {
return err;
}
return 0;
}
-#ifdef notyet
+
+extern void xencons_suspend(void);
+extern void xencons_resume(void);
+
void
xencons_suspend(void)
{
@@ -135,7 +144,7 @@ xencons_suspend(void)
if (!xen_start_info->console_evtchn)
return;
- unbind_evtchn_from_irqhandler(xen_start_info->console_evtchn, NULL);
+ unbind_from_irqhandler(console_irq);
}
void
@@ -144,7 +153,7 @@ xencons_resume(void)
(void)xencons_ring_init();
}
-#endif
+
/*
* Local variables:
* mode: C
diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c
index dbf50137b0b8..a70c47cffd82 100644
--- a/sys/dev/xen/netfront/netfront.c
+++ b/sys/dev/xen/netfront/netfront.c
@@ -24,11 +24,11 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
-#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/socket.h>
+#include <sys/sysctl.h>
#include <sys/queue.h>
#include <sys/sx.h>
@@ -47,6 +47,10 @@ __FBSDID("$FreeBSD$");
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/if_ether.h>
+#if __FreeBSD_version >= 700000
+#include <netinet/tcp.h>
+#include <netinet/tcp_lro.h>
+#endif
#include <vm/vm.h>
#include <vm/pmap.h>
@@ -63,23 +67,42 @@ __FBSDID("$FreeBSD$");
#include <machine/intr_machdep.h>
#include <machine/xen/xen-os.h>
+#include <machine/xen/xenfunc.h>
#include <xen/hypervisor.h>
#include <xen/xen_intr.h>
#include <xen/evtchn.h>
#include <xen/gnttab.h>
#include <xen/interface/memory.h>
-#include <dev/xen/netfront/mbufq.h>
-#include <machine/xen/features.h>
#include <xen/interface/io/netif.h>
#include <xen/xenbus/xenbusvar.h>
+#include <dev/xen/netfront/mbufq.h>
+
#include "xenbus_if.h"
+#define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP | CSUM_TSO)
+
#define GRANT_INVALID_REF 0
#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
+#if __FreeBSD_version >= 700000
+/*
+ * Should the driver do LRO on the RX end
+ * this can be toggled on the fly, but the
+ * interface must be reset (down/up) for it
+ * to take effect.
+ */
+static int xn_enable_lro = 1;
+TUNABLE_INT("hw.xn.enable_lro", &xn_enable_lro);
+#else
+
+#define IFCAP_TSO4 0
+#define CSUM_TSO 0
+
+#endif
+
#ifdef CONFIG_XEN
static int MODPARM_rx_copy = 0;
module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
@@ -92,6 +115,7 @@ static const int MODPARM_rx_copy = 1;
static const int MODPARM_rx_flip = 0;
#endif
+#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 2)
#define RX_COPY_THRESHOLD 256
#define net_ratelimit() 0
@@ -192,6 +216,9 @@ struct net_device_stats
struct netfront_info {
struct ifnet *xn_ifp;
+#if __FreeBSD_version >= 700000
+ struct lro_ctrl xn_lro;
+#endif
struct net_device_stats stats;
u_int tx_full;
@@ -329,31 +356,12 @@ xennet_get_rx_ref(struct netfront_info *np, RING_IDX ri)
printf("[XEN] " fmt, ##args)
#define WPRINTK(fmt, args...) \
printf("[XEN] " fmt, ##args)
+#if 0
#define DPRINTK(fmt, args...) \
printf("[XEN] %s: " fmt, __func__, ##args)
-
-
-static __inline struct mbuf*
-makembuf (struct mbuf *buf)
-{
- struct mbuf *m = NULL;
-
- MGETHDR (m, M_DONTWAIT, MT_DATA);
-
- if (! m)
- return 0;
-
- M_MOVE_PKTHDR(m, buf);
-
- m_cljget(m, M_DONTWAIT, MJUMPAGESIZE);
- m->m_pkthdr.len = buf->m_pkthdr.len;
- m->m_len = buf->m_len;
- m_copydata(buf, 0, buf->m_pkthdr.len, mtod(m,caddr_t) );
-
- m->m_ext.ext_arg1 = (caddr_t *)(uintptr_t)(vtophys(mtod(m,caddr_t)) >> PAGE_SHIFT);
-
- return m;
-}
+#else
+#define DPRINTK(fmt, args...)
+#endif
/**
* Read the 'mac' node at the given device's node in the store, and parse that
@@ -414,6 +422,13 @@ netfront_attach(device_t dev)
return err;
}
+#if __FreeBSD_version >= 700000
+ SYSCTL_ADD_INT(device_get_sysctl_ctx(dev),
+ SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+ OID_AUTO, "enable_lro", CTLTYPE_INT|CTLFLAG_RW,
+ &xn_enable_lro, 0, "Large Receive Offload");
+#endif
+
return 0;
}
@@ -489,17 +504,12 @@ talk_to_backend(device_t dev, struct netfront_info *info)
message = "writing feature-rx-notify";
goto abort_transaction;
}
- err = xenbus_printf(xbt, node, "feature-no-csum-offload", "%d", 1);
- if (err) {
- message = "writing feature-no-csum-offload";
- goto abort_transaction;
- }
err = xenbus_printf(xbt, node, "feature-sg", "%d", 1);
if (err) {
message = "writing feature-sg";
goto abort_transaction;
}
-#ifdef HAVE_TSO
+#if __FreeBSD_version >= 700000
err = xenbus_printf(xbt, node, "feature-gso-tcpv4", "%d", 1);
if (err) {
message = "writing feature-gso-tcpv4";
@@ -569,7 +579,7 @@ setup_device(device_t dev, struct netfront_info *info)
goto fail;
error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(dev),
- "xn", xn_intr, info, INTR_TYPE_NET | INTR_MPSAFE, &info->irq);
+ "xn", xn_intr, info, INTR_TYPE_NET | INTR_MPSAFE, &info->irq);
if (error) {
xenbus_dev_fatal(dev, error,
@@ -587,6 +597,24 @@ setup_device(device_t dev, struct netfront_info *info)
}
/**
+ * If this interface has an ipv4 address, send an arp for it. This
+ * helps to get the network going again after migrating hosts.
+ */
+static void
+netfront_send_fake_arp(device_t dev, struct netfront_info *info)
+{
+ struct ifnet *ifp;
+ struct ifaddr *ifa;
+
+ ifp = info->xn_ifp;
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family == AF_INET) {
+ arp_ifinit(ifp, ifa);
+ }
+ }
+}
+
+/**
* Callback received when the backend's state changes.
*/
static void
@@ -611,9 +639,7 @@ netfront_backend_changed(device_t dev, XenbusState newstate)
if (network_connect(sc) != 0)
break;
xenbus_set_state(dev, XenbusStateConnected);
-#ifdef notyet
- (void)send_fake_arp(netdev);
-#endif
+ netfront_send_fake_arp(dev, sc);
break;
case XenbusStateClosing:
xenbus_set_state(dev, XenbusStateClosed);
@@ -851,6 +877,10 @@ static void
xn_rxeof(struct netfront_info *np)
{
struct ifnet *ifp;
+#if __FreeBSD_version >= 700000
+ struct lro_ctrl *lro = &np->xn_lro;
+ struct lro_entry *queued;
+#endif
struct netfront_rx_info rinfo;
struct netif_rx_response *rx = &rinfo.rx;
struct netif_extra_info *extras = rinfo.extras;
@@ -945,13 +975,35 @@ xn_rxeof(struct netfront_info *np)
* Do we really need to drop the rx lock?
*/
XN_RX_UNLOCK(np);
- /* Pass it up. */
+#if __FreeBSD_version >= 700000
+ /* Use LRO if possible */
+ if ((ifp->if_capenable & IFCAP_LRO) == 0 ||
+ lro->lro_cnt == 0 || tcp_lro_rx(lro, m, 0)) {
+ /*
+ * If LRO fails, pass up to the stack
+ * directly.
+ */
+ (*ifp->if_input)(ifp, m);
+ }
+#else
(*ifp->if_input)(ifp, m);
+#endif
XN_RX_LOCK(np);
}
np->rx.rsp_cons = i;
+#if __FreeBSD_version >= 700000
+ /*
+ * Flush any outstanding LRO work
+ */
+ while (!SLIST_EMPTY(&lro->lro_active)) {
+ queued = SLIST_FIRST(&lro->lro_active);
+ SLIST_REMOVE_HEAD(&lro->lro_active, next);
+ tcp_lro_flush(lro, queued);
+ }
+#endif
+
#if 0
/* If we get a callback with very few responses, reduce fill target. */
/* NB. Note exponential increase, linear decrease. */
@@ -972,6 +1024,7 @@ xn_txeof(struct netfront_info *np)
RING_IDX i, prod;
unsigned short id;
struct ifnet *ifp;
+ netif_tx_response_t *txr;
struct mbuf *m;
XN_TX_LOCK_ASSERT(np);
@@ -987,10 +1040,19 @@ xn_txeof(struct netfront_info *np)
rmb(); /* Ensure we see responses up to 'rp'. */
for (i = np->tx.rsp_cons; i != prod; i++) {
- id = RING_GET_RESPONSE(&np->tx, i)->id;
+ txr = RING_GET_RESPONSE(&np->tx, i);
+ if (txr->status == NETIF_RSP_NULL)
+ continue;
+
+ id = txr->id;
m = np->xn_cdata.xn_tx_chain[id];
- ifp->if_opackets++;
+ /*
+ * Increment packet count if this is the last
+ * mbuf of the chain.
+ */
+ if (!m->m_next)
+ ifp->if_opackets++;
KASSERT(m != NULL, ("mbuf not found in xn_tx_chain"));
M_ASSERTVALID(m);
if (unlikely(gnttab_query_foreign_access(
@@ -1008,7 +1070,7 @@ xn_txeof(struct netfront_info *np)
np->xn_cdata.xn_tx_chain[id] = NULL;
add_id_to_freelist(np->xn_cdata.xn_tx_chain, id);
- m_freem(m);
+ m_free(m);
}
np->tx.rsp_cons = prod;
@@ -1235,12 +1297,11 @@ xennet_get_responses(struct netfront_info *np,
gnttab_release_grant_reference(&np->gref_rx_head, ref);
next:
- if (m == NULL)
- break;
-
- m->m_len = rx->status;
- m->m_data += rx->offset;
- m0->m_pkthdr.len += rx->status;
+ if (m != NULL) {
+ m->m_len = rx->status;
+ m->m_data += rx->offset;
+ m0->m_pkthdr.len += rx->status;
+ }
if (!(rx->flags & NETRXF_more_data))
break;
@@ -1304,13 +1365,14 @@ xn_start_locked(struct ifnet *ifp)
{
int otherend_id;
unsigned short id;
- struct mbuf *m_head, *new_m;
+ struct mbuf *m_head, *m;
struct netfront_info *sc;
netif_tx_request_t *tx;
+ netif_extra_info_t *extra;
RING_IDX i;
grant_ref_t ref;
u_long mfn, tx_bytes;
- int notify;
+ int notify, nfrags;
sc = ifp->if_softc;
otherend_id = xenbus_get_otherend_id(sc->xbdev);
@@ -1330,36 +1392,96 @@ xn_start_locked(struct ifnet *ifp)
break;
}
- id = get_id_from_freelist(sc->xn_cdata.xn_tx_chain);
+
+ /*
+ * Defragment the mbuf if necessary.
+ */
+ for (m = m_head, nfrags = 0; m; m = m->m_next)
+ nfrags++;
+ if (nfrags > MAX_SKB_FRAGS) {
+ m = m_defrag(m_head, M_DONTWAIT);
+ if (!m) {
+ m_freem(m_head);
+ break;
+ }
+ m_head = m;
+ }
/*
* Start packing the mbufs in this chain into
* the fragment pointers. Stop when we run out
* of fragments or hit the end of the mbuf chain.
*/
- new_m = makembuf(m_head);
- tx = RING_GET_REQUEST(&sc->tx, i);
- tx->id = id;
- ref = gnttab_claim_grant_reference(&sc->gref_tx_head);
- KASSERT((short)ref >= 0, ("Negative ref"));
- mfn = virt_to_mfn(mtod(new_m, vm_offset_t));
- gnttab_grant_foreign_access_ref(ref, otherend_id,
- mfn, GNTMAP_readonly);
- tx->gref = sc->grant_tx_ref[id] = ref;
- tx->size = new_m->m_pkthdr.len;
-#if 0
- tx->flags = (skb->ip_summed == CHECKSUM_HW) ? NETTXF_csum_blank : 0;
-#endif
- tx->flags = 0;
- new_m->m_next = NULL;
- new_m->m_nextpkt = NULL;
+ m = m_head;
+ extra = NULL;
+ for (m = m_head; m; m = m->m_next) {
+ tx = RING_GET_REQUEST(&sc->tx, i);
+ id = get_id_from_freelist(sc->xn_cdata.xn_tx_chain);
+ sc->xn_cdata.xn_tx_chain[id] = m;
+ tx->id = id;
+ ref = gnttab_claim_grant_reference(&sc->gref_tx_head);
+ KASSERT((short)ref >= 0, ("Negative ref"));
+ mfn = virt_to_mfn(mtod(m, vm_offset_t));
+ gnttab_grant_foreign_access_ref(ref, otherend_id,
+ mfn, GNTMAP_readonly);
+ tx->gref = sc->grant_tx_ref[id] = ref;
+ tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1);
+ tx->flags = 0;
+ if (m == m_head) {
+ /*
+ * The first fragment has the entire packet
+ * size, subsequent fragments have just the
+ * fragment size. The backend works out the
+ * true size of the first fragment by
+ * subtracting the sizes of the other
+ * fragments.
+ */
+ tx->size = m->m_pkthdr.len;
- m_freem(m_head);
+ /*
+ * The first fragment contains the
+ * checksum flags and is optionally
+ * followed by extra data for TSO etc.
+ */
+ if (m->m_pkthdr.csum_flags
+ & CSUM_DELAY_DATA) {
+ tx->flags |= (NETTXF_csum_blank
+ | NETTXF_data_validated);
+ }
+#if __FreeBSD_version >= 700000
+ if (m->m_pkthdr.csum_flags & CSUM_TSO) {
+ struct netif_extra_info *gso =
+ (struct netif_extra_info *)
+ RING_GET_REQUEST(&sc->tx, ++i);
+
+ if (extra)
+ extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
+ else
+ tx->flags |= NETTXF_extra_info;
+
+ gso->u.gso.size = m->m_pkthdr.tso_segsz;
+ gso->u.gso.type =
+ XEN_NETIF_GSO_TYPE_TCPV4;
+ gso->u.gso.pad = 0;
+ gso->u.gso.features = 0;
+
+ gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+ gso->flags = 0;
+ extra = gso;
+ }
+#endif
+ } else {
+ tx->size = m->m_len;
+ }
+ if (m->m_next) {
+ tx->flags |= NETTXF_more_data;
+ i++;
+ }
+ }
- sc->xn_cdata.xn_tx_chain[id] = new_m;
- BPF_MTAP(ifp, new_m);
+ BPF_MTAP(ifp, m_head);
- sc->stats.tx_bytes += new_m->m_pkthdr.len;
+ sc->stats.tx_bytes += m_head->m_pkthdr.len;
sc->stats.tx_packets++;
}
@@ -1445,9 +1567,9 @@ xn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
xn_ifinit_locked(sc);
arp_ifinit(ifp, ifa);
- XN_UNLOCK(sc);
+ XN_UNLOCK(sc);
} else {
- XN_UNLOCK(sc);
+ XN_UNLOCK(sc);
error = ether_ioctl(ifp, cmd, data);
}
break;
@@ -1501,12 +1623,39 @@ xn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
break;
case SIOCSIFCAP:
mask = ifr->ifr_reqcap ^ ifp->if_capenable;
- if (mask & IFCAP_HWCSUM) {
- if (IFCAP_HWCSUM & ifp->if_capenable)
- ifp->if_capenable &= ~IFCAP_HWCSUM;
- else
- ifp->if_capenable |= IFCAP_HWCSUM;
+ if (mask & IFCAP_TXCSUM) {
+ if (IFCAP_TXCSUM & ifp->if_capenable) {
+ ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
+ ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
+ | CSUM_IP | CSUM_TSO);
+ } else {
+ ifp->if_capenable |= IFCAP_TXCSUM;
+ ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP
+ | CSUM_IP);
+ }
}
+ if (mask & IFCAP_RXCSUM) {
+ ifp->if_capenable ^= IFCAP_RXCSUM;
+ }
+#if __FreeBSD_version >= 700000
+ if (mask & IFCAP_TSO4) {
+ if (IFCAP_TSO4 & ifp->if_capenable) {
+ ifp->if_capenable &= ~IFCAP_TSO4;
+ ifp->if_hwassist &= ~CSUM_TSO;
+ } else if (IFCAP_TXCSUM & ifp->if_capenable) {
+ ifp->if_capenable |= IFCAP_TSO4;
+ ifp->if_hwassist |= CSUM_TSO;
+ } else {
+ DPRINTK("Xen requires tx checksum offload"
+ " be enabled to use TSO\n");
+ error = EINVAL;
+ }
+ }
+ if (mask & IFCAP_LRO) {
+ ifp->if_capenable ^= IFCAP_LRO;
+
+ }
+#endif
error = 0;
break;
case SIOCADDMULTI:
@@ -1715,11 +1864,21 @@ create_netdev(device_t dev)
ifp->if_mtu = ETHERMTU;
ifp->if_snd.ifq_maxlen = NET_TX_RING_SIZE - 1;
-#ifdef notyet
ifp->if_hwassist = XN_CSUM_FEATURES;
ifp->if_capabilities = IFCAP_HWCSUM;
+#if __FreeBSD_version >= 700000
+ ifp->if_capabilities |= IFCAP_TSO4;
+ if (xn_enable_lro) {
+ int err = tcp_lro_init(&np->xn_lro);
+ if (err) {
+ device_printf(dev, "LRO initialization failed\n");
+ goto exit;
+ }
+ np->xn_lro.ifp = ifp;
+ ifp->if_capabilities |= IFCAP_LRO;
+ }
+#endif
ifp->if_capenable = ifp->if_capabilities;
-#endif
ether_ifattach(ifp, np->mac);
callout_init(&np->xn_stat_ch, CALLOUT_MPSAFE);
diff --git a/sys/dev/xen/xenpci/evtchn.c b/sys/dev/xen/xenpci/evtchn.c
new file mode 100644
index 000000000000..bdf3ad155722
--- /dev/null
+++ b/sys/dev/xen/xenpci/evtchn.c
@@ -0,0 +1,418 @@
+/******************************************************************************
+ * evtchn.c
+ *
+ * A simplified event channel for para-drivers in unmodified linux
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ * Copyright (c) 2005, Intel Corporation <xiaofeng.ling@intel.com>
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/interrupt.h>
+#include <sys/pcpu.h>
+
+#include <machine/xen/xen-os.h>
+#include <machine/xen/xenvar.h>
+#include <xen/hypervisor.h>
+#include <xen/xen_intr.h>
+#include <xen/evtchn.h>
+#include <sys/smp.h>
+
+#include <dev/xen/xenpci/xenpcivar.h>
+
+static inline unsigned long __ffs(unsigned long word)
+{
+ __asm__("bsfq %1,%0"
+ :"=r" (word)
+ :"rm" (word));
+ return word;
+}
+
+#define is_valid_evtchn(x) ((x) != 0)
+#define evtchn_from_irq(x) (irq_evtchn[irq].evtchn)
+
+static struct {
+ struct mtx lock;
+ driver_intr_t *handler;
+ void *arg;
+ int evtchn;
+ int close:1; /* close on unbind_from_irqhandler()? */
+ int inuse:1;
+ int in_handler:1;
+ int mpsafe:1;
+} irq_evtchn[256];
+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+ [0 ... NR_EVENT_CHANNELS-1] = -1 };
+
+static struct mtx irq_alloc_lock;
+static device_t xenpci_device;
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
+
+static unsigned int
+alloc_xen_irq(void)
+{
+ static int warned;
+ unsigned int irq;
+
+ mtx_lock(&irq_alloc_lock);
+
+ for (irq = 1; irq < ARRAY_SIZE(irq_evtchn); irq++) {
+ if (irq_evtchn[irq].inuse)
+ continue;
+ irq_evtchn[irq].inuse = 1;
+ mtx_unlock(&irq_alloc_lock);
+ return irq;
+ }
+
+ if (!warned) {
+ warned = 1;
+ printf("alloc_xen_irq: No available IRQ to bind to: "
+ "increase irq_evtchn[] size in evtchn.c.\n");
+ }
+
+ mtx_unlock(&irq_alloc_lock);
+
+ return -ENOSPC;
+}
+
+static void
+free_xen_irq(int irq)
+{
+
+ mtx_lock(&irq_alloc_lock);
+ irq_evtchn[irq].inuse = 0;
+ mtx_unlock(&irq_alloc_lock);
+}
+
+int
+irq_to_evtchn_port(int irq)
+{
+
+ return irq_evtchn[irq].evtchn;
+}
+
+void
+mask_evtchn(int port)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+
+ synch_set_bit(port, &s->evtchn_mask[0]);
+}
+
+void
+unmask_evtchn(int port)
+{
+ evtchn_unmask_t op = { .port = port };
+
+ HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &op);
+}
+
+int
+bind_listening_port_to_irqhandler(unsigned int remote_domain,
+ const char *devname, driver_intr_t handler, void *arg,
+ unsigned long irqflags, unsigned int *irqp)
+{
+ struct evtchn_alloc_unbound alloc_unbound;
+ unsigned int irq;
+ int error;
+
+ irq = alloc_xen_irq();
+ if (irq < 0)
+ return irq;
+
+ mtx_lock(&irq_evtchn[irq].lock);
+
+ alloc_unbound.dom = DOMID_SELF;
+ alloc_unbound.remote_dom = remote_domain;
+ error = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+ &alloc_unbound);
+ if (error) {
+ mtx_unlock(&irq_evtchn[irq].lock);
+ free_xen_irq(irq);
+ return (-error);
+ }
+
+ irq_evtchn[irq].handler = handler;
+ irq_evtchn[irq].arg = arg;
+ irq_evtchn[irq].evtchn = alloc_unbound.port;
+ irq_evtchn[irq].close = 1;
+ irq_evtchn[irq].mpsafe = (irqflags & INTR_MPSAFE) != 0;
+
+ evtchn_to_irq[alloc_unbound.port] = irq;
+
+ unmask_evtchn(alloc_unbound.port);
+
+ mtx_unlock(&irq_evtchn[irq].lock);
+
+ if (irqp)
+ *irqp = irq;
+ return (0);
+}
+
+int
+bind_caller_port_to_irqhandler(unsigned int caller_port,
+ const char *devname, driver_intr_t handler, void *arg,
+ unsigned long irqflags, unsigned int *irqp)
+{
+ unsigned int irq;
+
+ irq = alloc_xen_irq();
+ if (irq < 0)
+ return irq;
+
+ mtx_lock(&irq_evtchn[irq].lock);
+
+ irq_evtchn[irq].handler = handler;
+ irq_evtchn[irq].arg = arg;
+ irq_evtchn[irq].evtchn = caller_port;
+ irq_evtchn[irq].close = 0;
+ irq_evtchn[irq].mpsafe = (irqflags & INTR_MPSAFE) != 0;
+
+ evtchn_to_irq[caller_port] = irq;
+
+ unmask_evtchn(caller_port);
+
+ mtx_unlock(&irq_evtchn[irq].lock);
+
+ if (irqp)
+ *irqp = irq;
+ return (0);
+}
+
+void
+unbind_from_irqhandler(unsigned int irq)
+{
+ int evtchn;
+
+ mtx_lock(&irq_evtchn[irq].lock);
+
+ evtchn = evtchn_from_irq(irq);
+
+ if (is_valid_evtchn(evtchn)) {
+ evtchn_to_irq[evtchn] = -1;
+ mask_evtchn(evtchn);
+ if (irq_evtchn[irq].close) {
+ struct evtchn_close close = { .port = evtchn };
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
+ panic("EVTCHNOP_close failed");
+ }
+ }
+
+ irq_evtchn[irq].handler = NULL;
+ irq_evtchn[irq].evtchn = 0;
+
+ mtx_unlock(&irq_evtchn[irq].lock);
+
+ while (irq_evtchn[irq].in_handler)
+ cpu_relax();
+
+ free_xen_irq(irq);
+}
+
+void notify_remote_via_irq(int irq)
+{
+ int evtchn;
+
+ evtchn = evtchn_from_irq(irq);
+ if (is_valid_evtchn(evtchn))
+ notify_remote_via_evtchn(evtchn);
+}
+
+static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
+ unsigned int idx)
+{
+ return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]);
+}
+
+static void
+evtchn_interrupt(void *arg)
+{
+ unsigned int l1i, l2i, port;
+ unsigned long masked_l1, masked_l2;
+ /* XXX: All events are bound to vcpu0 but irq may be redirected. */
+ int cpu = 0; /*smp_processor_id();*/
+ driver_intr_t *handler;
+ void *handler_arg;
+ int irq, handler_mpsafe;
+ shared_info_t *s = HYPERVISOR_shared_info;
+ vcpu_info_t *v = &s->vcpu_info[cpu];
+ struct pcpu *pc = pcpu_find(cpu);
+ unsigned long l1, l2;
+
+ v->evtchn_upcall_pending = 0;
+
+#if 0
+#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
+ /* Clear master flag /before/ clearing selector flag. */
+ wmb();
+#endif
+#endif
+
+ l1 = atomic_readandclear_long(&v->evtchn_pending_sel);
+
+ l1i = pc->pc_last_processed_l1i;
+ l2i = pc->pc_last_processed_l2i;
+
+ while (l1 != 0) {
+
+ l1i = (l1i + 1) % LONG_BIT;
+ masked_l1 = l1 & ((~0UL) << l1i);
+
+ if (masked_l1 == 0) { /* if we masked out all events, wrap around to the beginning */
+ l1i = LONG_BIT - 1;
+ l2i = LONG_BIT - 1;
+ continue;
+ }
+ l1i = __ffs(masked_l1);
+
+ do {
+ l2 = active_evtchns(cpu, s, l1i);
+
+ l2i = (l2i + 1) % LONG_BIT;
+ masked_l2 = l2 & ((~0UL) << l2i);
+
+ if (masked_l2 == 0) { /* if we masked out all events, move on */
+ l2i = LONG_BIT - 1;
+ break;
+ }
+ l2i = __ffs(masked_l2);
+
+ /* process port */
+ port = (l1i * LONG_BIT) + l2i;
+ synch_clear_bit(port, &s->evtchn_pending[0]);
+
+ irq = evtchn_to_irq[port];
+ if (irq < 0)
+ continue;
+
+ mtx_lock(&irq_evtchn[irq].lock);
+ handler = irq_evtchn[irq].handler;
+ handler_arg = irq_evtchn[irq].arg;
+ handler_mpsafe = irq_evtchn[irq].mpsafe;
+ if (unlikely(handler == NULL)) {
+ printf("Xen IRQ%d (port %d) has no handler!\n",
+ irq, port);
+ mtx_unlock(&irq_evtchn[irq].lock);
+ continue;
+ }
+ irq_evtchn[irq].in_handler = 1;
+ mtx_unlock(&irq_evtchn[irq].lock);
+
+ //local_irq_enable();
+ if (!handler_mpsafe)
+ mtx_lock(&Giant);
+ handler(handler_arg);
+ if (!handler_mpsafe)
+ mtx_unlock(&Giant);
+ //local_irq_disable();
+
+ mtx_lock(&irq_evtchn[irq].lock);
+ irq_evtchn[irq].in_handler = 0;
+ mtx_unlock(&irq_evtchn[irq].lock);
+
+ /* if this is the final port processed, we'll pick up here+1 next time */
+ pc->pc_last_processed_l1i = l1i;
+ pc->pc_last_processed_l2i = l2i;
+
+ } while (l2i != LONG_BIT - 1);
+
+ l2 = active_evtchns(cpu, s, l1i);
+ if (l2 == 0) /* we handled all ports, so we can clear the selector bit */
+ l1 &= ~(1UL << l1i);
+ }
+}
+
+void
+irq_suspend(void)
+{
+ struct xenpci_softc *scp = device_get_softc(xenpci_device);
+
+ /*
+ * Take our interrupt handler out of the list of handlers
+ * that can handle this irq.
+ */
+ if (scp->intr_cookie != NULL) {
+ if (BUS_TEARDOWN_INTR(device_get_parent(xenpci_device),
+ xenpci_device, scp->res_irq, scp->intr_cookie) != 0)
+ printf("intr teardown failed.. continuing\n");
+ scp->intr_cookie = NULL;
+ }
+}
+
+void
+irq_resume(void)
+{
+ struct xenpci_softc *scp = device_get_softc(xenpci_device);
+ int evtchn, irq;
+
+ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) {
+ mask_evtchn(evtchn);
+ evtchn_to_irq[evtchn] = -1;
+ }
+
+ for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++)
+ irq_evtchn[irq].evtchn = 0;
+
+ BUS_SETUP_INTR(device_get_parent(xenpci_device),
+ xenpci_device, scp->res_irq, INTR_TYPE_MISC,
+ NULL, evtchn_interrupt, NULL, &scp->intr_cookie);
+}
+
+int
+xenpci_irq_init(device_t device, struct xenpci_softc *scp)
+{
+ int irq, cpu;
+ int error;
+
+ mtx_init(&irq_alloc_lock, "xen-irq-lock", NULL, MTX_DEF);
+
+ for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++)
+ mtx_init(&irq_evtchn[irq].lock, "irq-evtchn", NULL, MTX_DEF);
+
+ for (cpu = 0; cpu < mp_ncpus; cpu++) {
+ pcpu_find(cpu)->pc_last_processed_l1i = LONG_BIT - 1;
+ pcpu_find(cpu)->pc_last_processed_l2i = LONG_BIT - 1;
+ }
+
+ error = BUS_SETUP_INTR(device_get_parent(device), device,
+ scp->res_irq, INTR_MPSAFE|INTR_TYPE_MISC, NULL, evtchn_interrupt,
+ NULL, &scp->intr_cookie);
+ if (error)
+ return (error);
+
+ xenpci_device = device;
+
+ return (0);
+}
diff --git a/sys/dev/xen/xenpci/machine_reboot.c b/sys/dev/xen/xenpci/machine_reboot.c
new file mode 100644
index 000000000000..40365545b23e
--- /dev/null
+++ b/sys/dev/xen/xenpci/machine_reboot.c
@@ -0,0 +1,80 @@
+/*-
+ * Copyright (c) 2008 Citrix Systems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/interrupt.h>
+
+#include <machine/atomic.h>
+#include <machine/xen/xen-os.h>
+#include <xen/hypervisor.h>
+#include <xen/xen_intr.h>
+
+#include <dev/xen/xenpci/xenpcivar.h>
+
+void
+xen_suspend()
+{
+ int suspend_cancelled;
+
+ if (DEVICE_SUSPEND(root_bus)) {
+ printf("xen_suspend: device_suspend failed\n");
+ return;
+ }
+
+ /*
+ * Make sure we don't change cpus or switch to some other
+ * thread for the duration.
+ */
+ critical_enter();
+
+ /*
+ * Prevent any races with evtchn_interrupt() handler.
+ */
+ irq_suspend();
+ disable_intr();
+
+ suspend_cancelled = HYPERVISOR_suspend(0);
+ if (!suspend_cancelled)
+ xenpci_resume();
+
+ /*
+ * Re-enable interrupts and put the scheduler back to normal.
+ */
+ enable_intr();
+ critical_exit();
+
+ /*
+ * FreeBSD really needs to add DEVICE_SUSPEND_CANCEL or
+ * similar.
+ */
+ if (!suspend_cancelled)
+ DEVICE_RESUME(root_bus);
+}
diff --git a/sys/dev/xen/xenpci/xenpci.c b/sys/dev/xen/xenpci/xenpci.c
new file mode 100644
index 000000000000..2f2a79fff21d
--- /dev/null
+++ b/sys/dev/xen/xenpci/xenpci.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2008 Citrix Systems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <sys/rman.h>
+
+#include <machine/stdarg.h>
+#include <machine/xen/xen-os.h>
+#include <xen/features.h>
+#include <xen/hypervisor.h>
+#include <xen/gnttab.h>
+#include <xen/xen_intr.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/hvm/params.h>
+
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/pmap.h>
+
+#include <dev/xen/xenpci/xenpcivar.h>
+
+/*
+ * These variables are used by the rest of the kernel to access the
+ * hypervisor.
+ */
+char *hypercall_stubs;
+shared_info_t *HYPERVISOR_shared_info;
+static vm_paddr_t shared_info_pa;
+
+/*
+ * This is used to find our platform device instance.
+ */
+static devclass_t xenpci_devclass;
+
+/*
+ * Return the CPUID base address for Xen functions.
+ */
+static uint32_t
+xenpci_cpuid_base(void)
+{
+ uint32_t base, regs[4];
+
+ for (base = 0x40000000; base < 0x40001000; base += 0x100) {
+ do_cpuid(base, regs);
+ if (!memcmp("XenVMMXenVMM", &regs[1], 12)
+ && (regs[0] - base) >= 2)
+ return (base);
+ }
+ return (0);
+}
+
+/*
+ * Allocate and fill in the hypercall page.
+ */
+static int
+xenpci_init_hypercall_stubs(device_t dev, struct xenpci_softc * scp)
+{
+ uint32_t base, regs[4];
+ int i;
+
+ base = xenpci_cpuid_base();
+ if (!base) {
+ device_printf(dev, "Xen platform device but not Xen VMM\n");
+ return (EINVAL);
+ }
+
+ if (bootverbose) {
+ do_cpuid(base + 1, regs);
+ device_printf(dev, "Xen version %d.%d.\n",
+ regs[0] >> 16, regs[0] & 0xffff);
+ }
+
+ /*
+ * Find the hypercall pages.
+ */
+ do_cpuid(base + 2, regs);
+
+ hypercall_stubs = malloc(regs[0] * PAGE_SIZE, M_TEMP, M_WAITOK);
+
+ for (i = 0; i < regs[0]; i++) {
+ wrmsr(regs[1], vtophys(hypercall_stubs + i * PAGE_SIZE) + i);
+ }
+
+ return (0);
+}
+
+/*
+ * After a resume, re-initialise the hypercall page.
+ */
+static void
+xenpci_resume_hypercall_stubs(device_t dev, struct xenpci_softc * scp)
+{
+ uint32_t base, regs[4];
+ int i;
+
+ base = xenpci_cpuid_base();
+
+ do_cpuid(base + 2, regs);
+ for (i = 0; i < regs[0]; i++) {
+ wrmsr(regs[1], vtophys(hypercall_stubs + i * PAGE_SIZE) + i);
+ }
+}
+
+/*
+ * Tell the hypervisor how to contact us for event channel callbacks.
+ */
+static void
+xenpci_set_callback(device_t dev)
+{
+ int irq;
+ uint64_t callback;
+ struct xen_hvm_param xhp;
+
+ irq = pci_get_irq(dev);
+ if (irq < 16) {
+ callback = irq;
+ } else {
+ callback = (pci_get_intpin(dev) - 1) & 3;
+ callback |= pci_get_slot(dev) << 11;
+ callback |= 1ull << 56;
+ }
+
+ xhp.domid = DOMID_SELF;
+ xhp.index = HVM_PARAM_CALLBACK_IRQ;
+ xhp.value = callback;
+ if (HYPERVISOR_hvm_op(HVMOP_set_param, &xhp))
+ panic("Can't set evtchn callback");
+}
+
+
+/*
+ * Deallocate anything allocated by xenpci_allocate_resources.
+ */
+static int
+xenpci_deallocate_resources(device_t dev)
+{
+ struct xenpci_softc *scp = device_get_softc(dev);
+
+ if (scp->res_irq != 0) {
+ bus_deactivate_resource(dev, SYS_RES_IRQ,
+ scp->rid_irq, scp->res_irq);
+ bus_release_resource(dev, SYS_RES_IRQ,
+ scp->rid_irq, scp->res_irq);
+ scp->res_irq = 0;
+ }
+ if (scp->res_memory != 0) {
+ bus_deactivate_resource(dev, SYS_RES_MEMORY,
+ scp->rid_memory, scp->res_memory);
+ bus_release_resource(dev, SYS_RES_MEMORY,
+ scp->rid_memory, scp->res_memory);
+ scp->res_memory = 0;
+ }
+
+ return (0);
+}
+
+/*
+ * Allocate irq and memory resources.
+ */
+static int
+xenpci_allocate_resources(device_t dev)
+{
+ struct xenpci_softc *scp = device_get_softc(dev);
+
+ scp->res_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ,
+ &scp->rid_irq, RF_SHAREABLE|RF_ACTIVE);
+ if (scp->res_irq == NULL)
+ goto errexit;
+
+ scp->rid_memory = PCIR_BAR(1);
+ scp->res_memory = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
+ &scp->rid_memory, RF_ACTIVE);
+ if (scp->res_memory == NULL)
+ goto errexit;
+ return (0);
+
+errexit:
+ /* Cleanup anything we may have assigned. */
+ xenpci_deallocate_resources(dev);
+ return (ENXIO); /* For want of a better idea. */
+}
+
+/*
+ * Allocate a physical address range from our mmio region.
+ */
+static int
+xenpci_alloc_space_int(struct xenpci_softc *scp, size_t sz,
+ vm_paddr_t *pa)
+{
+
+ if (scp->phys_next + sz > rman_get_end(scp->res_memory)) {
+ return (ENOMEM);
+ }
+
+ *pa = scp->phys_next;
+ scp->phys_next += sz;
+
+ return (0);
+}
+
+/*
+ * Allocate a physical address range from our mmio region.
+ */
+int
+xenpci_alloc_space(size_t sz, vm_paddr_t *pa)
+{
+ device_t dev = devclass_get_device(xenpci_devclass, 0);
+
+ if (dev) {
+ return (xenpci_alloc_space_int(device_get_softc(dev),
+ sz, pa));
+ } else {
+ return (ENOMEM);
+ }
+}
+
+/*
+ * Called very early in the resume sequence - reinitialise the various
+ * bits of Xen machinery including the hypercall page and the shared
+ * info page.
+ */
+void
+xenpci_resume()
+{
+ device_t dev = devclass_get_device(xenpci_devclass, 0);
+ struct xenpci_softc *scp = device_get_softc(dev);
+ struct xen_add_to_physmap xatp;
+
+ xenpci_resume_hypercall_stubs(dev, scp);
+
+ xatp.domid = DOMID_SELF;
+ xatp.idx = 0;
+ xatp.space = XENMAPSPACE_shared_info;
+ xatp.gpfn = shared_info_pa >> PAGE_SHIFT;
+ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+ panic("HYPERVISOR_memory_op failed");
+
+ pmap_kenter((vm_offset_t) HYPERVISOR_shared_info, shared_info_pa);
+
+ xenpci_set_callback(dev);
+
+ gnttab_resume();
+ irq_resume();
+}
+
+/*
+ * Probe - just check device ID.
+ */
+static int
+xenpci_probe(device_t dev)
+{
+
+ if (pci_get_devid(dev) != 0x00015853)
+ return (ENXIO);
+
+ device_set_desc(dev, "Xen Platform Device");
+ return (bus_generic_probe(dev));
+}
+
+/*
+ * Attach - find resources and talk to Xen.
+ */
+static int
+xenpci_attach(device_t dev)
+{
+ int error;
+ struct xenpci_softc *scp = device_get_softc(dev);
+ struct xen_add_to_physmap xatp;
+ vm_offset_t shared_va;
+
+ error = xenpci_allocate_resources(dev);
+ if (error)
+ goto errexit;
+
+ scp->phys_next = rman_get_start(scp->res_memory);
+
+ error = xenpci_init_hypercall_stubs(dev, scp);
+ if (error)
+ goto errexit;
+
+ setup_xen_features();
+
+ xenpci_alloc_space_int(scp, PAGE_SIZE, &shared_info_pa);
+
+ xatp.domid = DOMID_SELF;
+ xatp.idx = 0;
+ xatp.space = XENMAPSPACE_shared_info;
+ xatp.gpfn = shared_info_pa >> PAGE_SHIFT;
+ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+ panic("HYPERVISOR_memory_op failed");
+
+ shared_va = kmem_alloc_nofault(kernel_map, PAGE_SIZE);
+ pmap_kenter(shared_va, shared_info_pa);
+ HYPERVISOR_shared_info = (void *) shared_va;
+
+ /*
+ * Hook the irq up to evtchn
+ */
+ xenpci_irq_init(dev, scp);
+ xenpci_set_callback(dev);
+
+ return (bus_generic_attach(dev));
+
+errexit:
+ /*
+ * Undo anything we may have done.
+ */
+ xenpci_deallocate_resources(dev);
+ return (error);
+}
+
+/*
+ * Detach - reverse anything done by attach.
+ */
+static int
+xenpci_detach(device_t dev)
+{
+ struct xenpci_softc *scp = device_get_softc(dev);
+ device_t parent = device_get_parent(dev);
+
+ /*
+ * Take our interrupt handler out of the list of handlers
+ * that can handle this irq.
+ */
+ if (scp->intr_cookie != NULL) {
+ if (BUS_TEARDOWN_INTR(parent, dev,
+ scp->res_irq, scp->intr_cookie) != 0)
+ printf("intr teardown failed.. continuing\n");
+ scp->intr_cookie = NULL;
+ }
+
+ /*
+ * Deallocate any system resources we may have
+ * allocated on behalf of this driver.
+ */
+ return (xenpci_deallocate_resources(dev));
+}
+
+static device_method_t xenpci_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, xenpci_probe),
+ DEVMETHOD(device_attach, xenpci_attach),
+ DEVMETHOD(device_detach, xenpci_detach),
+ DEVMETHOD(device_suspend, bus_generic_suspend),
+ DEVMETHOD(device_resume, bus_generic_resume),
+
+ /* Bus interface */
+ DEVMETHOD(bus_add_child, bus_generic_add_child),
+
+ { 0, 0 }
+};
+
+static driver_t xenpci_driver = {
+ "xenpci",
+ xenpci_methods,
+ sizeof(struct xenpci_softc),
+};
+
+DRIVER_MODULE(xenpci, pci, xenpci_driver, xenpci_devclass, 0, 0);
diff --git a/sys/dev/xen/xenpci/xenpcivar.h b/sys/dev/xen/xenpci/xenpcivar.h
new file mode 100644
index 000000000000..a57c080b31d3
--- /dev/null
+++ b/sys/dev/xen/xenpci/xenpcivar.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2008 Citrix Systems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * One of these per allocated device.
+ */
+struct xenpci_softc {
+ int rid_ioport;
+ int rid_memory;
+ int rid_irq;
+ struct resource* res_memory; /* Resource for mem range. */
+ struct resource* res_irq; /* Resource for irq range. */
+ void *intr_cookie;
+
+ vm_paddr_t phys_next; /* next page from mem range */
+};
+
+extern int xenpci_irq_init(device_t device, struct xenpci_softc *scp);
+extern int xenpci_alloc_space(size_t sz, vm_paddr_t *pa);
+extern void xenpci_resume(void);
+extern void xen_suspend(void);
diff --git a/sys/i386/include/xen/xenpmap.h b/sys/i386/include/xen/xenpmap.h
index 17d1f9254e42..4bfd99e65e55 100644
--- a/sys/i386/include/xen/xenpmap.h
+++ b/sys/i386/include/xen/xenpmap.h
@@ -222,7 +222,11 @@ set_phys_to_machine(unsigned long pfn, unsigned long mfn)
xen_phys_machine[pfn] = mfn;
}
-
+static __inline int
+phys_to_machine_mapping_valid(unsigned long pfn)
+{
+ return xen_phys_machine[pfn] != INVALID_P2M_ENTRY;
+}
#endif /* _XEN_XENPMAP_H_ */
diff --git a/sys/xen/evtchn/evtchn.c b/sys/xen/evtchn/evtchn.c
index 884270c666a6..61b738b0d981 100644
--- a/sys/xen/evtchn/evtchn.c
+++ b/sys/xen/evtchn/evtchn.c
@@ -13,56 +13,28 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
+#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/interrupt.h>
#include <sys/pcpu.h>
+#include <sys/smp.h>
#include <machine/cpufunc.h>
#include <machine/intr_machdep.h>
+
#include <machine/xen/xen-os.h>
+#include <machine/xen/xenvar.h>
#include <xen/xen_intr.h>
#include <machine/xen/synch_bitops.h>
#include <xen/evtchn.h>
#include <xen/hypervisor.h>
#include <sys/smp.h>
-
-
-/* linux helper functions that got sucked in
- * rename and move XXX
- */
-
-
-static inline int find_first_bit(const unsigned long *addr, unsigned size)
-{
- int d0, d1;
- int res;
-
- /* This looks at memory. Mark it volatile to tell gcc not to move it around */
- __asm__ __volatile__(
- "xorl %%eax,%%eax\n\t"
- "repe; scasl\n\t"
- "jz 1f\n\t"
- "leal -4(%%edi),%%edi\n\t"
- "bsfl (%%edi),%%eax\n"
- "1:\tsubl %%ebx,%%edi\n\t"
- "shll $3,%%edi\n\t"
- "addl %%edi,%%eax"
- :"=a" (res), "=&c" (d0), "=&D" (d1)
- :"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory");
- return res;
-}
-
-#define min_t(type,x,y) \
- ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
-#define first_cpu(src) __first_cpu(&(src), NR_CPUS)
-static inline int __first_cpu(const xen_cpumask_t *srcp, int nbits)
-{
- return min_t(int, nbits, find_first_bit(srcp->bits, nbits));
-}
+#include <xen/xen_intr.h>
+#include <xen/evtchn.h>
static inline unsigned long __ffs(unsigned long word)
{
@@ -166,7 +138,7 @@ static int irq_bindcount[NR_IRQS];
#ifdef SMP
static uint8_t cpu_evtchn[NR_EVENT_CHANNELS];
-static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+static unsigned long cpu_evtchn_mask[MAX_VIRT_CPUS][NR_EVENT_CHANNELS/LONG_BIT];
#define active_evtchns(cpu,sh,idx) \
((sh)->evtchn_pending[idx] & \
@@ -220,7 +192,7 @@ evtchn_do_upcall(struct trapframe *frame)
shared_info_t *s;
vcpu_info_t *vcpu_info;
- cpu = smp_processor_id();
+ cpu = PCPU_GET(cpuid);
s = HYPERVISOR_shared_info;
vcpu_info = &s->vcpu_info[cpu];
@@ -236,7 +208,7 @@ evtchn_do_upcall(struct trapframe *frame)
while ((l2 = active_evtchns(cpu, s, l1i)) != 0) {
l2i = __ffs(l2);
- port = (l1i * BITS_PER_LONG) + l2i;
+ port = (l1i * LONG_BIT) + l2i;
if ((irq = evtchn_to_irq[port]) != -1) {
struct intsrc *isrc = intr_lookup_source(irq);
/*
@@ -258,7 +230,7 @@ ipi_pcpu(unsigned int cpu, int vector)
{
int irq;
- irq = per_cpu(ipi_to_irq, cpu)[vector];
+ irq = PCPU_GET(ipi_to_irq[vector]);
notify_remote_via_irq(irq);
}
@@ -310,11 +282,12 @@ bind_local_port_to_irq(unsigned int local_port)
mtx_lock_spin(&irq_mapping_update_lock);
- PANIC_IF(evtchn_to_irq[local_port] != -1);
-
+ KASSERT(evtchn_to_irq[local_port] == -1,
+ ("evtchn_to_irq inconsistent"));
+
if ((irq = find_unbound_irq()) < 0) {
struct evtchn_close close = { .port = local_port };
- PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close));
+ HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
goto out;
}
@@ -368,21 +341,20 @@ bind_virq_to_irq(unsigned int virq, unsigned int cpu)
mtx_lock_spin(&irq_mapping_update_lock);
- if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
+ if ((irq = pcpu_find(cpu)->pc_virq_to_irq[virq]) == -1) {
if ((irq = find_unbound_irq()) < 0)
goto out;
bind_virq.virq = virq;
bind_virq.vcpu = cpu;
- PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
- &bind_virq) != 0);
+ HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq);
evtchn = bind_virq.port;
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
- per_cpu(virq_to_irq, cpu)[virq] = irq;
+ pcpu_find(cpu)->pc_virq_to_irq[virq] = irq;
bind_evtchn_to_cpu(evtchn, cpu);
}
@@ -407,18 +379,18 @@ bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
mtx_lock_spin(&irq_mapping_update_lock);
- if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) {
+ if ((irq = pcpu_find(cpu)->pc_ipi_to_irq[ipi]) == -1) {
if ((irq = find_unbound_irq()) < 0)
goto out;
bind_ipi.vcpu = cpu;
- PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi) != 0);
+ HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi);
evtchn = bind_ipi.port;
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
- per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+ pcpu_find(cpu)->pc_ipi_to_irq[ipi] = irq;
bind_evtchn_to_cpu(evtchn, cpu);
}
@@ -432,24 +404,27 @@ out:
}
-void
+static void
unbind_from_irq(int irq)
{
struct evtchn_close close;
int evtchn = evtchn_from_irq(irq);
+ int cpu;
mtx_lock_spin(&irq_mapping_update_lock);
if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) {
close.port = evtchn;
- PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0);
+ HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
switch (type_from_irq(irq)) {
case IRQT_VIRQ:
- per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))[index_from_irq(irq)] = -1;
+ cpu = cpu_from_evtchn(evtchn);
+ pcpu_find(cpu)->pc_virq_to_irq[index_from_irq(irq)] = -1;
break;
case IRQT_IPI:
- per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))[index_from_irq(irq)] = -1;
+ cpu = cpu_from_evtchn(evtchn);
+ pcpu_find(cpu)->pc_ipi_to_irq[index_from_irq(irq)] = -1;
break;
default:
break;
@@ -467,11 +442,8 @@ unbind_from_irq(int irq)
int
bind_caller_port_to_irqhandler(unsigned int caller_port,
- const char *devname,
- driver_intr_t handler,
- void *arg,
- unsigned long irqflags,
- unsigned int *irqp)
+ const char *devname, driver_intr_t handler, void *arg,
+ unsigned long irqflags, unsigned int *irqp)
{
unsigned int irq;
int error;
@@ -493,13 +465,9 @@ bind_caller_port_to_irqhandler(unsigned int caller_port,
}
int
-bind_listening_port_to_irqhandler(
- unsigned int remote_domain,
- const char *devname,
- driver_intr_t handler,
- void *arg,
- unsigned long irqflags,
- unsigned int *irqp)
+bind_listening_port_to_irqhandler(unsigned int remote_domain,
+ const char *devname, driver_intr_t handler, void *arg,
+ unsigned long irqflags, unsigned int *irqp)
{
unsigned int irq;
int error;
@@ -519,14 +487,10 @@ bind_listening_port_to_irqhandler(
}
int
-bind_interdomain_evtchn_to_irqhandler(
- unsigned int remote_domain,
- unsigned int remote_port,
- const char *devname,
- driver_filter_t filter,
- driver_intr_t handler,
- unsigned long irqflags,
- unsigned int *irqp)
+bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+ unsigned int remote_port, const char *devname,
+ driver_filter_t filter, driver_intr_t handler,
+ unsigned long irqflags, unsigned int *irqp)
{
unsigned int irq;
int error;
@@ -546,14 +510,9 @@ bind_interdomain_evtchn_to_irqhandler(
}
int
-bind_virq_to_irqhandler(unsigned int virq,
- unsigned int cpu,
- const char *devname,
- driver_filter_t filter,
- driver_intr_t handler,
- void *arg,
- unsigned long irqflags,
- unsigned int *irqp)
+bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ const char *devname, driver_filter_t filter, driver_intr_t handler,
+ unsigned long irqflags, unsigned int *irqp)
{
unsigned int irq;
int error;
@@ -573,12 +532,9 @@ bind_virq_to_irqhandler(unsigned int virq,
}
int
-bind_ipi_to_irqhandler(unsigned int ipi,
- unsigned int cpu,
- const char *devname,
- driver_filter_t filter,
- unsigned long irqflags,
- unsigned int *irqp)
+bind_ipi_to_irqhandler(unsigned int ipi, unsigned int cpu,
+ const char *devname, driver_filter_t filter,
+ unsigned long irqflags, unsigned int *irqp)
{
unsigned int irq;
int error;
@@ -636,9 +592,9 @@ rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
}
-static void set_affinity_irq(unsigned irq, xen_cpumask_t dest)
+static void set_affinity_irq(unsigned irq, cpumask_t dest)
{
- unsigned tcpu = first_cpu(dest);
+ unsigned tcpu = ffs(dest) - 1;
rebind_irq_to_cpu(irq, tcpu);
}
#endif
@@ -656,13 +612,11 @@ static void xenpic_dynirq_enable_source(struct intsrc *isrc);
static void xenpic_dynirq_disable_source(struct intsrc *isrc, int);
static void xenpic_dynirq_eoi_source(struct intsrc *isrc);
static void xenpic_dynirq_enable_intr(struct intsrc *isrc);
-static void xenpic_dynirq_disable_intr(struct intsrc *isrc);
static void xenpic_pirq_enable_source(struct intsrc *isrc);
static void xenpic_pirq_disable_source(struct intsrc *isrc, int);
static void xenpic_pirq_eoi_source(struct intsrc *isrc);
static void xenpic_pirq_enable_intr(struct intsrc *isrc);
-static void xenpic_pirq_disable_intr(struct intsrc *isrc);
static int xenpic_vector(struct intsrc *isrc);
@@ -677,7 +631,6 @@ struct pic xenpic_dynirq_template = {
.pic_disable_source = xenpic_dynirq_disable_source,
.pic_eoi_source = xenpic_dynirq_eoi_source,
.pic_enable_intr = xenpic_dynirq_enable_intr,
- .pic_disable_intr = xenpic_dynirq_disable_intr,
.pic_vector = xenpic_vector,
.pic_source_pending = xenpic_source_pending,
.pic_suspend = xenpic_suspend,
@@ -689,7 +642,6 @@ struct pic xenpic_pirq_template = {
.pic_disable_source = xenpic_pirq_disable_source,
.pic_eoi_source = xenpic_pirq_eoi_source,
.pic_enable_intr = xenpic_pirq_enable_intr,
- .pic_disable_intr = xenpic_pirq_disable_intr,
.pic_vector = xenpic_vector,
.pic_source_pending = xenpic_source_pending,
.pic_suspend = xenpic_suspend,
@@ -748,20 +700,6 @@ xenpic_dynirq_enable_intr(struct intsrc *isrc)
}
static void
-xenpic_dynirq_disable_intr(struct intsrc *isrc)
-{
- unsigned int irq;
- struct xenpic_intsrc *xp;
-
- xp = (struct xenpic_intsrc *)isrc;
- mtx_lock_spin(&irq_mapping_update_lock);
- xp->xp_masked = 1;
- irq = xenpic_vector(isrc);
- mask_evtchn(evtchn_from_irq(irq));
- mtx_unlock_spin(&irq_mapping_update_lock);
-}
-
-static void
xenpic_dynirq_eoi_source(struct intsrc *isrc)
{
unsigned int irq;
@@ -825,7 +763,7 @@ notify_remote_via_irq(int irq)
if (VALID_EVTCHN(evtchn))
notify_remote_via_evtchn(evtchn);
else
- panic("invalid evtchn");
+ panic("invalid evtchn %d", irq);
}
/* required for support of physical devices */
@@ -899,32 +837,6 @@ xenpic_pirq_enable_intr(struct intsrc *isrc)
}
static void
-xenpic_pirq_disable_intr(struct intsrc *isrc)
-{
- unsigned int irq;
- int evtchn;
- struct evtchn_close close;
-
- mtx_lock_spin(&irq_mapping_update_lock);
- irq = xenpic_vector(isrc);
- evtchn = evtchn_from_irq(irq);
-
- if (!VALID_EVTCHN(evtchn))
- goto done;
-
- mask_evtchn(evtchn);
-
- close.port = evtchn;
- PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0);
-
- bind_evtchn_to_cpu(evtchn, 0);
- evtchn_to_irq[evtchn] = -1;
- irq_info[irq] = IRQ_UNBOUND;
- done:
- mtx_unlock_spin(&irq_mapping_update_lock);
-}
-
-static void
xenpic_pirq_enable_source(struct intsrc *isrc)
{
int evtchn;
@@ -998,7 +910,7 @@ void
unmask_evtchn(int port)
{
shared_info_t *s = HYPERVISOR_shared_info;
- unsigned int cpu = smp_processor_id();
+ unsigned int cpu = PCPU_GET(cpuid);
vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
/* Slow path (hypercall) if this is a non-local port. */
@@ -1016,7 +928,7 @@ unmask_evtchn(int port)
* masked.
*/
if (synch_test_bit(port, &s->evtchn_pending) &&
- !synch_test_and_set_bit(port / BITS_PER_LONG,
+ !synch_test_and_set_bit(port / LONG_BIT,
&vcpu_info->evtchn_pending_sel)) {
vcpu_info->evtchn_upcall_pending = 1;
if (!vcpu_info->evtchn_upcall_mask)
@@ -1039,15 +951,21 @@ void irq_resume(void)
mask_evtchn(evtchn);
/* Check that no PIRQs are still bound. */
- for (pirq = 0; pirq < NR_PIRQS; pirq++)
- PANIC_IF(irq_info[pirq_to_irq(pirq)] != IRQ_UNBOUND);
+ for (pirq = 0; pirq < NR_PIRQS; pirq++) {
+ KASSERT(irq_info[pirq_to_irq(pirq)] == IRQ_UNBOUND,
+ ("pirq_to_irq inconsistent"));
+ }
/* Secondary CPUs must have no VIRQ or IPI bindings. */
- for (cpu = 1; cpu < NR_CPUS; cpu++) {
- for (virq = 0; virq < NR_VIRQS; virq++)
- PANIC_IF(per_cpu(virq_to_irq, cpu)[virq] != -1);
- for (ipi = 0; ipi < NR_IPIS; ipi++)
- PANIC_IF(per_cpu(ipi_to_irq, cpu)[ipi] != -1);
+ for (cpu = 1; cpu < MAX_VIRT_CPUS; cpu++) {
+ for (virq = 0; virq < NR_VIRQS; virq++) {
+ KASSERT(pcpu_find(cpu)->pc_virq_to_irq[virq] == -1,
+ ("virq_to_irq inconsistent"));
+ }
+ for (ipi = 0; ipi < NR_IPIS; ipi++) {
+ KASSERT(pcpu_find(cpu)->pc_ipi_to_irq[ipi] == -1,
+ ("ipi_to_irq inconsistent"));
+ }
}
/* No IRQ <-> event-channel mappings. */
@@ -1058,15 +976,16 @@ void irq_resume(void)
/* Primary CPU: rebind VIRQs automatically. */
for (virq = 0; virq < NR_VIRQS; virq++) {
- if ((irq = per_cpu(virq_to_irq, 0)[virq]) == -1)
+ if ((irq = pcpu_find(0)->pc_virq_to_irq[virq]) == -1)
continue;
- PANIC_IF(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0));
+ KASSERT(irq_info[irq] == mk_irq_info(IRQT_VIRQ, virq, 0),
+ ("irq_info inconsistent"));
/* Get a new binding from Xen. */
bind_virq.virq = virq;
bind_virq.vcpu = 0;
- PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq) != 0);
+ HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq);
evtchn = bind_virq.port;
/* Record the new mapping. */
@@ -1079,15 +998,16 @@ void irq_resume(void)
/* Primary CPU: rebind IPIs automatically. */
for (ipi = 0; ipi < NR_IPIS; ipi++) {
- if ((irq = per_cpu(ipi_to_irq, 0)[ipi]) == -1)
+ if ((irq = pcpu_find(0)->pc_ipi_to_irq[ipi]) == -1)
continue;
- PANIC_IF(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0));
+ KASSERT(irq_info[irq] == mk_irq_info(IRQT_IPI, ipi, 0),
+ ("irq_info inconsistent"));
/* Get a new binding from Xen. */
memset(&op, 0, sizeof(op));
bind_ipi.vcpu = 0;
- PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi) != 0);
+ HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi);
evtchn = bind_ipi.port;
/* Record the new mapping. */
@@ -1111,9 +1031,9 @@ evtchn_init(void *dummy __unused)
/* No VIRQ or IPI bindings. */
for (cpu = 0; cpu < mp_ncpus; cpu++) {
for (i = 0; i < NR_VIRQS; i++)
- per_cpu(virq_to_irq, cpu)[i] = -1;
+ pcpu_find(cpu)->pc_virq_to_irq[i] = -1;
for (i = 0; i < NR_IPIS; i++)
- per_cpu(ipi_to_irq, cpu)[i] = -1;
+ pcpu_find(cpu)->pc_ipi_to_irq[i] = -1;
}
/* No event-channel -> IRQ mappings. */
diff --git a/sys/xen/evtchn/evtchn_dev.c b/sys/xen/evtchn/evtchn_dev.c
index ea12860dbefb..4253d8a1700d 100644
--- a/sys/xen/evtchn/evtchn_dev.c
+++ b/sys/xen/evtchn/evtchn_dev.c
@@ -23,8 +23,6 @@ __FBSDID("$FreeBSD$");
#include <sys/fcntl.h>
#include <sys/ioccom.h>
-#include <machine/cpufunc.h>
-#include <machine/intr_machdep.h>
#include <machine/xen/xen-os.h>
#include <xen/xen_intr.h>
#include <machine/bus.h>
@@ -234,14 +232,14 @@ evtchn_ioctl(struct cdev *dev, unsigned long cmd, caddr_t arg,
__evtchn_reset_buffer_ring();
break;
case EVTCHN_BIND:
- if ( !synch_test_and_set_bit((int)arg, &bound_ports[0]) )
- unmask_evtchn((int)arg);
+ if ( !synch_test_and_set_bit((uintptr_t)arg, &bound_ports[0]) )
+ unmask_evtchn((uintptr_t)arg);
else
rc = EINVAL;
break;
case EVTCHN_UNBIND:
- if ( synch_test_and_clear_bit((int)arg, &bound_ports[0]) )
- mask_evtchn((int)arg);
+ if ( synch_test_and_clear_bit((uintptr_t)arg, &bound_ports[0]) )
+ mask_evtchn((uintptr_t)arg);
else
rc = EINVAL;
break;
@@ -383,12 +381,12 @@ evtchn_dev_init(void *dummy __unused)
/* (DEVFS) automatically destroy the symlink with its destination. */
devfs_auto_unregister(evtchn_miscdev.devfs_handle, symlink_handle);
#endif
- printk("Event-channel device installed.\n");
+ if (bootverbose)
+ printf("Event-channel device installed.\n");
return 0;
}
-
SYSINIT(evtchn_dev_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, evtchn_dev_init, NULL);
diff --git a/sys/xen/features.c b/sys/xen/features.c
index 876a7d1e568f..f28fe049177c 100644
--- a/sys/xen/features.c
+++ b/sys/xen/features.c
@@ -1,10 +1,12 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+
#include <machine/xen/xen-os.h>
#include <xen/hypervisor.h>
-#include <machine/xen/features.h>
+#include <xen/features.h>
uint8_t xen_features[XENFEAT_NR_SUBMAPS * 32] /* __read_mostly */;
diff --git a/sys/xen/features.h b/sys/xen/features.h
new file mode 100644
index 000000000000..b4cce2fd4b1b
--- /dev/null
+++ b/sys/xen/features.h
@@ -0,0 +1,20 @@
+/******************************************************************************
+ * features.h
+ *
+ * Query the features reported by Xen.
+ *
+ * Copyright (c) 2006, Ian Campbell
+ */
+
+#ifndef __ASM_XEN_FEATURES_H__
+#define __ASM_XEN_FEATURES_H__
+
+#include <xen/interface/version.h>
+
+extern void setup_xen_features(void);
+
+extern uint8_t xen_features[XENFEAT_NR_SUBMAPS * 32];
+
+#define xen_feature(flag) (xen_features[flag])
+
+#endif /* __ASM_XEN_FEATURES_H__ */
diff --git a/sys/xen/gnttab.c b/sys/xen/gnttab.c
index 967565506dfc..d05790bbf84f 100644
--- a/sys/xen/gnttab.c
+++ b/sys/xen/gnttab.c
@@ -25,28 +25,20 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
-#include <vm/vm.h>
-#include <vm/vm_extern.h>
-
-#include <vm/vm_page.h>
-#include <vm/vm_kern.h>
+#include <machine/xen/xen-os.h>
#include <xen/hypervisor.h>
#include <machine/xen/synch_bitops.h>
-#include <xen/gnttab.h>
-#define cmpxchg(a, b, c) atomic_cmpset_int((volatile u_int *)(a),(b),(c))
+#include <xen/hypervisor.h>
+#include <xen/gnttab.h>
-#if 1
-#define ASSERT(_p) \
- if ( !(_p) ) { printk("Assertion '%s': line %d, file %s\n", \
- #_p , __LINE__, __FILE__); *(int*)0=0; }
-#else
-#define ASSERT(_p) ((void)0)
-#endif
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
-#define WPRINTK(fmt, args...) \
- printk("xen_grant: " fmt, ##args)
+#define cmpxchg(a, b, c) atomic_cmpset_int((volatile u_int *)(a),(b),(c))
/* External tools reserve first few grant table entries. */
#define NR_RESERVED_ENTRIES 8
@@ -72,14 +64,14 @@ static int gnttab_expand(unsigned int req_entries);
static int
get_free_entries(int count, int *entries)
{
- int ref, rc;
+ int ref, error;
grant_ref_t head;
mtx_lock(&gnttab_list_lock);
if ((gnttab_free_count < count) &&
- ((rc = gnttab_expand(count - gnttab_free_count)) != 0)) {
+ ((error = gnttab_expand(count - gnttab_free_count)) != 0)) {
mtx_unlock(&gnttab_list_lock);
- return (rc);
+ return (error);
}
ref = head = gnttab_free_head;
gnttab_free_count -= count;
@@ -163,6 +155,7 @@ void
gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
unsigned long frame, int readonly)
{
+
shared[ref].frame = frame;
shared[ref].domid = domid;
wmb();
@@ -213,7 +206,8 @@ gnttab_end_foreign_access(grant_ref_t ref, void *page)
}
int
-gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
+gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn,
+ grant_ref_t *result)
{
int error, ref;
@@ -223,7 +217,8 @@ gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
- return (ref);
+ *result = ref;
+ return (0);
}
void
@@ -261,7 +256,7 @@ gnttab_end_foreign_transfer_ref(grant_ref_t ref)
/* Read the frame number /after/ reading completion status. */
rmb();
frame = shared[ref].frame;
- PANIC_IF(frame == 0);
+ KASSERT(frame != 0, ("grant table inconsistent"));
return (frame);
}
@@ -320,6 +315,7 @@ gnttab_alloc_grant_references(uint16_t count, grant_ref_t *head)
int
gnttab_empty_grant_references(const grant_ref_t *private_head)
{
+
return (*private_head == GNTTAB_LIST_END);
}
@@ -331,20 +327,20 @@ gnttab_claim_grant_reference(grant_ref_t *private_head)
if (unlikely(g == GNTTAB_LIST_END))
return (ENOSPC);
*private_head = gnttab_entry(g);
-
return (g);
}
void
gnttab_release_grant_reference(grant_ref_t *private_head, grant_ref_t release)
{
+
gnttab_entry(release) = *private_head;
*private_head = release;
}
void
gnttab_request_free_callback(struct gnttab_free_callback *callback,
- void (*fn)(void *), void *arg, uint16_t count)
+ void (*fn)(void *), void *arg, uint16_t count)
{
mtx_lock(&gnttab_list_lock);
@@ -387,7 +383,8 @@ grow_gnttab_list(unsigned int more_frames)
for (i = nr_grant_frames; i < new_nr_grant_frames; i++)
{
- gnttab_list[i] = (grant_ref_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
+ gnttab_list[i] = (grant_ref_t *)
+ malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
if (!gnttab_list[i])
goto grow_nomem;
@@ -405,12 +402,12 @@ grow_gnttab_list(unsigned int more_frames)
check_free_callbacks();
- return 0;
+ return (0);
grow_nomem:
for ( ; i >= nr_grant_frames; i--)
free(gnttab_list[i], M_DEVBUF);
- return (-ENOMEM);
+ return (ENOMEM);
}
static unsigned int
@@ -464,6 +461,8 @@ unmap_pte_fn(pte_t *pte, struct page *pmd_page,
}
#endif
+#ifndef XENHVM
+
static int
gnttab_map(unsigned int start_idx, unsigned int end_idx)
{
@@ -486,50 +485,117 @@ gnttab_map(unsigned int start_idx, unsigned int end_idx)
free(frames, M_DEVBUF);
return (ENOSYS);
}
- PANIC_IF(rc || setup.status);
+ KASSERT(!(rc || setup.status),
+ ("unexpected result from grant_table_op"));
if (shared == NULL) {
vm_offset_t area;
area = kmem_alloc_nofault(kernel_map,
PAGE_SIZE * max_nr_grant_frames());
- PANIC_IF(area == 0);
+ KASSERT(area, ("can't allocate VM space for grant table"));
shared = (grant_entry_t *)area;
}
+
for (i = 0; i < nr_gframes; i++)
PT_SET_MA(((caddr_t)shared) + i*PAGE_SIZE,
((vm_paddr_t)frames[i]) << PAGE_SHIFT | PG_RW | PG_V);
free(frames, M_DEVBUF);
- return 0;
+ return (0);
}
int
gnttab_resume(void)
{
+
if (max_nr_grant_frames() < nr_grant_frames)
- return -ENOSYS;
- return gnttab_map(0, nr_grant_frames - 1);
+ return (ENOSYS);
+ return (gnttab_map(0, nr_grant_frames - 1));
}
int
gnttab_suspend(void)
{
- int i, pages;
+ int i;
+
+ for (i = 0; i < nr_grant_frames; i++)
+ pmap_kremove((vm_offset_t) shared + i * PAGE_SIZE);
+
+ return (0);
+}
+
+#else /* XENHVM */
- pages = (PAGE_SIZE*nr_grant_frames) >> PAGE_SHIFT;
+#include <dev/xen/xenpci/xenpcivar.h>
- for (i = 0; i < pages; i++)
- PT_SET_MA(shared + (i*PAGE_SIZE), (vm_paddr_t)0);
+static vm_paddr_t resume_frames;
+
+static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+{
+ struct xen_add_to_physmap xatp;
+ unsigned int i = end_idx;
+
+ /*
+ * Loop backwards, so that the first hypercall has the largest index,
+ * ensuring that the table will grow only once.
+ */
+ do {
+ xatp.domid = DOMID_SELF;
+ xatp.idx = i;
+ xatp.space = XENMAPSPACE_grant_table;
+ xatp.gpfn = (resume_frames >> PAGE_SHIFT) + i;
+ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+ panic("HYPERVISOR_memory_op failed to map gnttab");
+ } while (i-- > start_idx);
+
+ if (shared == NULL) {
+ vm_offset_t area;
+
+ area = kmem_alloc_nofault(kernel_map,
+ PAGE_SIZE * max_nr_grant_frames());
+ KASSERT(area, ("can't allocate VM space for grant table"));
+ shared = (grant_entry_t *)area;
+ }
+
+ for (i = start_idx; i <= end_idx; i++) {
+ pmap_kenter((vm_offset_t) shared + i * PAGE_SIZE,
+ resume_frames + i * PAGE_SIZE);
+ }
return (0);
}
+int
+gnttab_resume(void)
+{
+ int error;
+ unsigned int max_nr_gframes, nr_gframes;
+
+ nr_gframes = nr_grant_frames;
+ max_nr_gframes = max_nr_grant_frames();
+ if (max_nr_gframes < nr_gframes)
+ return (ENOSYS);
+
+ if (!resume_frames) {
+ error = xenpci_alloc_space(PAGE_SIZE * max_nr_gframes,
+ &resume_frames);
+ if (error) {
+ printf("error mapping gnttab share frames\n");
+ return (error);
+ }
+ }
+
+ return (gnttab_map(0, nr_gframes - 1));
+}
+
+#endif
+
static int
gnttab_expand(unsigned int req_entries)
{
- int rc;
+ int error;
unsigned int cur, extra;
cur = nr_grant_frames;
@@ -538,10 +604,11 @@ gnttab_expand(unsigned int req_entries)
if (cur + extra > max_nr_grant_frames())
return (ENOSPC);
- if ((rc = gnttab_map(cur, cur + extra - 1)) == 0)
- rc = grow_gnttab_list(extra);
+ error = gnttab_map(cur, cur + extra - 1);
+ if (!error)
+ error = grow_gnttab_list(extra);
- return rc;
+ return (error);
}
int
@@ -552,7 +619,7 @@ gnttab_init()
unsigned int nr_init_grefs;
if (!is_running_on_xen())
- return -ENODEV;
+ return (ENODEV);
nr_grant_frames = 1;
boot_max_nr_grant_frames = __max_nr_grant_frames();
@@ -571,7 +638,8 @@ gnttab_init()
return (ENOMEM);
for (i = 0; i < nr_grant_frames; i++) {
- gnttab_list[i] = (grant_ref_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
+ gnttab_list[i] = (grant_ref_t *)
+ malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
if (gnttab_list[i] == NULL)
goto ini_nomem;
}
@@ -588,8 +656,10 @@ gnttab_init()
gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
gnttab_free_head = NR_RESERVED_ENTRIES;
- printk("Grant table initialized\n");
- return 0;
+ if (bootverbose)
+ printf("Grant table initialized\n");
+
+ return (0);
ini_nomem:
for (i--; i >= 0; i--)
diff --git a/sys/xen/gnttab.h b/sys/xen/gnttab.h
index bcefbbc131bf..8348af5351f1 100644
--- a/sys/xen/gnttab.h
+++ b/sys/xen/gnttab.h
@@ -36,10 +36,12 @@
#ifndef __ASM_GNTTAB_H__
+#include <xen/interface/grant_table.h>
+
#include <xen/hypervisor.h>
#include <xen/interface/grant_table.h>
#include <machine/xen/xen-os.h>
-#include <machine/xen/features.h>
+#include <xen/features.h>
struct gnttab_free_callback {
struct gnttab_free_callback *next;
@@ -50,6 +52,10 @@ struct gnttab_free_callback {
int gnttab_init(void);
+/*
+ * Allocate a grant table reference and return it in *result. Returns
+ * zero on success or errno on error.
+ */
int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
int flags, grant_ref_t *result);
@@ -68,7 +74,7 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref);
*/
void gnttab_end_foreign_access(grant_ref_t ref, void *page);
-int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn, grant_ref_t *result);
unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
@@ -104,6 +110,10 @@ void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
int gnttab_suspend(void);
int gnttab_resume(void);
+#if 0
+
+#include <xen/features.h>
+
static inline void
gnttab_set_map_op(struct gnttab_map_grant_ref *map, vm_paddr_t addr,
uint32_t flags, grant_ref_t ref, domid_t domid)
@@ -149,5 +159,6 @@ gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, vm_paddr_t addr,
unmap->handle = handle;
}
+#endif
#endif /* __ASM_GNTTAB_H__ */
diff --git a/sys/xen/hypervisor.h b/sys/xen/hypervisor.h
index 369b0c4d5b6c..0d93f66dbd18 100644
--- a/sys/xen/hypervisor.h
+++ b/sys/xen/hypervisor.h
@@ -8,11 +8,19 @@
* $FreeBSD$
*/
-#ifndef __HYPERVISOR_H__
-#define __HYPERVISOR_H__
+#ifndef __XEN_HYPERVISOR_H__
+#define __XEN_HYPERVISOR_H__
+
+#ifdef XENHVM
+
+#define is_running_on_xen() (HYPERVISOR_shared_info != NULL)
+
+#else
#define is_running_on_xen() 1
+#endif
+
#ifdef PAE
#ifndef CONFIG_X86_PAE
#define CONFIG_X86_PAE
@@ -27,6 +35,7 @@
#include <xen/interface/physdev.h>
#include <xen/interface/sched.h>
#include <xen/interface/callback.h>
+#include <xen/interface/memory.h>
#include <machine/xen/hypercall.h>
#if defined(__amd64__)
@@ -131,7 +140,7 @@ MULTI_update_va_mapping(
mcl->op = __HYPERVISOR_update_va_mapping;
mcl->args[0] = va;
#if defined(__amd64__)
- mcl->args[1] = new_val.pte;
+ mcl->args[1] = new_val;
#elif defined(PAE)
mcl->args[1] = (uint32_t)(new_val & 0xffffffff) ;
mcl->args[2] = (uint32_t)(new_val >> 32);
@@ -142,4 +151,4 @@ MULTI_update_va_mapping(
mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
}
-#endif /* __HYPERVISOR_H__ */
+#endif /* __XEN_HYPERVISOR_H__ */
diff --git a/sys/xen/interface/arch-x86/xen.h b/sys/xen/interface/arch-x86/xen.h
index 038048ef279d..2c878ef464c5 100644
--- a/sys/xen/interface/arch-x86/xen.h
+++ b/sys/xen/interface/arch-x86/xen.h
@@ -32,7 +32,8 @@
#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
typedef struct { type *p; } __guest_handle_ ## name
#else
-#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
+#error "using old handle"
+#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
typedef type * __guest_handle_ ## name
#endif
@@ -50,7 +51,7 @@
#if defined(__i386__)
#include <xen/interface/arch-x86/xen-x86_32.h>
#elif defined(__x86_64__)
-#include "xen-x86_64.h"
+#include <xen/interface/arch-x86/xen-x86_64.h>
#endif
#ifndef __ASSEMBLY__
diff --git a/sys/xen/interface/hvm/params.h b/sys/xen/interface/hvm/params.h
index 5f75ed78e8a7..6befa78df8a0 100644
--- a/sys/xen/interface/hvm/params.h
+++ b/sys/xen/interface/hvm/params.h
@@ -21,7 +21,7 @@
#ifndef __XEN_PUBLIC_HVM_PARAMS_H__
#define __XEN_PUBLIC_HVM_PARAMS_H__
-#include "hvm_op.h"
+#include <xen/interface/hvm/hvm_op.h>
/*
* Parameter space for HVMOP_{set,get}_param.
diff --git a/sys/xen/reboot.c b/sys/xen/reboot.c
new file mode 100644
index 000000000000..892dfbf3c91a
--- /dev/null
+++ b/sys/xen/reboot.c
@@ -0,0 +1,262 @@
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * Copyright (c) 2004-2006,2008 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/reboot.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <machine/xen/xen-os.h>
+#include <xen/hypervisor.h>
+#include <xen/gnttab.h>
+#include <xen/xen_intr.h>
+#include <xen/xenbus/xenbusvar.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#ifdef XENHVM
+
+#include <dev/xen/xenpci/xenpcivar.h>
+
+#else
+
+static void xen_suspend(void);
+
+#endif
+
+static void
+shutdown_handler(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ char *str;
+ struct xenbus_transaction xbt;
+ int error, howto;
+
+ howto = 0;
+
+ again:
+ error = xenbus_transaction_start(&xbt);
+ if (error)
+ return;
+
+ error = xenbus_read(xbt, "control", "shutdown", NULL, (void **) &str);
+
+	/* Ignore read errors and empty reads; free any zero-length string. */
+	if (error || strlen(str) == 0) {
+		xenbus_transaction_end(xbt, 1);
+		if (!error) free(str, M_DEVBUF);
+		return;
+	}
+ xenbus_write(xbt, "control", "shutdown", "");
+
+ error = xenbus_transaction_end(xbt, 0);
+ if (error == EAGAIN) {
+ free(str, M_DEVBUF);
+ goto again;
+ }
+
+ if (strcmp(str, "reboot") == 0)
+ howto = 0;
+ else if (strcmp(str, "poweroff") == 0)
+ howto |= (RB_POWEROFF | RB_HALT);
+ else if (strcmp(str, "halt") == 0)
+#ifdef XENHVM
+ /*
+ * We rely on acpi powerdown to halt the VM.
+ */
+ howto |= (RB_POWEROFF | RB_HALT);
+#else
+ howto |= RB_HALT;
+#endif
+ else if (strcmp(str, "suspend") == 0)
+ howto = -1;
+ else {
+ printf("Ignoring shutdown request: %s\n", str);
+ goto done;
+ }
+
+ if (howto == -1) {
+ xen_suspend();
+ goto done;
+ }
+
+ shutdown_nice(howto);
+ done:
+ free(str, M_DEVBUF);
+}
+
+#ifndef XENHVM
+
+/*
+ * In HV mode, we let acpi take care of halts and reboots.
+ */
+
+static void
+xen_shutdown_final(void *arg, int howto)
+{
+
+ if (howto & (RB_HALT | RB_POWEROFF))
+ HYPERVISOR_shutdown(SHUTDOWN_poweroff);
+ else
+ HYPERVISOR_shutdown(SHUTDOWN_reboot);
+}
+
+#endif
+
+static struct xenbus_watch shutdown_watch = {
+ .node = "control/shutdown",
+ .callback = shutdown_handler
+};
+
+static void
+setup_shutdown_watcher(void *unused)
+{
+
+ if (register_xenbus_watch(&shutdown_watch))
+ printf("Failed to set shutdown watcher\n");
+#ifndef XENHVM
+ EVENTHANDLER_REGISTER(shutdown_final, xen_shutdown_final, NULL,
+ SHUTDOWN_PRI_LAST);
+#endif
+}
+
+SYSINIT(shutdown, SI_SUB_PSEUDO, SI_ORDER_ANY, setup_shutdown_watcher, NULL);
+
+#ifndef XENHVM
+
+extern void xencons_suspend(void);
+extern void xencons_resume(void);
+
+static void
+xen_suspend()
+{
+ int i, j, k, fpp;
+ unsigned long max_pfn, start_info_mfn;
+
+#ifdef SMP
+ cpumask_t map;
+ /*
+ * Bind us to CPU 0 and stop any other VCPUs.
+ */
+ mtx_lock_spin(&sched_lock);
+ sched_bind(curthread, 0);
+ mtx_unlock_spin(&sched_lock);
+ KASSERT(PCPU_GET(cpuid) == 0, ("xen_suspend: not running on cpu 0"));
+
+ map = PCPU_GET(other_cpus) & ~stopped_cpus;
+ if (map)
+ stop_cpus(map);
+#endif
+	if (DEVICE_SUSPEND(root_bus) != 0) {
+		printf("xen_suspend: device_suspend failed\n");
+#ifdef SMP
+		if (map)
+			restart_cpus(map);
+#endif
+		return;
+	}
+ local_irq_disable();
+
+ xencons_suspend();
+ gnttab_suspend();
+
+ max_pfn = HYPERVISOR_shared_info->arch.max_pfn;
+
+ void *shared_info = HYPERVISOR_shared_info;
+ HYPERVISOR_shared_info = NULL;
+ pmap_kremove((vm_offset_t) shared_info);
+ PT_UPDATES_FLUSH();
+
+ xen_start_info->store_mfn = MFNTOPFN(xen_start_info->store_mfn);
+ xen_start_info->console.domU.mfn = MFNTOPFN(xen_start_info->console.domU.mfn);
+
+ /*
+ * We'll stop somewhere inside this hypercall. When it returns,
+ * we'll start resuming after the restore.
+ */
+ start_info_mfn = VTOMFN(xen_start_info);
+ pmap_suspend();
+ HYPERVISOR_suspend(start_info_mfn);
+ pmap_resume();
+
+ pmap_kenter_ma((vm_offset_t) shared_info, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = shared_info;
+
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ VTOMFN(xen_pfn_to_mfn_frame_list_list);
+
+ fpp = PAGE_SIZE/sizeof(unsigned long);
+ for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
+ if ((j % fpp) == 0) {
+ k++;
+ xen_pfn_to_mfn_frame_list_list[k] =
+ VTOMFN(xen_pfn_to_mfn_frame_list[k]);
+ j = 0;
+ }
+ xen_pfn_to_mfn_frame_list[k][j] =
+ VTOMFN(&xen_phys_machine[i]);
+ }
+ HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
+
+ gnttab_resume();
+ irq_resume();
+ local_irq_enable();
+ xencons_resume();
+
+#ifdef CONFIG_SMP /* XXX: Linux symbol, never defined on FreeBSD — VCPUs are not re-prepared after resume; verify */
+ for_each_cpu(i)
+ vcpu_prepare(i);
+
+#endif
+ /*
+ * Only resume xenbus /after/ we've prepared our VCPUs; otherwise
+ * the VCPU hotplug callback can race with our vcpu_prepare
+ */
+ DEVICE_RESUME(root_bus);
+
+#ifdef SMP
+ sched_unbind(curthread);
+ if (map)
+ restart_cpus(map);
+#endif
+}
+
+#endif
diff --git a/sys/xen/xen_intr.h b/sys/xen/xen_intr.h
index 528fa7f40d80..68f594333fdd 100644
--- a/sys/xen/xen_intr.h
+++ b/sys/xen/xen_intr.h
@@ -29,37 +29,63 @@
#define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE)
#define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE)
-/* Dynamic binding of event channels and VIRQ sources to Linux IRQ space. */
-extern void unbind_from_irq(int irq);
+/*
+ * Dynamic binding of event channels and VIRQ sources to guest IRQ space.
+ */
+/*
+ * Bind a caller port event channel to an interrupt handler. If
+ * successful, the guest IRQ number is returned in *irqp. Return zero
+ * on success or errno otherwise.
+ */
extern int bind_caller_port_to_irqhandler(unsigned int caller_port,
const char *devname, driver_intr_t handler, void *arg,
unsigned long irqflags, unsigned int *irqp);
+
+/*
+ * Bind a listening port to an interrupt handler. If successful, the
+ * guest IRQ number is returned in *irqp. Return zero on success or
+ * errno otherwise.
+ */
extern int bind_listening_port_to_irqhandler(unsigned int remote_domain,
- const char *devname, driver_intr_t handler, void *arg, unsigned long irqflags,
- unsigned int *irqp);
+ const char *devname, driver_intr_t handler, void *arg,
+ unsigned long irqflags, unsigned int *irqp);
+
+/*
+ * Bind a VIRQ to an interrupt handler. If successful, the guest IRQ
+ * number is returned in *irqp. Return zero on success or errno
+ * otherwise.
+ */
extern int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
const char *devname, driver_filter_t filter, driver_intr_t handler,
void *arg, unsigned long irqflags, unsigned int *irqp);
-extern int bind_ipi_to_irqhandler(unsigned int ipi,
- unsigned int cpu,
- const char *devname,
- driver_filter_t handler,
- unsigned long irqflags,
- unsigned int *irqp);
-extern int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
- unsigned int remote_port,
- const char *devname,
- driver_filter_t filter,
- driver_intr_t handler,
- unsigned long irqflags,
- unsigned int *irqp);
+/*
+ * Bind an IPI to an interrupt handler. If successful, the guest
+ * IRQ number is returned in *irqp. Return zero on success or errno
+ * otherwise.
+ */
+extern int bind_ipi_to_irqhandler(unsigned int ipi, unsigned int cpu,
+ const char *devname, driver_filter_t filter,
+ unsigned long irqflags, unsigned int *irqp);
+/*
+ * Bind an interdomain event channel to an interrupt handler. If
+ * successful, the guest IRQ number is returned in *irqp. Return zero
+ * on success or errno otherwise.
+ */
+extern int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+ unsigned int remote_port, const char *devname,
+ driver_filter_t filter, driver_intr_t handler,
+ unsigned long irqflags, unsigned int *irqp);
+/*
+ * Unbind an interrupt handler using the guest IRQ number returned
+ * when it was bound.
+ */
+extern void unbind_from_irqhandler(unsigned int irq);
-extern void unbind_from_irqhandler(unsigned int evtchn);
-static __inline__ int irq_cannonicalize(int irq)
+static __inline__ int irq_cannonicalize(unsigned int irq)
{
return (irq == 2) ? 9 : irq;
}
diff --git a/sys/xen/xenbus/xenbus_probe.c b/sys/xen/xenbus/xenbus_probe.c
index 3d2cb4bb81ca..f04f8eca4638 100644
--- a/sys/xen/xenbus/xenbus_probe.c
+++ b/sys/xen/xenbus/xenbus_probe.c
@@ -565,7 +565,6 @@ xenbus_write_ivar(device_t dev, device_t child, int index, uintptr_t value)
return (ENOENT);
}
-SYSCTL_DECL(_dev);
SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD, NULL, "Xen");
SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xen_store_evtchn, 0, "");
SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, "");
diff --git a/sys/xen/xenbus/xenbus_xs.c b/sys/xen/xenbus/xenbus_xs.c
index 9e0f7798fcf7..806955678457 100644
--- a/sys/xen/xenbus/xenbus_xs.c
+++ b/sys/xen/xenbus/xenbus_xs.c
@@ -142,21 +142,17 @@ xs_read_reply(enum xsd_sockmsg_type *type, unsigned int *len, void **result)
mtx_lock(&xs_state.reply_lock);
while (TAILQ_EMPTY(&xs_state.reply_list)) {
- while (TAILQ_EMPTY(&xs_state.reply_list)) {
- error = mtx_sleep(&xs_state.reply_waitq,
- &xs_state.reply_lock,
- PCATCH, "xswait", hz/10);
- if (error && error != EWOULDBLOCK) {
- mtx_unlock(&xs_state.reply_lock);
- return (error);
- }
-
+ while (TAILQ_EMPTY(&xs_state.reply_list)) {
+ error = mtx_sleep(&xs_state.reply_waitq,
+ &xs_state.reply_lock,
+ PCATCH, "xswait", hz/10);
+ if (error && error != EWOULDBLOCK) {
+ mtx_unlock(&xs_state.reply_lock);
+ return (error);
}
-
-
}
+ }
-
msg = TAILQ_FIRST(&xs_state.reply_list);
TAILQ_REMOVE(&xs_state.reply_list, msg, list);
@@ -202,7 +198,8 @@ xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void **result)
sx_xlock(&xs_state.request_mutex);
- error = xb_write(msg, sizeof(*msg) + msg->len, &xs_state.request_mutex.lock_object);
+ error = xb_write(msg, sizeof(*msg) + msg->len,
+ &xs_state.request_mutex.lock_object);
if (error) {
msg->type = XS_ERROR;
} else {
@@ -243,7 +240,8 @@ xs_talkv(struct xenbus_transaction t, enum xsd_sockmsg_type type,
sx_xlock(&xs_state.request_mutex);
- error = xb_write(&msg, sizeof(msg), &xs_state.request_mutex.lock_object);
+ error = xb_write(&msg, sizeof(msg),
+ &xs_state.request_mutex.lock_object);
if (error) {
sx_xunlock(&xs_state.request_mutex);
printf("xs_talkv failed %d\n", error);
@@ -251,7 +249,8 @@ xs_talkv(struct xenbus_transaction t, enum xsd_sockmsg_type type,
}
for (i = 0; i < num_vecs; i++) {
- error = xb_write(iovec[i].iov_base, iovec[i].iov_len, &xs_state.request_mutex.lock_object);
+ error = xb_write(iovec[i].iov_base, iovec[i].iov_len,
+ &xs_state.request_mutex.lock_object);
if (error) {
sx_xunlock(&xs_state.request_mutex);
printf("xs_talkv failed %d\n", error);
@@ -791,7 +790,8 @@ xs_process_msg(enum xsd_sockmsg_type *type)
msg = malloc(sizeof(*msg), M_DEVBUF, M_WAITOK);
mtx_lock(&xs_state.reply_lock);
- error = xb_read(&msg->hdr, sizeof(msg->hdr), &xs_state.reply_lock.lock_object);
+ error = xb_read(&msg->hdr, sizeof(msg->hdr),
+ &xs_state.reply_lock.lock_object);
mtx_unlock(&xs_state.reply_lock);
if (error) {
free(msg, M_DEVBUF);
@@ -800,7 +800,8 @@ xs_process_msg(enum xsd_sockmsg_type *type)
body = malloc(msg->hdr.len + 1, M_DEVBUF, M_WAITOK);
mtx_lock(&xs_state.reply_lock);
- error = xb_read(body, msg->hdr.len, &xs_state.reply_lock.lock_object);
+ error = xb_read(body, msg->hdr.len,
+ &xs_state.reply_lock.lock_object);
mtx_unlock(&xs_state.reply_lock);
if (error) {
free(body, M_DEVBUF);