author     Peter Wemm <peter@FreeBSD.org>    2002-07-12 07:56:11 +0000
committer  Peter Wemm <peter@FreeBSD.org>    2002-07-12 07:56:11 +0000
commit     f1b665c8fe3c87d9baf30ea71abacecb5345238c (patch)
tree       8b641ea8c1ce08eac805ae6a630e6c6139bca09e
parent     90833c99de4bdef6b8446b1b3dbea61a86c56aa9 (diff)
download   src-f1b665c8fe3c87d9baf30ea71abacecb5345238c.tar.gz
           src-f1b665c8fe3c87d9baf30ea71abacecb5345238c.zip
Revive backed out pmap related changes from Feb 2002. The highlights are:
- It actually works this time, honest!
- Fine grained TLB shootdowns for SMP on i386. IPI's are very expensive,
  so try and optimize things where possible.
- Introduce ranged shootdowns that can be done as a single IPI.
- PG_G support for i386
- Specific-cpu targeted shootdowns. For example, there is no sense in
  globally purging the TLB cache for where we are stealing a page from
  the local unshared process on the local cpu. Use pm_active to track
  this.
- Add some instrumentation for the tlb shootdown code.
- Rip out SMP code from <machine/cpufunc.h>
- Try and fix some very bogus PG_G and PG_PS interactions that were bad
  enough to cause vm86 bios calls to break. vm86 depended on our existing
  bugs and this was the cause of the VESA panics last time.
- Fix the silly one-line error that caused the 'panic: bad pte' last time.
- Fix a couple of other silly one-line errors that should have caused more
  pain than they did.

Some more work is needed:
- pmap_{zero,copy}_page[_idle]. These can be done without IPI's if we
  have a hook in cpu_switch.
- The IPI handlers need some cleanup. I have a bogus %ds load that can
  be avoided.
- APTD handling is rather bogus and appears to be a large source of
  global TLB IPI shootdowns for no really good reason.

I see speedups of between 1.5% and ~4% on buildworlds in a while 1 loop.
I expect to see a bigger difference when there is significant pageout
activity or the system otherwise has memory shortages.

I have backed out a few optimizations that I had been using over the last
few days in order to be a little more conservative. I'll revisit these
again over the next few days as the dust settles.

New option: DISABLE_PG_G - In case I missed something.
Notes:
    svn path=/head/; revision=99862
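For orientation before the diff: the shootdown paths added below all follow the same
handshake. The initiating CPU publishes the address (or range) in shared variables,
sends a single IPI, and spins until every targeted CPU has acknowledged by bumping a
shared counter; targeted shootdowns first narrow the IPI to the CPUs in the pmap's
pm_active set. The sketch below is a simplified illustration of that pattern, not the
committed code — the helpers send_ipi(), cpu_relax(), atomic_add() and count_bits()
are placeholders; the real versions are smp_tlb_shootdown(), smp_targeted_tlb_shootdown()
and the Xinvltlb/Xinvlpg/Xinvlrng handlers in the diff.

    /*
     * Simplified sketch of the single-IPI ranged shootdown handshake.
     * Helper names are hypothetical; see smp_tlb_shootdown() and
     * Xinvlrng below for the real implementation.
     */
    static volatile vm_offset_t shoot_sva, shoot_eva;  /* published range */
    static volatile int         shoot_wait;            /* acks from target CPUs */

    static void
    shootdown_range(vm_offset_t sva, vm_offset_t eva, u_int target_mask)
    {
            int ntargets = count_bits(target_mask); /* popcount of targeted CPUs */

            shoot_sva = sva;                        /* publish the range */
            shoot_eva = eva;
            shoot_wait = 0;
            send_ipi(target_mask, IPI_INVLRNG);     /* one IPI covers the whole range */
            while (shoot_wait < ntargets)           /* wait for every target to ack */
                    cpu_relax();
    }

    static void
    ipi_invlrng_handler(void)                       /* runs on each targeted CPU */
    {
            vm_offset_t va;

            for (va = shoot_sva; va < shoot_eva; va += PAGE_SIZE)
                    invlpg(va);                     /* per-page invalidation */
            atomic_add(&shoot_wait, 1);             /* acknowledge completion */
    }

The point of the pm_active tracking mentioned above is simply that target_mask can be
narrowed to the CPUs that actually have the pmap loaded, so a page stolen from a process
running only on the local CPU needs no IPI at all.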
-rw-r--r--  sys/amd64/amd64/apic_vector.S      87
-rw-r--r--  sys/amd64/amd64/bios.c             15
-rw-r--r--  sys/amd64/amd64/db_interface.c      4
-rw-r--r--  sys/amd64/amd64/locore.S           24
-rw-r--r--  sys/amd64/amd64/locore.s           24
-rw-r--r--  sys/amd64/amd64/mp_machdep.c      252
-rw-r--r--  sys/amd64/amd64/mptable.c         252
-rw-r--r--  sys/amd64/amd64/pmap.c            510
-rw-r--r--  sys/amd64/amd64/support.S          36
-rw-r--r--  sys/amd64/amd64/support.s          36
-rw-r--r--  sys/amd64/include/cpufunc.h       168
-rw-r--r--  sys/amd64/include/mptable.h       252
-rw-r--r--  sys/amd64/include/pmap.h            9
-rw-r--r--  sys/amd64/include/smp.h            10
-rw-r--r--  sys/amd64/isa/intr_machdep.h       20
-rw-r--r--  sys/conf/options.i386               3
-rw-r--r--  sys/conf/options.pc98               3
-rw-r--r--  sys/i386/i386/apic_vector.s        87
-rw-r--r--  sys/i386/i386/bios.c               15
-rw-r--r--  sys/i386/i386/db_interface.c        4
-rw-r--r--  sys/i386/i386/locore.s             24
-rw-r--r--  sys/i386/i386/mp_machdep.c        252
-rw-r--r--  sys/i386/i386/mpapic.c              3
-rw-r--r--  sys/i386/i386/mptable.c           252
-rw-r--r--  sys/i386/i386/pmap.c              510
-rw-r--r--  sys/i386/i386/support.s            36
-rw-r--r--  sys/i386/i386/vm86.c                2
-rw-r--r--  sys/i386/include/cpufunc.h        168
-rw-r--r--  sys/i386/include/mptable.h        252
-rw-r--r--  sys/i386/include/pmap.h             9
-rw-r--r--  sys/i386/include/smp.h             10
-rw-r--r--  sys/i386/isa/apic_vector.s         87
-rw-r--r--  sys/i386/isa/intr_machdep.h        20
-rw-r--r--  sys/kern/subr_witness.c             3
34 files changed, 2426 insertions, 1013 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 8490b1b14ca5..569ed501a468 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -260,30 +260,107 @@ Xspuriousint:
iret
/*
- * Handle TLB shootdowns.
+ * Global address space TLB shootdown.
*/
.text
SUPERALIGN_TEXT
.globl Xinvltlb
Xinvltlb:
pushl %eax
+ pushl %ds
+ movl $KDSEL, %eax /* Kernel data selector */
+ mov %ax, %ds
#ifdef COUNT_XINVLTLB_HITS
pushl %fs
- movl $KPSEL, %eax
+ movl $KPSEL, %eax /* Private space selector */
mov %ax, %fs
movl PCPU(CPUID), %eax
popl %fs
- ss
- incl xhits(,%eax,4)
+ incl xhits_gbl(,%eax,4)
#endif /* COUNT_XINVLTLB_HITS */
movl %cr3, %eax /* invalidate the TLB */
movl %eax, %cr3
- ss /* stack segment, avoid %ds load */
movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */
+ lock
+ incl smp_tlb_wait
+
+ popl %ds
+ popl %eax
+ iret
+
+/*
+ * Single page TLB shootdown
+ */
+ .text
+ SUPERALIGN_TEXT
+ .globl Xinvlpg
+Xinvlpg:
+ pushl %eax
+ pushl %ds
+ movl $KDSEL, %eax /* Kernel data selector */
+ mov %ax, %ds
+
+#ifdef COUNT_XINVLTLB_HITS
+ pushl %fs
+ movl $KPSEL, %eax /* Private space selector */
+ mov %ax, %fs
+ movl PCPU(CPUID), %eax
+ popl %fs
+ incl xhits_pg(,%eax,4)
+#endif /* COUNT_XINVLTLB_HITS */
+
+ movl smp_tlb_addr1, %eax
+ invlpg (%eax) /* invalidate single page */
+
+ movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */
+
+ lock
+ incl smp_tlb_wait
+
+ popl %ds
+ popl %eax
+ iret
+
+/*
+ * Page range TLB shootdown.
+ */
+ .text
+ SUPERALIGN_TEXT
+ .globl Xinvlrng
+Xinvlrng:
+ pushl %eax
+ pushl %edx
+ pushl %ds
+ movl $KDSEL, %eax /* Kernel data selector */
+ mov %ax, %ds
+
+#ifdef COUNT_XINVLTLB_HITS
+ pushl %fs
+ movl $KPSEL, %eax /* Private space selector */
+ mov %ax, %fs
+ movl PCPU(CPUID), %eax
+ popl %fs
+ incl xhits_rng(,%eax,4)
+#endif /* COUNT_XINVLTLB_HITS */
+
+ movl smp_tlb_addr1, %edx
+ movl smp_tlb_addr2, %eax
+1: invlpg (%edx) /* invalidate single page */
+ addl $PAGE_SIZE, %edx
+ cmpl %edx, %eax
+ jb 1b
+
+ movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */
+
+ lock
+ incl smp_tlb_wait
+
+ popl %ds
+ popl %edx
popl %eax
iret
diff --git a/sys/amd64/amd64/bios.c b/sys/amd64/amd64/bios.c
index 0312adf00bff..6e0837cb89b6 100644
--- a/sys/amd64/amd64/bios.c
+++ b/sys/amd64/amd64/bios.c
@@ -323,7 +323,8 @@ bios16(struct bios_args *args, char *fmt, ...)
va_list ap;
int flags = BIOSCODE_FLAG | BIOSDATA_FLAG;
u_int i, arg_start, arg_end;
- u_int *pte, *ptd;
+ pt_entry_t *pte;
+ pd_entry_t *ptd;
arg_start = 0xffffffff;
arg_end = 0;
@@ -382,19 +383,19 @@ bios16(struct bios_args *args, char *fmt, ...)
args->seg.code32.base = (u_int)&bios16_jmp & PG_FRAME;
args->seg.code32.limit = 0xffff;
- ptd = (u_int *)rcr3();
+ ptd = (pd_entry_t *)rcr3();
if (ptd == (u_int *)IdlePTD) {
/*
* no page table, so create one and install it.
*/
- pte = (u_int *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
- ptd = (u_int *)((u_int)ptd + KERNBASE);
+ pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
+ ptd = (pd_entry_t *)((u_int)ptd + KERNBASE);
*ptd = vtophys(pte) | PG_RW | PG_V;
} else {
/*
* this is a user-level page table
*/
- pte = (u_int *)&PTmap;
+ pte = PTmap;
}
/*
* install pointer to page 0. we don't need to flush the tlb,
@@ -451,7 +452,7 @@ bios16(struct bios_args *args, char *fmt, ...)
i = bios16_call(&args->r, stack_top);
- if (pte == (u_int *)&PTmap) {
+ if (pte == PTmap) {
*pte = 0; /* remove entry */
} else {
*ptd = 0; /* remove page table */
@@ -461,7 +462,7 @@ bios16(struct bios_args *args, char *fmt, ...)
/*
* XXX only needs to be invlpg(0) but that doesn't work on the 386
*/
- invltlb();
+ pmap_invalidate_all(kernel_pmap);
return (i);
}
diff --git a/sys/amd64/amd64/db_interface.c b/sys/amd64/amd64/db_interface.c
index 2ba81daeefee..ec32a58a14fb 100644
--- a/sys/amd64/amd64/db_interface.c
+++ b/sys/amd64/amd64/db_interface.c
@@ -276,7 +276,7 @@ db_write_bytes(addr, size, data)
}
}
- invltlb();
+ pmap_invalidate_all(kernel_pmap);
}
dst = (char *)addr;
@@ -292,7 +292,7 @@ db_write_bytes(addr, size, data)
if (ptep1)
*ptep1 = oldmap1;
- invltlb();
+ pmap_invalidate_all(kernel_pmap);
}
}
diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S
index d06065da0524..94a3a103ac03 100644
--- a/sys/amd64/amd64/locore.S
+++ b/sys/amd64/amd64/locore.S
@@ -127,6 +127,7 @@ HIDENAME(tmpstk):
.globl bootinfo
bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */
+ .globl KERNend
KERNend: .long 0 /* phys addr end of kernel (just after bss) */
physfree: .long 0 /* phys addr of next free page */
@@ -381,12 +382,6 @@ begin:
movl IdlePTD,%esi
movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax)
- testl $CPUID_PGE, R(cpu_feature)
- jz 1f
- movl %cr4, %eax
- orl $CR4_PGE, %eax
- movl %eax, %cr4
-1:
pushl physfree /* value of first for init386(first) */
call init386 /* wire 386 chip for unix operation */
@@ -809,14 +804,7 @@ no_kernend:
jne map_read_write
#endif
xorl %edx,%edx
-
-#if !defined(SMP)
- testl $CPUID_PGE, R(cpu_feature)
- jz 2f
- orl $PG_G,%edx
-#endif
-
-2: movl $R(etext),%ecx
+ movl $R(etext),%ecx
addl $PAGE_MASK,%ecx
shrl $PAGE_SHIFT,%ecx
fillkptphys(%edx)
@@ -827,13 +815,7 @@ no_kernend:
andl $~PAGE_MASK, %eax
map_read_write:
movl $PG_RW,%edx
-#if !defined(SMP)
- testl $CPUID_PGE, R(cpu_feature)
- jz 1f
- orl $PG_G,%edx
-#endif
-
-1: movl R(KERNend),%ecx
+ movl R(KERNend),%ecx
subl %eax,%ecx
shrl $PAGE_SHIFT,%ecx
fillkptphys(%edx)
diff --git a/sys/amd64/amd64/locore.s b/sys/amd64/amd64/locore.s
index d06065da0524..94a3a103ac03 100644
--- a/sys/amd64/amd64/locore.s
+++ b/sys/amd64/amd64/locore.s
@@ -127,6 +127,7 @@ HIDENAME(tmpstk):
.globl bootinfo
bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */
+ .globl KERNend
KERNend: .long 0 /* phys addr end of kernel (just after bss) */
physfree: .long 0 /* phys addr of next free page */
@@ -381,12 +382,6 @@ begin:
movl IdlePTD,%esi
movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax)
- testl $CPUID_PGE, R(cpu_feature)
- jz 1f
- movl %cr4, %eax
- orl $CR4_PGE, %eax
- movl %eax, %cr4
-1:
pushl physfree /* value of first for init386(first) */
call init386 /* wire 386 chip for unix operation */
@@ -809,14 +804,7 @@ no_kernend:
jne map_read_write
#endif
xorl %edx,%edx
-
-#if !defined(SMP)
- testl $CPUID_PGE, R(cpu_feature)
- jz 2f
- orl $PG_G,%edx
-#endif
-
-2: movl $R(etext),%ecx
+ movl $R(etext),%ecx
addl $PAGE_MASK,%ecx
shrl $PAGE_SHIFT,%ecx
fillkptphys(%edx)
@@ -827,13 +815,7 @@ no_kernend:
andl $~PAGE_MASK, %eax
map_read_write:
movl $PG_RW,%edx
-#if !defined(SMP)
- testl $CPUID_PGE, R(cpu_feature)
- jz 1f
- orl $PG_G,%edx
-#endif
-
-1: movl R(KERNend),%ecx
+ movl R(KERNend),%ecx
subl %eax,%ecx
shrl $PAGE_SHIFT,%ecx
fillkptphys(%edx)
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 63fec0e9e9c9..29e9c6eb56fb 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt;
struct pcb stoppcbs[MAXCPU];
+#ifdef APIC_IO
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1;
+vm_offset_t smp_tlb_addr2;
+volatile int smp_tlb_wait;
+static struct mtx smp_tlb_mtx;
+#endif
+
/*
* Local data and functions.
*/
@@ -336,6 +344,9 @@ init_locks(void)
#ifdef USE_COMLOCK
mtx_init(&com_mtx, "com", NULL, MTX_SPIN);
#endif /* USE_COMLOCK */
+#ifdef APIC_IO
+ mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN);
+#endif
}
/*
@@ -605,6 +616,10 @@ mp_enable(u_int boot_addr)
/* install an inter-CPU IPI for TLB invalidation */
setidt(XINVLTLB_OFFSET, Xinvltlb,
SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLPG_OFFSET, Xinvlpg,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLRNG_OFFSET, Xinvlrng,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
/* install an inter-CPU IPI for forwarding hardclock() */
setidt(XHARDCLOCK_OFFSET, Xhardclock,
@@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr)
return 0; /* return FAILURE */
}
-#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS)
-u_int xhits[MAXCPU];
-SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits),
- "IU", "");
+#if defined(APIC_IO)
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+ sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+ sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+ sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+ 0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+ &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+ &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+ &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+ &ipi_masked_range_size, 0, "");
#endif
/*
* Flush the TLB on all other CPU's
+ */
+static void
+smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ u_int ncpu;
+ register_t eflags;
+
+ ncpu = mp_ncpus - 1; /* does not shootdown self */
+ if (ncpu < 1)
+ return; /* no other cpus */
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ ipi_all_but_self(vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+
+/*
+ * This is about as magic as it gets. fortune(1) has got similar code
+ * for reversing bits in a word. Who thinks up this stuff??
+ *
+ * Yes, it does appear to be consistently faster than:
+ * while (i = ffs(m)) {
+ * m >>= i;
+ * bits++;
+ * }
+ * and
+ * while (lsb = (m & -m)) { // This is magic too
+ * m &= ~lsb; // or: m ^= lsb
+ * bits++;
+ * }
+ * Both of these latter forms do some very strange things on gcc-3.1 with
+ * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
+ * There is probably an SSE or MMX popcnt instruction.
*
- * XXX: Needs to handshake and wait for completion before proceding.
+ * I wonder if this should be in libkern?
+ *
+ * XXX Stop the presses! Another one:
+ * static __inline u_int32_t
+ * popcnt1(u_int32_t v)
+ * {
+ * v -= ((v >> 1) & 0x55555555);
+ * v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+ * v = (v + (v >> 4)) & 0x0F0F0F0F;
+ * return (v * 0x01010101) >> 24;
+ * }
+ * The downside is that it has a multiply. With a pentium3 with
+ * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
+ * an imull, and in that case it is faster. In most other cases
+ * it appears slightly slower.
*/
+static __inline u_int32_t
+popcnt(u_int32_t m)
+{
+
+ m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1);
+ m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2);
+ m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4);
+ m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8);
+ m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16);
+ return m;
+}
+
+static void
+smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ int ncpu, othercpus;
+ register_t eflags;
+
+ othercpus = mp_ncpus - 1;
+ if (mask == (u_int)-1) {
+ ncpu = othercpus;
+ if (ncpu < 1)
+ return;
+ } else {
+ /* XXX there should be a pcpu self mask */
+ mask &= ~(1 << PCPU_GET(cpuid));
+ if (mask == 0)
+ return;
+ ncpu = popcnt(mask);
+ if (ncpu > othercpus) {
+ /* XXX this should be a panic offence */
+ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
+ ncpu, othercpus);
+ ncpu = othercpus;
+ }
+ /* XXX should be a panic, implied by mask == 0 above */
+ if (ncpu < 1)
+ return;
+ }
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ if (mask == (u_int)-1)
+ ipi_all_but_self(vector);
+ else
+ ipi_selected(mask, vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+#endif
+
void
smp_invltlb(void)
{
#if defined(APIC_IO)
- if (smp_started)
- ipi_all_but_self(IPI_INVLTLB);
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_global++;
+#endif
+ }
#endif /* APIC_IO */
}
void
-invlpg(u_int addr)
+smp_invlpg(vm_offset_t addr)
{
- __asm __volatile("invlpg (%0)"::"r"(addr):"memory");
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_range++;
+ ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
void
-invltlb(void)
+smp_masked_invltlb(u_int mask)
{
- u_long temp;
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_global++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3() is
- * inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
+void
+smp_masked_invlpg(u_int mask, vm_offset_t addr)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_range++;
+ ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
@@ -2251,7 +2455,7 @@ ap_init(void)
/* spin */ ;
/* BSP may have changed PTD while we were waiting */
- cpu_invltlb();
+ invltlb();
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
lidt(&r_idt);
@@ -2290,6 +2494,9 @@ ap_init(void)
/* Build our map of 'other' CPUs. */
PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
+ if (bootverbose)
+ apic_dump("ap_init()");
+
printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
if (smp_cpus == mp_ncpus) {
@@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame)
{
mtx_lock_spin(&sched_lock);
- statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame));
+ statclock_process(curthread->td_kse, TRAPF_PC(&frame),
+ TRAPF_USERMODE(&frame));
mtx_unlock_spin(&sched_lock);
}
diff --git a/sys/amd64/amd64/mptable.c b/sys/amd64/amd64/mptable.c
index 63fec0e9e9c9..29e9c6eb56fb 100644
--- a/sys/amd64/amd64/mptable.c
+++ b/sys/amd64/amd64/mptable.c
@@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt;
struct pcb stoppcbs[MAXCPU];
+#ifdef APIC_IO
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1;
+vm_offset_t smp_tlb_addr2;
+volatile int smp_tlb_wait;
+static struct mtx smp_tlb_mtx;
+#endif
+
/*
* Local data and functions.
*/
@@ -336,6 +344,9 @@ init_locks(void)
#ifdef USE_COMLOCK
mtx_init(&com_mtx, "com", NULL, MTX_SPIN);
#endif /* USE_COMLOCK */
+#ifdef APIC_IO
+ mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN);
+#endif
}
/*
@@ -605,6 +616,10 @@ mp_enable(u_int boot_addr)
/* install an inter-CPU IPI for TLB invalidation */
setidt(XINVLTLB_OFFSET, Xinvltlb,
SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLPG_OFFSET, Xinvlpg,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLRNG_OFFSET, Xinvlrng,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
/* install an inter-CPU IPI for forwarding hardclock() */
setidt(XHARDCLOCK_OFFSET, Xhardclock,
@@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr)
return 0; /* return FAILURE */
}
-#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS)
-u_int xhits[MAXCPU];
-SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits),
- "IU", "");
+#if defined(APIC_IO)
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+ sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+ sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+ sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+ 0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+ &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+ &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+ &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+ &ipi_masked_range_size, 0, "");
#endif
/*
* Flush the TLB on all other CPU's
+ */
+static void
+smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ u_int ncpu;
+ register_t eflags;
+
+ ncpu = mp_ncpus - 1; /* does not shootdown self */
+ if (ncpu < 1)
+ return; /* no other cpus */
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ ipi_all_but_self(vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+
+/*
+ * This is about as magic as it gets. fortune(1) has got similar code
+ * for reversing bits in a word. Who thinks up this stuff??
+ *
+ * Yes, it does appear to be consistently faster than:
+ * while (i = ffs(m)) {
+ * m >>= i;
+ * bits++;
+ * }
+ * and
+ * while (lsb = (m & -m)) { // This is magic too
+ * m &= ~lsb; // or: m ^= lsb
+ * bits++;
+ * }
+ * Both of these latter forms do some very strange things on gcc-3.1 with
+ * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
+ * There is probably an SSE or MMX popcnt instruction.
*
- * XXX: Needs to handshake and wait for completion before proceding.
+ * I wonder if this should be in libkern?
+ *
+ * XXX Stop the presses! Another one:
+ * static __inline u_int32_t
+ * popcnt1(u_int32_t v)
+ * {
+ * v -= ((v >> 1) & 0x55555555);
+ * v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+ * v = (v + (v >> 4)) & 0x0F0F0F0F;
+ * return (v * 0x01010101) >> 24;
+ * }
+ * The downside is that it has a multiply. With a pentium3 with
+ * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
+ * an imull, and in that case it is faster. In most other cases
+ * it appears slightly slower.
*/
+static __inline u_int32_t
+popcnt(u_int32_t m)
+{
+
+ m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1);
+ m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2);
+ m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4);
+ m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8);
+ m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16);
+ return m;
+}
+
+static void
+smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ int ncpu, othercpus;
+ register_t eflags;
+
+ othercpus = mp_ncpus - 1;
+ if (mask == (u_int)-1) {
+ ncpu = othercpus;
+ if (ncpu < 1)
+ return;
+ } else {
+ /* XXX there should be a pcpu self mask */
+ mask &= ~(1 << PCPU_GET(cpuid));
+ if (mask == 0)
+ return;
+ ncpu = popcnt(mask);
+ if (ncpu > othercpus) {
+ /* XXX this should be a panic offence */
+ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
+ ncpu, othercpus);
+ ncpu = othercpus;
+ }
+ /* XXX should be a panic, implied by mask == 0 above */
+ if (ncpu < 1)
+ return;
+ }
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ if (mask == (u_int)-1)
+ ipi_all_but_self(vector);
+ else
+ ipi_selected(mask, vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+#endif
+
void
smp_invltlb(void)
{
#if defined(APIC_IO)
- if (smp_started)
- ipi_all_but_self(IPI_INVLTLB);
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_global++;
+#endif
+ }
#endif /* APIC_IO */
}
void
-invlpg(u_int addr)
+smp_invlpg(vm_offset_t addr)
{
- __asm __volatile("invlpg (%0)"::"r"(addr):"memory");
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_range++;
+ ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
void
-invltlb(void)
+smp_masked_invltlb(u_int mask)
{
- u_long temp;
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_global++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3() is
- * inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
+void
+smp_masked_invlpg(u_int mask, vm_offset_t addr)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_range++;
+ ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
@@ -2251,7 +2455,7 @@ ap_init(void)
/* spin */ ;
/* BSP may have changed PTD while we were waiting */
- cpu_invltlb();
+ invltlb();
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
lidt(&r_idt);
@@ -2290,6 +2494,9 @@ ap_init(void)
/* Build our map of 'other' CPUs. */
PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
+ if (bootverbose)
+ apic_dump("ap_init()");
+
printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
if (smp_cpus == mp_ncpus) {
@@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame)
{
mtx_lock_spin(&sched_lock);
- statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame));
+ statclock_process(curthread->td_kse, TRAPF_PC(&frame),
+ TRAPF_USERMODE(&frame));
mtx_unlock_spin(&sched_lock);
}
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 87cd8b9ceb94..5de170703aef 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -68,7 +68,6 @@
* and to when physical maps must be made correct.
*/
-#include "opt_disable_pse.h"
#include "opt_pmap.h"
#include "opt_msgbuf.h"
#include "opt_kstack_pages.h"
@@ -85,6 +84,9 @@
#include <sys/user.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
+#ifdef SMP
+#include <sys/smp.h>
+#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -97,6 +99,7 @@
#include <vm/vm_pager.h>
#include <vm/uma.h>
+#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
@@ -162,6 +165,7 @@ static vm_object_t kptobj;
static int nkpt;
vm_offset_t kernel_vm_end;
+extern u_int32_t KERNend;
/*
* Data for the pv entry allocation mechanism
@@ -257,10 +261,10 @@ static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
vm_offset_t newaddr = addr;
+
#ifndef DISABLE_PSE
- if (cpu_feature & CPUID_PSE) {
+ if (cpu_feature & CPUID_PSE)
newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
- }
#endif
return newaddr;
}
@@ -362,10 +366,9 @@ pmap_bootstrap(firstaddr, loadaddr)
PTD[i] = 0;
pgeflag = 0;
-#if !defined(SMP) /* XXX - see also mp_machdep.c */
- if (cpu_feature & CPUID_PGE) {
+#ifndef DISABLE_PG_G
+ if (cpu_feature & CPUID_PGE)
pgeflag = PG_G;
- }
#endif
/*
@@ -378,7 +381,7 @@ pmap_bootstrap(firstaddr, loadaddr)
*/
pdir4mb = 0;
-#if !defined(DISABLE_PSE)
+#ifndef DISABLE_PSE
if (cpu_feature & CPUID_PSE) {
pd_entry_t ptditmp;
/*
@@ -389,29 +392,16 @@ pmap_bootstrap(firstaddr, loadaddr)
ptditmp &= ~(NBPDR - 1);
ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
pdir4mb = ptditmp;
-
-#if !defined(SMP)
- /*
- * Enable the PSE mode.
- */
- load_cr4(rcr4() | CR4_PSE);
-
- /*
- * We can do the mapping here for the single processor
- * case. We simply ignore the old page table page from
- * now on.
- */
- /*
- * For SMP, we still need 4K pages to bootstrap APs,
- * PSE will be enabled as soon as all APs are up.
- */
- PTD[KPTDI] = (pd_entry_t) ptditmp;
- kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp;
- invltlb();
-#endif
}
#endif
-
+#ifndef SMP
+ /*
+ * Turn on PGE/PSE. SMP does this later on since the
+ * 4K page tables are required for AP boot (for now).
+ * XXX fixme.
+ */
+ pmap_set_opt();
+#endif
#ifdef SMP
if (cpu_apic_address == 0)
panic("pmap_bootstrap: no local apic! (non-SMP hardware?)");
@@ -420,26 +410,55 @@ pmap_bootstrap(firstaddr, loadaddr)
SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag |
(cpu_apic_address & PG_FRAME));
#endif
-
invltlb();
}
-#ifdef SMP
/*
- * Set 4mb pdir for mp startup
+ * Enable 4MB page mode for MP startup. Turn on PG_G support.
+ * BSP will run this after all the AP's have started up.
*/
void
pmap_set_opt(void)
{
+ pt_entry_t *pte;
+ vm_offset_t va, endva;
+
+ if (pgeflag && (cpu_feature & CPUID_PGE)) {
+ load_cr4(rcr4() | CR4_PGE);
+ invltlb(); /* Insurance */
+ }
+#ifndef DISABLE_PSE
if (pseflag && (cpu_feature & CPUID_PSE)) {
load_cr4(rcr4() | CR4_PSE);
- if (pdir4mb && PCPU_GET(cpuid) == 0) { /* only on BSP */
+ invltlb(); /* Insurance */
+ }
+#endif
+ if (PCPU_GET(cpuid) == 0) {
+#ifndef DISABLE_PSE
+ if (pdir4mb) {
kernel_pmap->pm_pdir[KPTDI] = PTD[KPTDI] = pdir4mb;
- cpu_invltlb();
+ invltlb(); /* Insurance */
}
+#endif
+ if (pgeflag) {
+ /* Turn on PG_G for text, data, bss pages. */
+ va = (vm_offset_t)btext;
+ endva = KERNBASE + KERNend;
+ while (va < endva) {
+ pte = vtopte(va);
+ if (*pte)
+ *pte |= pgeflag;
+ va += PAGE_SIZE;
+ }
+ invltlb(); /* Insurance */
+ }
+ /*
+ * We do not need to broadcast the invltlb here, because
+ * each AP does it the moment it is released from the boot
+ * lock. See ap_init().
+ */
}
}
-#endif
void *
pmap_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
@@ -553,43 +572,151 @@ pmap_track_modified(vm_offset_t va)
return 0;
}
-static PMAP_INLINE void
-invltlb_1pg(vm_offset_t va)
-{
#ifdef I386_CPU
- invltlb();
-#else
- invlpg(va);
-#endif
+/*
+ * i386 only has "invalidate everything" and no SMP to worry about.
+ */
+PMAP_INLINE void
+pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
+{
+
+ if (pmap == kernel_pmap || pmap->pm_active)
+ invltlb();
}
-static __inline void
+PMAP_INLINE void
+pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+
+ if (pmap == kernel_pmap || pmap->pm_active)
+ invltlb();
+}
+
+PMAP_INLINE void
+pmap_invalidate_all(pmap_t pmap)
+{
+
+ if (pmap == kernel_pmap || pmap->pm_active)
+ invltlb();
+}
+#else /* !I386_CPU */
+#ifdef SMP
+/*
+ * For SMP, these functions have to use the IPI mechanism for coherence.
+ */
+void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
-#if defined(SMP)
- if (pmap->pm_active & PCPU_GET(cpumask))
- cpu_invlpg((void *)va);
- if (pmap->pm_active & PCPU_GET(other_cpus))
- smp_invltlb();
-#else
- if (pmap->pm_active)
- invltlb_1pg(va);
-#endif
+ u_int cpumask;
+ u_int other_cpus;
+
+ critical_enter();
+ /*
+ * We need to disable interrupt preemption but MUST NOT have
+ * interrupts disabled here.
+ * XXX we may need to hold schedlock to get a coherent pm_active
+ */
+ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
+ invlpg(va);
+ smp_invlpg(va);
+ } else {
+ cpumask = PCPU_GET(cpumask);
+ other_cpus = PCPU_GET(other_cpus);
+ if (pmap->pm_active & cpumask)
+ invlpg(va);
+ if (pmap->pm_active & other_cpus)
+ smp_masked_invlpg(pmap->pm_active & other_cpus, va);
+ }
+ critical_exit();
}
-static __inline void
+void
+pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ u_int cpumask;
+ u_int other_cpus;
+ vm_offset_t addr;
+
+ critical_enter();
+ /*
+ * We need to disable interrupt preemption but MUST NOT have
+ * interrupts disabled here.
+ * XXX we may need to hold schedlock to get a coherent pm_active
+ */
+ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ smp_invlpg_range(sva, eva);
+ } else {
+ cpumask = PCPU_GET(cpumask);
+ other_cpus = PCPU_GET(other_cpus);
+ if (pmap->pm_active & cpumask)
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ if (pmap->pm_active & other_cpus)
+ smp_masked_invlpg_range(pmap->pm_active & other_cpus,
+ sva, eva);
+ }
+ critical_exit();
+}
+
+void
pmap_invalidate_all(pmap_t pmap)
{
-#if defined(SMP)
- if (pmap->pm_active & PCPU_GET(cpumask))
- cpu_invltlb();
- if (pmap->pm_active & PCPU_GET(other_cpus))
+ u_int cpumask;
+ u_int other_cpus;
+
+ critical_enter();
+ /*
+ * We need to disable interrupt preemption but MUST NOT have
+ * interrupts disabled here.
+ * XXX we may need to hold schedlock to get a coherent pm_active
+ */
+ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
+ invltlb();
smp_invltlb();
-#else
- if (pmap->pm_active)
+ } else {
+ cpumask = PCPU_GET(cpumask);
+ other_cpus = PCPU_GET(other_cpus);
+ if (pmap->pm_active & cpumask)
+ invltlb();
+ if (pmap->pm_active & other_cpus)
+ smp_masked_invltlb(pmap->pm_active & other_cpus);
+ }
+ critical_exit();
+}
+#else /* !SMP */
+/*
+ * Normal, non-SMP, 486+ invalidation functions.
+ * We inline these within pmap.c for speed.
+ */
+PMAP_INLINE void
+pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
+{
+
+ if (pmap == kernel_pmap || pmap->pm_active)
+ invlpg(va);
+}
+
+PMAP_INLINE void
+pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ vm_offset_t addr;
+
+ if (pmap == kernel_pmap || pmap->pm_active)
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+}
+
+PMAP_INLINE void
+pmap_invalidate_all(pmap_t pmap)
+{
+
+ if (pmap == kernel_pmap || pmap->pm_active)
invltlb();
-#endif
}
+#endif /* !SMP */
+#endif /* !I386_CPU */
/*
* Return an address which is the base of the Virtual mapping of
@@ -613,12 +740,7 @@ get_ptbase(pmap)
/* otherwise, we are alternate address space */
if (frame != (APTDpde & PG_FRAME)) {
APTDpde = (pd_entry_t) (frame | PG_RW | PG_V);
-#if defined(SMP)
- /* The page directory is not shared between CPUs */
- cpu_invltlb();
-#else
invltlb();
-#endif
}
return APTmap;
}
@@ -647,7 +769,7 @@ pmap_pte_quick(pmap, va)
newpf = pde & PG_FRAME;
if (((*PMAP1) & PG_FRAME) != newpf) {
*PMAP1 = newpf | PG_RW | PG_V;
- invltlb_1pg((vm_offset_t) PADDR1);
+ pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR1);
}
return PADDR1 + (index & (NPTEPG - 1));
}
@@ -692,34 +814,29 @@ pmap_extract(pmap, va)
***************************************************/
/*
- * add a wired page to the kva
- * note that in order for the mapping to take effect -- you
- * should do a invltlb after doing the pmap_kenter...
+ * Add a wired page to the kva.
+ * Note: not SMP coherent.
*/
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_offset_t pa)
{
pt_entry_t *pte;
- pt_entry_t npte, opte;
- npte = pa | PG_RW | PG_V | pgeflag;
pte = vtopte(va);
- opte = *pte;
- *pte = npte;
- invltlb_1pg(va);
+ *pte = pa | PG_RW | PG_V | pgeflag;
}
/*
- * remove a page from the kernel pagetables
+ * Remove a page from the kernel pagetables.
+ * Note: not SMP coherent.
*/
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
- register pt_entry_t *pte;
+ pt_entry_t *pte;
pte = vtopte(va);
*pte = 0;
- invltlb_1pg(va);
}
/*
@@ -737,13 +854,15 @@ pmap_kremove(vm_offset_t va)
vm_offset_t
pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
{
- vm_offset_t sva = *virt;
- vm_offset_t va = sva;
+ vm_offset_t va, sva;
+
+ va = sva = *virt;
while (start < end) {
pmap_kenter(va, start);
va += PAGE_SIZE;
start += PAGE_SIZE;
}
+ pmap_invalidate_range(kernel_pmap, sva, va);
*virt = va;
return (sva);
}
@@ -756,64 +875,45 @@ pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
* page modification or references recorded.
* Note that old mappings are simply written
* over. The page *must* be wired.
+ * Note: SMP coherent. Uses a ranged shootdown IPI.
*/
void
-pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
+pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
{
- vm_offset_t end_va;
-
- end_va = va + count * PAGE_SIZE;
-
- while (va < end_va) {
- pt_entry_t *pte;
+ vm_offset_t va;
- pte = vtopte(va);
- *pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag;
-#ifdef SMP
- cpu_invlpg((void *)va);
-#else
- invltlb_1pg(va);
-#endif
+ va = sva;
+ while (count-- > 0) {
+ pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
va += PAGE_SIZE;
m++;
}
-#ifdef SMP
- smp_invltlb();
-#endif
+ pmap_invalidate_range(kernel_pmap, sva, va);
}
/*
- * this routine jerks page mappings from the
+ * This routine tears out page mappings from the
* kernel -- it is meant only for temporary mappings.
+ * Note: SMP coherent. Uses a ranged shootdown IPI.
*/
void
-pmap_qremove(vm_offset_t va, int count)
+pmap_qremove(vm_offset_t sva, int count)
{
- vm_offset_t end_va;
-
- end_va = va + count*PAGE_SIZE;
-
- while (va < end_va) {
- pt_entry_t *pte;
+ vm_offset_t va;
- pte = vtopte(va);
- *pte = 0;
-#ifdef SMP
- cpu_invlpg((void *)va);
-#else
- invltlb_1pg(va);
-#endif
+ va = sva;
+ while (count-- > 0) {
+ pmap_kremove(va);
va += PAGE_SIZE;
}
-#ifdef SMP
- smp_invltlb();
-#endif
+ pmap_invalidate_range(kernel_pmap, sva, va);
}
static vm_page_t
pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
vm_page_t m;
+
retry:
m = vm_page_lookup(object, pindex);
if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))
@@ -829,14 +929,11 @@ retry:
void
pmap_new_thread(struct thread *td)
{
-#ifdef I386_CPU
- int updateneeded = 0;
-#endif
int i;
+ vm_page_t ma[KSTACK_PAGES];
vm_object_t ksobj;
vm_page_t m;
vm_offset_t ks;
- pt_entry_t *ptek, oldpte;
/*
* allocate object for the kstack
@@ -844,39 +941,21 @@ pmap_new_thread(struct thread *td)
ksobj = vm_object_allocate(OBJT_DEFAULT, KSTACK_PAGES);
td->td_kstack_obj = ksobj;
-#ifdef KSTACK_GUARD
/* get a kernel virtual address for the kstack for this thread */
+#ifdef KSTACK_GUARD
ks = kmem_alloc_nofault(kernel_map, (KSTACK_PAGES + 1) * PAGE_SIZE);
if (ks == 0)
panic("pmap_new_thread: kstack allocation failed");
-
- /*
- * Set the first page to be the unmapped guard page.
- */
- ptek = vtopte(ks);
- oldpte = *ptek;
- *ptek = 0;
- if (oldpte) {
-#ifdef I386_CPU
- updateneeded = 1;
-#else
- invlpg(ks);
-#endif
- }
-
- /*
- * move to the next page, which is where the real stack starts.
- */
+ if (*vtopte(ks) != 0)
+ pmap_qremove(ks, 1);
ks += PAGE_SIZE;
td->td_kstack = ks;
- ptek++;
#else
/* get a kernel virtual address for the kstack for this thread */
ks = kmem_alloc_nofault(kernel_map, KSTACK_PAGES * PAGE_SIZE);
if (ks == 0)
panic("pmap_new_thread: kstack allocation failed");
td->td_kstack = ks;
- ptek = vtopte(ks);
#endif
/*
* For the length of the stack, link in a real page of ram for each
@@ -887,6 +966,7 @@ pmap_new_thread(struct thread *td)
* Get a kernel stack page
*/
m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+ ma[i] = m;
/*
* Wire the page
@@ -894,28 +974,12 @@ pmap_new_thread(struct thread *td)
m->wire_count++;
cnt.v_wire_count++;
- /*
- * Enter the page into the kernel address space.
- */
- oldpte = ptek[i];
- ptek[i] = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
- if (oldpte) {
-#ifdef I386_CPU
- updateneeded = 1;
-#else
- invlpg(ks + (i * PAGE_SIZE));
-#endif
- }
-
vm_page_wakeup(m);
vm_page_flag_clear(m, PG_ZERO);
vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
m->valid = VM_PAGE_BITS_ALL;
}
-#ifdef I386_CPU
- if (updateneeded)
- invltlb();
-#endif
+ pmap_qenter(ks, ma, KSTACK_PAGES);
}
/*
@@ -930,26 +994,18 @@ pmap_dispose_thread(td)
vm_object_t ksobj;
vm_offset_t ks;
vm_page_t m;
- pt_entry_t *ptek;
ksobj = td->td_kstack_obj;
ks = td->td_kstack;
- ptek = vtopte(ks);
+ pmap_qremove(ks, KSTACK_PAGES);
for (i = 0; i < KSTACK_PAGES; i++) {
m = vm_page_lookup(ksobj, i);
if (m == NULL)
panic("pmap_dispose_thread: kstack already missing?");
vm_page_busy(m);
- ptek[i] = 0;
-#ifndef I386_CPU
- invlpg(ks + (i * PAGE_SIZE));
-#endif
vm_page_unwire(m, 0);
vm_page_free(m);
}
-#ifdef I386_CPU
- invltlb();
-#endif
/*
* Free the space that this stack was mapped to in the kernel
* address map.
@@ -976,13 +1032,13 @@ pmap_swapout_thread(td)
ksobj = td->td_kstack_obj;
ks = td->td_kstack;
+ pmap_qremove(ks, KSTACK_PAGES);
for (i = 0; i < KSTACK_PAGES; i++) {
m = vm_page_lookup(ksobj, i);
if (m == NULL)
panic("pmap_swapout_thread: kstack already missing?");
vm_page_dirty(m);
vm_page_unwire(m, 0);
- pmap_kremove(ks + i * PAGE_SIZE);
}
}
@@ -994,6 +1050,7 @@ pmap_swapin_thread(td)
struct thread *td;
{
int i, rv;
+ vm_page_t ma[KSTACK_PAGES];
vm_object_t ksobj;
vm_offset_t ks;
vm_page_t m;
@@ -1002,7 +1059,6 @@ pmap_swapin_thread(td)
ks = td->td_kstack;
for (i = 0; i < KSTACK_PAGES; i++) {
m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
- pmap_kenter(ks + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m));
if (m->valid != VM_PAGE_BITS_ALL) {
rv = vm_pager_get_pages(ksobj, &m, 1, 0);
if (rv != VM_PAGER_OK)
@@ -1010,10 +1066,12 @@ pmap_swapin_thread(td)
m = vm_page_lookup(ksobj, i);
m->valid = VM_PAGE_BITS_ALL;
}
+ ma[i] = m;
vm_page_wire(m);
vm_page_wakeup(m);
vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
}
+ pmap_qenter(ks, ma, KSTACK_PAGES);
}
/***************************************************
@@ -1108,7 +1166,8 @@ pmap_pinit0(pmap)
{
pmap->pm_pdir =
(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
- pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD);
+ pmap_kenter((vm_offset_t)pmap->pm_pdir, (vm_offset_t)IdlePTD);
+ invlpg((vm_offset_t)pmap->pm_pdir);
pmap->pm_ptphint = NULL;
pmap->pm_active = 0;
TAILQ_INIT(&pmap->pm_pvlist);
@@ -1153,7 +1212,7 @@ pmap_pinit(pmap)
vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/
ptdpg->valid = VM_PAGE_BITS_ALL;
- pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
+ pmap_qenter((vm_offset_t) pmap->pm_pdir, &ptdpg, 1);
if ((ptdpg->flags & PG_ZERO) == 0)
bzero(pmap->pm_pdir, PAGE_SIZE);
@@ -1616,7 +1675,7 @@ pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
* PG_G.
*/
if (oldpte & PG_G)
- invlpg(va);
+ pmap_invalidate_page(kernel_pmap, va);
pmap->pm_stats.resident_count -= 1;
if (oldpte & PG_MANAGED) {
m = PHYS_TO_VM_PAGE(oldpte);
@@ -2028,13 +2087,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
if ((origpte & PG_RW) == 0) {
*pte |= PG_RW;
-#ifdef SMP
- cpu_invlpg((void *)va);
- if (pmap->pm_active & PCPU_GET(other_cpus))
- smp_invltlb();
-#else
- invltlb_1pg(va);
-#endif
+ pmap_invalidate_page(pmap, va);
}
return;
}
@@ -2102,13 +2155,7 @@ validate:
if ((origpte & ~(PG_M|PG_A)) != newpte) {
*pte = newpte | PG_A;
/*if (origpte)*/ {
-#ifdef SMP
- cpu_invlpg((void *)va);
- if (pmap->pm_active & PCPU_GET(other_cpus))
- smp_invltlb();
-#else
- invltlb_1pg(va);
-#endif
+ pmap_invalidate_page(pmap, va);
}
}
}
@@ -2222,7 +2269,11 @@ retry:
void *
pmap_kenter_temporary(vm_offset_t pa, int i)
{
- pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
+ vm_offset_t va;
+
+ va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
+ pmap_kenter(va, pa);
+ invlpg(va);
return ((void *)crashdumpmap);
}
@@ -2527,7 +2578,6 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
vm_offset_t pdnxt;
pd_entry_t src_frame, dst_frame;
vm_page_t m;
- pd_entry_t saved_pde;
if (dst_addr != src_addr)
return;
@@ -2537,17 +2587,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
return;
dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
- if (dst_frame != (APTDpde & PG_FRAME)) {
- APTDpde = dst_frame | PG_RW | PG_V;
-#if defined(SMP)
- /* The page directory is not shared between CPUs */
- cpu_invltlb();
-#else
- invltlb();
-#endif
- }
- saved_pde = APTDpde & (PG_FRAME | PG_RW | PG_V);
- for(addr = src_addr; addr < end_addr; addr = pdnxt) {
+ for (addr = src_addr; addr < end_addr; addr = pdnxt) {
pt_entry_t *src_pte, *dst_pte;
vm_page_t dstmpte, srcmpte;
pd_entry_t srcptepaddr;
@@ -2588,6 +2628,14 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
if (pdnxt > end_addr)
pdnxt = end_addr;
+ /*
+ * Have to recheck this before every avtopte() call below
+ * in case we have blocked and something else used APTDpde.
+ */
+ if (dst_frame != (APTDpde & PG_FRAME)) {
+ APTDpde = dst_frame | PG_RW | PG_V;
+ invltlb();
+ }
src_pte = vtopte(addr);
dst_pte = avtopte(addr);
while (addr < pdnxt) {
@@ -2603,16 +2651,6 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
* block.
*/
dstmpte = pmap_allocpte(dst_pmap, addr);
- if ((APTDpde & PG_FRAME) !=
- (saved_pde & PG_FRAME)) {
- APTDpde = saved_pde;
-printf ("IT HAPPENNED!");
-#if defined(SMP)
- cpu_invltlb();
-#else
- invltlb();
-#endif
- }
if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
/*
* Clear the modified and
@@ -2644,14 +2682,13 @@ printf ("IT HAPPENNED!");
void
pmap_zero_page(vm_page_t m)
{
- vm_offset_t phys = VM_PAGE_TO_PHYS(m);
+ vm_offset_t phys;
+ phys = VM_PAGE_TO_PHYS(m);
if (*CMAP2)
panic("pmap_zero_page: CMAP2 busy");
-
*CMAP2 = PG_V | PG_RW | phys | PG_A | PG_M;
- invltlb_1pg((vm_offset_t)CADDR2);
-
+ pmap_invalidate_page(kernel_pmap, (vm_offset_t)CADDR2);
#if defined(I686_CPU)
if (cpu_class == CPUCLASS_686)
i686_pagezero(CADDR2);
@@ -2670,14 +2707,13 @@ pmap_zero_page(vm_page_t m)
void
pmap_zero_page_area(vm_page_t m, int off, int size)
{
- vm_offset_t phys = VM_PAGE_TO_PHYS(m);
+ vm_offset_t phys;
+ phys = VM_PAGE_TO_PHYS(m);
if (*CMAP2)
panic("pmap_zero_page: CMAP2 busy");
-
*CMAP2 = PG_V | PG_RW | phys | PG_A | PG_M;
- invltlb_1pg((vm_offset_t)CADDR2);
-
+ pmap_invalidate_page(kernel_pmap, (vm_offset_t)CADDR2);
#if defined(I686_CPU)
if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE)
i686_pagezero(CADDR2);
@@ -2696,20 +2732,13 @@ pmap_zero_page_area(vm_page_t m, int off, int size)
void
pmap_zero_page_idle(vm_page_t m)
{
- vm_offset_t phys = VM_PAGE_TO_PHYS(m);
+ vm_offset_t phys;
+ phys = VM_PAGE_TO_PHYS(m);
if (*CMAP3)
panic("pmap_zero_page: CMAP3 busy");
-
*CMAP3 = PG_V | PG_RW | phys | PG_A | PG_M;
-#ifdef SMP
- mtx_lock(&Giant); /* IPI sender not MPSAFE */
-#endif
- invltlb_1pg((vm_offset_t)CADDR3);
-#ifdef SMP
- mtx_unlock(&Giant);
-#endif
-
+ invlpg((vm_offset_t)CADDR3); /* SMP: local cpu only */
#if defined(I686_CPU)
if (cpu_class == CPUCLASS_686)
i686_pagezero(CADDR3);
@@ -2733,18 +2762,15 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
panic("pmap_copy_page: CMAP1 busy");
if (*CMAP2)
panic("pmap_copy_page: CMAP2 busy");
-
*CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
-#ifdef I386_CPU
- invltlb();
-#else
- invlpg((u_int)CADDR1);
- invlpg((u_int)CADDR2);
-#endif
-
+ /*
+ * XXX we "know" that CADDR2 immediately follows CADDR1 and use
+ * that to save an IPI on SMP systems.
+ */
+ pmap_invalidate_range(kernel_pmap, (vm_offset_t)CADDR1,
+ (vm_offset_t)CADDR2 + PAGE_SIZE);
bcopy(CADDR1, CADDR2, PAGE_SIZE);
-
*CMAP1 = 0;
*CMAP2 = 0;
}
@@ -3176,18 +3202,11 @@ pmap_mapdev(pa, size)
for (tmpva = va; size > 0; ) {
pte = vtopte(tmpva);
*pte = pa | PG_RW | PG_V | pgeflag;
-#ifdef SMP
- cpu_invlpg((void *)tmpva);
-#else
- invltlb_1pg(tmpva);
-#endif
size -= PAGE_SIZE;
tmpva += PAGE_SIZE;
pa += PAGE_SIZE;
}
-#ifdef SMP
- smp_invltlb();
-#endif
+ pmap_invalidate_range(kernel_pmap, va, tmpva);
return ((void *)(va + offset));
}
@@ -3205,15 +3224,8 @@ pmap_unmapdev(va, size)
for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
pte = vtopte(tmpva);
*pte = 0;
-#ifdef SMP
- cpu_invlpg((void *)tmpva);
-#else
- invltlb_1pg(tmpva);
-#endif
}
-#ifdef SMP
- smp_invltlb();
-#endif
+ pmap_invalidate_range(kernel_pmap, va, tmpva);
kmem_free(kernel_map, base, size);
}
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index c1f38995f135..23c611cfbd25 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -1596,42 +1596,6 @@ ENTRY(ssdtosd)
popl %ebx
ret
-/* load_cr0(cr0) */
-ENTRY(load_cr0)
- movl 4(%esp),%eax
- movl %eax,%cr0
- ret
-
-/* rcr0() */
-ENTRY(rcr0)
- movl %cr0,%eax
- ret
-
-/* rcr3() */
-ENTRY(rcr3)
- movl %cr3,%eax
- ret
-
-/* void load_cr3(caddr_t cr3) */
-ENTRY(load_cr3)
-#ifdef SWTCH_OPTIM_STATS
- incl tlb_flush_count
-#endif
- movl 4(%esp),%eax
- movl %eax,%cr3
- ret
-
-/* rcr4() */
-ENTRY(rcr4)
- movl %cr4,%eax
- ret
-
-/* void load_cr4(caddr_t cr4) */
-ENTRY(load_cr4)
- movl 4(%esp),%eax
- movl %eax,%cr4
- ret
-
/* void reset_dbregs() */
ENTRY(reset_dbregs)
movl $0,%eax
diff --git a/sys/amd64/amd64/support.s b/sys/amd64/amd64/support.s
index c1f38995f135..23c611cfbd25 100644
--- a/sys/amd64/amd64/support.s
+++ b/sys/amd64/amd64/support.s
@@ -1596,42 +1596,6 @@ ENTRY(ssdtosd)
popl %ebx
ret
-/* load_cr0(cr0) */
-ENTRY(load_cr0)
- movl 4(%esp),%eax
- movl %eax,%cr0
- ret
-
-/* rcr0() */
-ENTRY(rcr0)
- movl %cr0,%eax
- ret
-
-/* rcr3() */
-ENTRY(rcr3)
- movl %cr3,%eax
- ret
-
-/* void load_cr3(caddr_t cr3) */
-ENTRY(load_cr3)
-#ifdef SWTCH_OPTIM_STATS
- incl tlb_flush_count
-#endif
- movl 4(%esp),%eax
- movl %eax,%cr3
- ret
-
-/* rcr4() */
-ENTRY(rcr4)
- movl %cr4,%eax
- ret
-
-/* void load_cr4(caddr_t cr4) */
-ENTRY(load_cr4)
- movl 4(%esp),%eax
- movl %eax,%cr4
- ret
-
/* void reset_dbregs() */
ENTRY(reset_dbregs)
movl $0,%eax
diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h
index 2e64138de4c7..0896659c864e 100644
--- a/sys/amd64/include/cpufunc.h
+++ b/sys/amd64/include/cpufunc.h
@@ -237,62 +237,6 @@ invd(void)
__asm __volatile("invd");
}
-#if defined(SMP) && defined(_KERNEL)
-
-/*
- * When using APIC IPI's, invlpg() is not simply the invlpg instruction
- * (this is a bug) and the inlining cost is prohibitive since the call
- * executes into the IPI transmission system.
- */
-void invlpg(u_int addr);
-void invltlb(void);
-
-static __inline void
-cpu_invlpg(void *addr)
-{
- __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
-}
-
-static __inline void
-cpu_invltlb(void)
-{
- u_int temp;
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3()
- * is inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp)
- : : "memory");
-#if defined(SWTCH_OPTIM_STATS)
- ++tlb_flush_count;
-#endif
-}
-
-#else /* !(SMP && _KERNEL) */
-
-static __inline void
-invlpg(u_int addr)
-{
- __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
-}
-
-static __inline void
-invltlb(void)
-{
- u_int temp;
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3()
- * is inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp)
- : : "memory");
-#ifdef SWTCH_OPTIM_STATS
- ++tlb_flush_count;
-#endif
-}
-
-#endif /* SMP && _KERNEL */
-
static __inline u_short
inw(u_int port)
{
@@ -364,15 +308,6 @@ ia32_pause(void)
}
static __inline u_int
-rcr2(void)
-{
- u_int data;
-
- __asm __volatile("movl %%cr2,%0" : "=r" (data));
- return (data);
-}
-
-static __inline u_int
read_eflags(void)
{
u_int ef;
@@ -426,6 +361,86 @@ wrmsr(u_int msr, u_int64_t newval)
__asm __volatile("wrmsr" : : "A" (newval), "c" (msr));
}
+static __inline void
+load_cr0(u_int data)
+{
+
+ __asm __volatile("movl %0,%%cr0" : : "r" (data));
+}
+
+static __inline u_int
+rcr0(void)
+{
+ u_int data;
+
+ __asm __volatile("movl %%cr0,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline u_int
+rcr2(void)
+{
+ u_int data;
+
+ __asm __volatile("movl %%cr2,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_cr3(u_int data)
+{
+
+ __asm __volatile("movl %0,%%cr3" : : "r" (data) : "memory");
+#if defined(SWTCH_OPTIM_STATS)
+ ++tlb_flush_count;
+#endif
+}
+
+static __inline u_int
+rcr3(void)
+{
+ u_int data;
+
+ __asm __volatile("movl %%cr3,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_cr4(u_int data)
+{
+ __asm __volatile("movl %0,%%cr4" : : "r" (data));
+}
+
+static __inline u_int
+rcr4(void)
+{
+ u_int data;
+
+ __asm __volatile("movl %%cr4,%0" : "=r" (data));
+ return (data);
+}
+
+/*
+ * Global TLB flush (except for thise for pages marked PG_G)
+ */
+static __inline void
+invltlb(void)
+{
+
+ load_cr3(rcr3());
+}
+
+/*
+ * TLB flush for an individual page (even if it has PG_G).
+ * Only works on 486+ CPUs (i386 does not have PG_G).
+ */
+static __inline void
+invlpg(u_int addr)
+{
+
+ __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
+}
+
static __inline u_int
rfs(void)
{
@@ -587,6 +602,8 @@ intr_restore(register_t eflags)
int breakpoint(void);
u_int bsfl(u_int mask);
u_int bsrl(u_int mask);
+void cpu_invlpg(u_int addr);
+void cpu_invlpg_range(u_int start, u_int end);
void disable_intr(void);
void do_cpuid(u_int ax, u_int *p);
void enable_intr(void);
@@ -597,8 +614,14 @@ void insl(u_int port, void *addr, size_t cnt);
void insw(u_int port, void *addr, size_t cnt);
void invd(void);
void invlpg(u_int addr);
+void invlpg_range(u_int start, u_int end);
void invltlb(void);
u_short inw(u_int port);
+void load_cr0(u_int cr0);
+void load_cr3(u_int cr3);
+void load_cr4(u_int cr4);
+void load_fs(u_int sel);
+void load_gs(u_int sel);
void outb(u_int port, u_char data);
void outl(u_int port, u_int data);
void outsb(u_int port, void *addr, size_t cnt);
@@ -606,7 +629,12 @@ void outsl(u_int port, void *addr, size_t cnt);
void outsw(u_int port, void *addr, size_t cnt);
void outw(u_int port, u_short data);
void ia32_pause(void);
+u_int rcr0(void);
u_int rcr2(void);
+u_int rcr3(void);
+u_int rcr4(void);
+u_int rfs(void);
+u_int rgs(void);
u_int64_t rdmsr(u_int msr);
u_int64_t rdpmc(u_int pmc);
u_int64_t rdtsc(void);
@@ -614,10 +642,6 @@ u_int read_eflags(void);
void wbinvd(void);
void write_eflags(u_int ef);
void wrmsr(u_int msr, u_int64_t newval);
-u_int rfs(void);
-u_int rgs(void);
-void load_fs(u_int sel);
-void load_gs(u_int sel);
u_int rdr0(void);
void load_dr0(u_int dr0);
u_int rdr1(void);
@@ -639,13 +663,7 @@ void intr_restore(register_t ef);
#endif /* __GNUC__ */
-void load_cr0(u_int cr0);
-void load_cr3(u_int cr3);
-void load_cr4(u_int cr4);
void ltr(u_short sel);
-u_int rcr0(void);
-u_int rcr3(void);
-u_int rcr4(void);
void reset_dbregs(void);
__END_DECLS
diff --git a/sys/amd64/include/mptable.h b/sys/amd64/include/mptable.h
index 63fec0e9e9c9..29e9c6eb56fb 100644
--- a/sys/amd64/include/mptable.h
+++ b/sys/amd64/include/mptable.h
@@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt;
struct pcb stoppcbs[MAXCPU];
+#ifdef APIC_IO
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1;
+vm_offset_t smp_tlb_addr2;
+volatile int smp_tlb_wait;
+static struct mtx smp_tlb_mtx;
+#endif
+
/*
* Local data and functions.
*/
@@ -336,6 +344,9 @@ init_locks(void)
#ifdef USE_COMLOCK
mtx_init(&com_mtx, "com", NULL, MTX_SPIN);
#endif /* USE_COMLOCK */
+#ifdef APIC_IO
+ mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN);
+#endif
}
/*
@@ -605,6 +616,10 @@ mp_enable(u_int boot_addr)
/* install an inter-CPU IPI for TLB invalidation */
setidt(XINVLTLB_OFFSET, Xinvltlb,
SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLPG_OFFSET, Xinvlpg,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLRNG_OFFSET, Xinvlrng,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
/* install an inter-CPU IPI for forwarding hardclock() */
setidt(XHARDCLOCK_OFFSET, Xhardclock,
@@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr)
return 0; /* return FAILURE */
}
-#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS)
-u_int xhits[MAXCPU];
-SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits),
- "IU", "");
+#if defined(APIC_IO)
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+ sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+ sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+ sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+ 0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+ &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+ &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+ &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+ &ipi_masked_range_size, 0, "");
#endif
/*
* Flush the TLB on all other CPU's
+ */
+static void
+smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ u_int ncpu;
+ register_t eflags;
+
+ ncpu = mp_ncpus - 1; /* does not shootdown self */
+ if (ncpu < 1)
+ return; /* no other cpus */
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ ipi_all_but_self(vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
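The handshake above has two halves: the initiator publishes smp_tlb_addr1/addr2, resets smp_tlb_wait with a release store, fires the IPI, and spins with ia32_pause() until every other CPU has bumped the counter. The receiving half lives in the Xinvltlb/Xinvlpg/Xinvlrng assembly stubs added later in this patch; the fragment below is only an illustrative C sketch of what each stub does (the helper name is hypothetical, and the real stubs also write the local APIC EOI and use a plain "lock incl").

	/* Illustrative only; not code from the commit. */
	static void
	tlb_shootdown_ack(void)
	{
		invltlb();	/* or invlpg()/a per-page loop, per vector */
		atomic_add_int((volatile u_int *)&smp_tlb_wait, 1);	/* releases the initiator's spin */
	}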
+
+/*
+ * This is about as magic as it gets. fortune(1) has got similar code
+ * for reversing bits in a word. Who thinks up this stuff??
+ *
+ * Yes, it does appear to be consistently faster than:
+ * while (i = ffs(m)) {
+ * m >>= i;
+ * bits++;
+ * }
+ * and
+ * while (lsb = (m & -m)) { // This is magic too
+ * m &= ~lsb; // or: m ^= lsb
+ * bits++;
+ * }
+ * Both of these latter forms do some very strange things on gcc-3.1 with
+ * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
+ * There is probably an SSE or MMX popcnt instruction.
*
- * XXX: Needs to handshake and wait for completion before proceding.
+ * I wonder if this should be in libkern?
+ *
+ * XXX Stop the presses! Another one:
+ * static __inline u_int32_t
+ * popcnt1(u_int32_t v)
+ * {
+ * v -= ((v >> 1) & 0x55555555);
+ * v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+ * v = (v + (v >> 4)) & 0x0F0F0F0F;
+ * return (v * 0x01010101) >> 24;
+ * }
+ * The downside is that it has a multiply. With a pentium3 with
+ * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
+ * an imull, and in that case it is faster. In most other cases
+ * it appears slightly slower.
*/
+static __inline u_int32_t
+popcnt(u_int32_t m)
+{
+
+ m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1);
+ m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2);
+ m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4);
+ m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8);
+ m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16);
+ return m;
+}
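As a concrete check of the pairwise-summing trick (a worked example, not code from the commit): for mask 0xd, i.e. CPUs 0, 2 and 3, the five steps yield 0x9, 0x3, 0x3, 0x3 and finally 3, matching the three bits set. smp_targeted_tlb_shootdown() below uses the result only as the number of acknowledgements to wait for.

	/* Hypothetical caller in the style of smp_targeted_tlb_shootdown(): */
	u_int mask = 0x0000000d;	/* CPUs 0, 2 and 3 */
	int ncpu = popcnt(mask);	/* 0xd -> 0x9 -> 0x3 -> 0x3 -> 0x3 -> 3 */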
+
+static void
+smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ int ncpu, othercpus;
+ register_t eflags;
+
+ othercpus = mp_ncpus - 1;
+ if (mask == (u_int)-1) {
+ ncpu = othercpus;
+ if (ncpu < 1)
+ return;
+ } else {
+ /* XXX there should be a pcpu self mask */
+ mask &= ~(1 << PCPU_GET(cpuid));
+ if (mask == 0)
+ return;
+ ncpu = popcnt(mask);
+ if (ncpu > othercpus) {
+ /* XXX this should be a panic offence */
+ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
+ ncpu, othercpus);
+ ncpu = othercpus;
+ }
+ /* XXX should be a panic, implied by mask == 0 above */
+ if (ncpu < 1)
+ return;
+ }
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ if (mask == (u_int)-1)
+ ipi_all_but_self(vector);
+ else
+ ipi_selected(mask, vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+#endif
+
void
smp_invltlb(void)
{
#if defined(APIC_IO)
- if (smp_started)
- ipi_all_but_self(IPI_INVLTLB);
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_global++;
+#endif
+ }
#endif /* APIC_IO */
}
void
-invlpg(u_int addr)
+smp_invlpg(vm_offset_t addr)
{
- __asm __volatile("invlpg (%0)"::"r"(addr):"memory");
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_range++;
+ ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
void
-invltlb(void)
+smp_masked_invltlb(u_int mask)
{
- u_long temp;
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_global++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3() is
- * inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
+void
+smp_masked_invlpg(u_int mask, vm_offset_t addr)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_range++;
+ ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
@@ -2251,7 +2455,7 @@ ap_init(void)
/* spin */ ;
/* BSP may have changed PTD while we were waiting */
- cpu_invltlb();
+ invltlb();
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
lidt(&r_idt);
@@ -2290,6 +2494,9 @@ ap_init(void)
/* Build our map of 'other' CPUs. */
PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
+ if (bootverbose)
+ apic_dump("ap_init()");
+
printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
if (smp_cpus == mp_ncpus) {
@@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame)
{
mtx_lock_spin(&sched_lock);
- statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame));
+ statclock_process(curthread->td_kse, TRAPF_PC(&frame),
+ TRAPF_USERMODE(&frame));
mtx_unlock_spin(&sched_lock);
}
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index e6ac669aeaa8..e0789fc68f43 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -151,7 +151,7 @@ extern pt_entry_t PTmap[], APTmap[];
extern pd_entry_t PTD[], APTD[];
extern pd_entry_t PTDpde, APTDpde;
-extern pd_entry_t IdlePTD; /* physical address of "Idle" state directory */
+extern pd_entry_t *IdlePTD; /* physical address of "Idle" state directory */
#endif
#ifdef _KERNEL
@@ -253,14 +253,15 @@ extern char *ptvmmap; /* poor name! */
extern vm_offset_t virtual_avail;
extern vm_offset_t virtual_end;
-void pmap_bootstrap( vm_offset_t, vm_offset_t);
+void pmap_bootstrap(vm_offset_t, vm_offset_t);
void *pmap_mapdev(vm_offset_t, vm_size_t);
void pmap_unmapdev(vm_offset_t, vm_size_t);
pt_entry_t *pmap_pte(pmap_t, vm_offset_t) __pure2;
vm_page_t pmap_use_pt(pmap_t, vm_offset_t);
-#ifdef SMP
void pmap_set_opt(void);
-#endif
+void pmap_invalidate_page(pmap_t, vm_offset_t);
+void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t);
+void pmap_invalidate_all(pmap_t);
#endif /* _KERNEL */
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 872c5eca1615..d669c51056b7 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -51,6 +51,8 @@ extern int current_postcode; /** XXX currently in mp_machdep.c */
* Interprocessor interrupts for SMP.
*/
#define IPI_INVLTLB XINVLTLB_OFFSET
+#define IPI_INVLPG XINVLPG_OFFSET
+#define IPI_INVLRNG XINVLRNG_OFFSET
#define IPI_RENDEZVOUS XRENDEZVOUS_OFFSET
#define IPI_AST XCPUAST_OFFSET
#define IPI_STOP XCPUSTOP_OFFSET
@@ -107,7 +109,6 @@ void assign_apic_irq(int apic, int intpin, int irq);
void revoke_apic_irq(int irq);
void bsp_apic_configure(void);
void init_secondary(void);
-void smp_invltlb(void);
void forward_statclock(void);
void forwarded_statclock(struct trapframe frame);
void forward_hardclock(void);
@@ -119,6 +120,13 @@ void ipi_self(u_int ipi);
#ifdef APIC_INTR_REORDER
void set_lapic_isrloc(int, int);
#endif /* APIC_INTR_REORDER */
+void smp_invlpg(vm_offset_t addr);
+void smp_masked_invlpg(u_int mask, vm_offset_t addr);
+void smp_invlpg_range(vm_offset_t startva, vm_offset_t endva);
+void smp_masked_invlpg_range(u_int mask, vm_offset_t startva,
+ vm_offset_t endva);
+void smp_invltlb(void);
+void smp_masked_invltlb(u_int mask);
/* global data in mpapic.c */
extern volatile lapic_t lapic;
diff --git a/sys/amd64/isa/intr_machdep.h b/sys/amd64/isa/intr_machdep.h
index 41542d0bb34c..7179268ba6a4 100644
--- a/sys/amd64/isa/intr_machdep.h
+++ b/sys/amd64/isa/intr_machdep.h
@@ -88,6 +88,7 @@
/* IDT vector base for regular (aka. slow) and fast interrupts */
#define TPR_SLOW_INTS 0x20
#define TPR_FAST_INTS 0x60
+/* XXX note that the AST interrupt is at 0x50 */
/* blocking values for local APIC Task Priority Register */
#define TPR_BLOCK_HWI 0x4f /* hardware INTs */
@@ -104,20 +105,23 @@
#endif /** TEST_TEST1 */
/* TLB shootdowns */
-#define XINVLTLB_OFFSET (ICU_OFFSET + 112)
+#define XINVLTLB_OFFSET (ICU_OFFSET + 112) /* 0x90 */
+#define XINVLPG_OFFSET (ICU_OFFSET + 113) /* 0x91 */
+#define XINVLRNG_OFFSET (ICU_OFFSET + 114) /* 0x92 */
/* inter-cpu clock handling */
-#define XHARDCLOCK_OFFSET (ICU_OFFSET + 113)
-#define XSTATCLOCK_OFFSET (ICU_OFFSET + 114)
+#define XHARDCLOCK_OFFSET (ICU_OFFSET + 120) /* 0x98 */
+#define XSTATCLOCK_OFFSET (ICU_OFFSET + 121) /* 0x99 */
/* inter-CPU rendezvous */
-#define XRENDEZVOUS_OFFSET (ICU_OFFSET + 115)
+#define XRENDEZVOUS_OFFSET (ICU_OFFSET + 122) /* 0x9A */
/* IPI to generate an additional software trap at the target CPU */
-#define XCPUAST_OFFSET (ICU_OFFSET + 48)
+/* XXX in the middle of the interrupt range, overlapping IRQ48 */
+#define XCPUAST_OFFSET (ICU_OFFSET + 48) /* 0x50 */
/* IPI to signal CPUs to stop and wait for another CPU to restart them */
-#define XCPUSTOP_OFFSET (ICU_OFFSET + 128)
+#define XCPUSTOP_OFFSET (ICU_OFFSET + 128) /* 0xA0 */
/*
* Note: this vector MUST be xxxx1111, 32 + 223 = 255 = 0xff:
@@ -194,7 +198,9 @@ inthand_t
IDTVEC(intr28), IDTVEC(intr29), IDTVEC(intr30), IDTVEC(intr31);
inthand_t
- Xinvltlb, /* TLB shootdowns */
+ Xinvltlb, /* TLB shootdowns - global */
+ Xinvlpg, /* TLB shootdowns - 1 page */
+ Xinvlrng, /* TLB shootdowns - page range */
Xhardclock, /* Forward hardclock() */
Xstatclock, /* Forward statclock() */
Xcpuast, /* Additional software trap on other cpu */
diff --git a/sys/conf/options.i386 b/sys/conf/options.i386
index 9f0d22d80f1d..ad82c05deb3e 100644
--- a/sys/conf/options.i386
+++ b/sys/conf/options.i386
@@ -1,10 +1,11 @@
# $FreeBSD$
# Options specific to the i386 platform kernels
-DISABLE_PSE
MATH_EMULATE opt_math_emulate.h
GPL_MATH_EMULATE opt_math_emulate.h
+DISABLE_PSE opt_pmap.h
PMAP_SHPGPERPROC opt_pmap.h
+DISABLE_PG_G opt_pmap.h
PPC_PROBE_CHIPSET opt_ppc.h
PPC_DEBUG opt_ppc.h
SHOW_BUSYBUFS
diff --git a/sys/conf/options.pc98 b/sys/conf/options.pc98
index 49325cbb4187..ed2e2c636d16 100644
--- a/sys/conf/options.pc98
+++ b/sys/conf/options.pc98
@@ -1,10 +1,11 @@
# $FreeBSD$
# Options specific to the pc98 platform kernels
-DISABLE_PSE
MATH_EMULATE opt_math_emulate.h
GPL_MATH_EMULATE opt_math_emulate.h
+DISABLE_PSE opt_pmap.h
PMAP_SHPGPERPROC opt_pmap.h
+DISABLE_PG_G opt_pmap.h
PPC_PROBE_CHIPSET opt_ppc.h
PPC_DEBUG opt_ppc.h
SHOW_BUSYBUFS
diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s
index 8490b1b14ca5..569ed501a468 100644
--- a/sys/i386/i386/apic_vector.s
+++ b/sys/i386/i386/apic_vector.s
@@ -260,30 +260,107 @@ Xspuriousint:
iret
/*
- * Handle TLB shootdowns.
+ * Global address space TLB shootdown.
*/
.text
SUPERALIGN_TEXT
.globl Xinvltlb
Xinvltlb:
pushl %eax
+ pushl %ds
+ movl $KDSEL, %eax /* Kernel data selector */
+ mov %ax, %ds
#ifdef COUNT_XINVLTLB_HITS
pushl %fs
- movl $KPSEL, %eax
+ movl $KPSEL, %eax /* Private space selector */
mov %ax, %fs
movl PCPU(CPUID), %eax
popl %fs
- ss
- incl xhits(,%eax,4)
+ incl xhits_gbl(,%eax,4)
#endif /* COUNT_XINVLTLB_HITS */
movl %cr3, %eax /* invalidate the TLB */
movl %eax, %cr3
- ss /* stack segment, avoid %ds load */
movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */
+ lock
+ incl smp_tlb_wait
+
+ popl %ds
+ popl %eax
+ iret
+
+/*
+ * Single page TLB shootdown
+ */
+ .text
+ SUPERALIGN_TEXT
+ .globl Xinvlpg
+Xinvlpg:
+ pushl %eax
+ pushl %ds
+ movl $KDSEL, %eax /* Kernel data selector */
+ mov %ax, %ds
+
+#ifdef COUNT_XINVLTLB_HITS
+ pushl %fs
+ movl $KPSEL, %eax /* Private space selector */
+ mov %ax, %fs
+ movl PCPU(CPUID), %eax
+ popl %fs
+ incl xhits_pg(,%eax,4)
+#endif /* COUNT_XINVLTLB_HITS */
+
+ movl smp_tlb_addr1, %eax
+ invlpg (%eax) /* invalidate single page */
+
+ movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */
+
+ lock
+ incl smp_tlb_wait
+
+ popl %ds
+ popl %eax
+ iret
+
+/*
+ * Page range TLB shootdown.
+ */
+ .text
+ SUPERALIGN_TEXT
+ .globl Xinvlrng
+Xinvlrng:
+ pushl %eax
+ pushl %edx
+ pushl %ds
+ movl $KDSEL, %eax /* Kernel data selector */
+ mov %ax, %ds
+
+#ifdef COUNT_XINVLTLB_HITS
+ pushl %fs
+ movl $KPSEL, %eax /* Private space selector */
+ mov %ax, %fs
+ movl PCPU(CPUID), %eax
+ popl %fs
+ incl xhits_rng(,%eax,4)
+#endif /* COUNT_XINVLTLB_HITS */
+
+ movl smp_tlb_addr1, %edx
+ movl smp_tlb_addr2, %eax
+1: invlpg (%edx) /* invalidate single page */
+ addl $PAGE_SIZE, %edx
+ cmpl %edx, %eax
+ jb 1b
+
+ movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */
+
+ lock
+ incl smp_tlb_wait
+
+ popl %ds
+ popl %edx
popl %eax
iret
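The intent of the Xinvlrng loop is roughly the C sketch below, assuming smp_tlb_addr2 is the exclusive end of the range as in pmap_invalidate_range() later in this patch; the real handler keeps everything in registers and also acknowledges the APIC before incrementing smp_tlb_wait.

	/* Illustrative only; not code from the commit. */
	vm_offset_t va;

	for (va = smp_tlb_addr1; va < smp_tlb_addr2; va += PAGE_SIZE)
		invlpg(va);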
diff --git a/sys/i386/i386/bios.c b/sys/i386/i386/bios.c
index 0312adf00bff..6e0837cb89b6 100644
--- a/sys/i386/i386/bios.c
+++ b/sys/i386/i386/bios.c
@@ -323,7 +323,8 @@ bios16(struct bios_args *args, char *fmt, ...)
va_list ap;
int flags = BIOSCODE_FLAG | BIOSDATA_FLAG;
u_int i, arg_start, arg_end;
- u_int *pte, *ptd;
+ pt_entry_t *pte;
+ pd_entry_t *ptd;
arg_start = 0xffffffff;
arg_end = 0;
@@ -382,19 +383,19 @@ bios16(struct bios_args *args, char *fmt, ...)
args->seg.code32.base = (u_int)&bios16_jmp & PG_FRAME;
args->seg.code32.limit = 0xffff;
- ptd = (u_int *)rcr3();
+ ptd = (pd_entry_t *)rcr3();
if (ptd == (u_int *)IdlePTD) {
/*
* no page table, so create one and install it.
*/
- pte = (u_int *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
- ptd = (u_int *)((u_int)ptd + KERNBASE);
+ pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
+ ptd = (pd_entry_t *)((u_int)ptd + KERNBASE);
*ptd = vtophys(pte) | PG_RW | PG_V;
} else {
/*
* this is a user-level page table
*/
- pte = (u_int *)&PTmap;
+ pte = PTmap;
}
/*
* install pointer to page 0. we don't need to flush the tlb,
@@ -451,7 +452,7 @@ bios16(struct bios_args *args, char *fmt, ...)
i = bios16_call(&args->r, stack_top);
- if (pte == (u_int *)&PTmap) {
+ if (pte == PTmap) {
*pte = 0; /* remove entry */
} else {
*ptd = 0; /* remove page table */
@@ -461,7 +462,7 @@ bios16(struct bios_args *args, char *fmt, ...)
/*
* XXX only needs to be invlpg(0) but that doesn't work on the 386
*/
- invltlb();
+ pmap_invalidate_all(kernel_pmap);
return (i);
}
diff --git a/sys/i386/i386/db_interface.c b/sys/i386/i386/db_interface.c
index 2ba81daeefee..ec32a58a14fb 100644
--- a/sys/i386/i386/db_interface.c
+++ b/sys/i386/i386/db_interface.c
@@ -276,7 +276,7 @@ db_write_bytes(addr, size, data)
}
}
- invltlb();
+ pmap_invalidate_all(kernel_pmap);
}
dst = (char *)addr;
@@ -292,7 +292,7 @@ db_write_bytes(addr, size, data)
if (ptep1)
*ptep1 = oldmap1;
- invltlb();
+ pmap_invalidate_all(kernel_pmap);
}
}
diff --git a/sys/i386/i386/locore.s b/sys/i386/i386/locore.s
index d06065da0524..94a3a103ac03 100644
--- a/sys/i386/i386/locore.s
+++ b/sys/i386/i386/locore.s
@@ -127,6 +127,7 @@ HIDENAME(tmpstk):
.globl bootinfo
bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */
+ .globl KERNend
KERNend: .long 0 /* phys addr end of kernel (just after bss) */
physfree: .long 0 /* phys addr of next free page */
@@ -381,12 +382,6 @@ begin:
movl IdlePTD,%esi
movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax)
- testl $CPUID_PGE, R(cpu_feature)
- jz 1f
- movl %cr4, %eax
- orl $CR4_PGE, %eax
- movl %eax, %cr4
-1:
pushl physfree /* value of first for init386(first) */
call init386 /* wire 386 chip for unix operation */
@@ -809,14 +804,7 @@ no_kernend:
jne map_read_write
#endif
xorl %edx,%edx
-
-#if !defined(SMP)
- testl $CPUID_PGE, R(cpu_feature)
- jz 2f
- orl $PG_G,%edx
-#endif
-
-2: movl $R(etext),%ecx
+ movl $R(etext),%ecx
addl $PAGE_MASK,%ecx
shrl $PAGE_SHIFT,%ecx
fillkptphys(%edx)
@@ -827,13 +815,7 @@ no_kernend:
andl $~PAGE_MASK, %eax
map_read_write:
movl $PG_RW,%edx
-#if !defined(SMP)
- testl $CPUID_PGE, R(cpu_feature)
- jz 1f
- orl $PG_G,%edx
-#endif
-
-1: movl R(KERNend),%ecx
+ movl R(KERNend),%ecx
subl %eax,%ecx
shrl $PAGE_SHIFT,%ecx
fillkptphys(%edx)
diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c
index 63fec0e9e9c9..29e9c6eb56fb 100644
--- a/sys/i386/i386/mp_machdep.c
+++ b/sys/i386/i386/mp_machdep.c
@@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt;
struct pcb stoppcbs[MAXCPU];
+#ifdef APIC_IO
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1;
+vm_offset_t smp_tlb_addr2;
+volatile int smp_tlb_wait;
+static struct mtx smp_tlb_mtx;
+#endif
+
/*
* Local data and functions.
*/
@@ -336,6 +344,9 @@ init_locks(void)
#ifdef USE_COMLOCK
mtx_init(&com_mtx, "com", NULL, MTX_SPIN);
#endif /* USE_COMLOCK */
+#ifdef APIC_IO
+ mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN);
+#endif
}
/*
@@ -605,6 +616,10 @@ mp_enable(u_int boot_addr)
/* install an inter-CPU IPI for TLB invalidation */
setidt(XINVLTLB_OFFSET, Xinvltlb,
SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLPG_OFFSET, Xinvlpg,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLRNG_OFFSET, Xinvlrng,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
/* install an inter-CPU IPI for forwarding hardclock() */
setidt(XHARDCLOCK_OFFSET, Xhardclock,
@@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr)
return 0; /* return FAILURE */
}
-#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS)
-u_int xhits[MAXCPU];
-SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits),
- "IU", "");
+#if defined(APIC_IO)
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+ sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+ sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+ sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+ 0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+ &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+ &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+ &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+ &ipi_masked_range_size, 0, "");
#endif
/*
* Flush the TLB on all other CPU's
+ */
+static void
+smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ u_int ncpu;
+ register_t eflags;
+
+ ncpu = mp_ncpus - 1; /* does not shootdown self */
+ if (ncpu < 1)
+ return; /* no other cpus */
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ ipi_all_but_self(vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+
+/*
+ * This is about as magic as it gets. fortune(1) has got similar code
+ * for reversing bits in a word. Who thinks up this stuff??
+ *
+ * Yes, it does appear to be consistently faster than:
+ * while (i = ffs(m)) {
+ * m >>= i;
+ * bits++;
+ * }
+ * and
+ * while (lsb = (m & -m)) { // This is magic too
+ * m &= ~lsb; // or: m ^= lsb
+ * bits++;
+ * }
+ * Both of these latter forms do some very strange things on gcc-3.1 with
+ * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
+ * There is probably an SSE or MMX popcnt instruction.
*
- * XXX: Needs to handshake and wait for completion before proceding.
+ * I wonder if this should be in libkern?
+ *
+ * XXX Stop the presses! Another one:
+ * static __inline u_int32_t
+ * popcnt1(u_int32_t v)
+ * {
+ * v -= ((v >> 1) & 0x55555555);
+ * v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+ * v = (v + (v >> 4)) & 0x0F0F0F0F;
+ * return (v * 0x01010101) >> 24;
+ * }
+ * The downside is that it has a multiply. With a pentium3 with
+ * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
+ * an imull, and in that case it is faster. In most other cases
+ * it appears slightly slower.
*/
+static __inline u_int32_t
+popcnt(u_int32_t m)
+{
+
+ m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1);
+ m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2);
+ m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4);
+ m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8);
+ m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16);
+ return m;
+}
+
+static void
+smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ int ncpu, othercpus;
+ register_t eflags;
+
+ othercpus = mp_ncpus - 1;
+ if (mask == (u_int)-1) {
+ ncpu = othercpus;
+ if (ncpu < 1)
+ return;
+ } else {
+ /* XXX there should be a pcpu self mask */
+ mask &= ~(1 << PCPU_GET(cpuid));
+ if (mask == 0)
+ return;
+ ncpu = popcnt(mask);
+ if (ncpu > othercpus) {
+ /* XXX this should be a panic offence */
+ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
+ ncpu, othercpus);
+ ncpu = othercpus;
+ }
+ /* XXX should be a panic, implied by mask == 0 above */
+ if (ncpu < 1)
+ return;
+ }
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ if (mask == (u_int)-1)
+ ipi_all_but_self(vector);
+ else
+ ipi_selected(mask, vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+#endif
+
void
smp_invltlb(void)
{
#if defined(APIC_IO)
- if (smp_started)
- ipi_all_but_self(IPI_INVLTLB);
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_global++;
+#endif
+ }
#endif /* APIC_IO */
}
void
-invlpg(u_int addr)
+smp_invlpg(vm_offset_t addr)
{
- __asm __volatile("invlpg (%0)"::"r"(addr):"memory");
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_range++;
+ ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
void
-invltlb(void)
+smp_masked_invltlb(u_int mask)
{
- u_long temp;
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_global++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3() is
- * inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
+void
+smp_masked_invlpg(u_int mask, vm_offset_t addr)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_range++;
+ ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
@@ -2251,7 +2455,7 @@ ap_init(void)
/* spin */ ;
/* BSP may have changed PTD while we were waiting */
- cpu_invltlb();
+ invltlb();
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
lidt(&r_idt);
@@ -2290,6 +2494,9 @@ ap_init(void)
/* Build our map of 'other' CPUs. */
PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
+ if (bootverbose)
+ apic_dump("ap_init()");
+
printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
if (smp_cpus == mp_ncpus) {
@@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame)
{
mtx_lock_spin(&sched_lock);
- statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame));
+ statclock_process(curthread->td_kse, TRAPF_PC(&frame),
+ TRAPF_USERMODE(&frame));
mtx_unlock_spin(&sched_lock);
}
diff --git a/sys/i386/i386/mpapic.c b/sys/i386/i386/mpapic.c
index c42373b39635..85346bf6fff5 100644
--- a/sys/i386/i386/mpapic.c
+++ b/sys/i386/i386/mpapic.c
@@ -101,9 +101,6 @@ apic_initialize(void)
#endif /** TEST_TEST1 */
lapic.svr = temp;
-
- if (bootverbose)
- apic_dump("apic_initialize()");
}
diff --git a/sys/i386/i386/mptable.c b/sys/i386/i386/mptable.c
index 63fec0e9e9c9..29e9c6eb56fb 100644
--- a/sys/i386/i386/mptable.c
+++ b/sys/i386/i386/mptable.c
@@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt;
struct pcb stoppcbs[MAXCPU];
+#ifdef APIC_IO
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1;
+vm_offset_t smp_tlb_addr2;
+volatile int smp_tlb_wait;
+static struct mtx smp_tlb_mtx;
+#endif
+
/*
* Local data and functions.
*/
@@ -336,6 +344,9 @@ init_locks(void)
#ifdef USE_COMLOCK
mtx_init(&com_mtx, "com", NULL, MTX_SPIN);
#endif /* USE_COMLOCK */
+#ifdef APIC_IO
+ mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN);
+#endif
}
/*
@@ -605,6 +616,10 @@ mp_enable(u_int boot_addr)
/* install an inter-CPU IPI for TLB invalidation */
setidt(XINVLTLB_OFFSET, Xinvltlb,
SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLPG_OFFSET, Xinvlpg,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLRNG_OFFSET, Xinvlrng,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
/* install an inter-CPU IPI for forwarding hardclock() */
setidt(XHARDCLOCK_OFFSET, Xhardclock,
@@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr)
return 0; /* return FAILURE */
}
-#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS)
-u_int xhits[MAXCPU];
-SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits),
- "IU", "");
+#if defined(APIC_IO)
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+ sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+ sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+ sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+ 0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+ &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+ &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+ &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+ &ipi_masked_range_size, 0, "");
#endif
/*
* Flush the TLB on all other CPU's
+ */
+static void
+smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ u_int ncpu;
+ register_t eflags;
+
+ ncpu = mp_ncpus - 1; /* does not shootdown self */
+ if (ncpu < 1)
+ return; /* no other cpus */
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ ipi_all_but_self(vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+
+/*
+ * This is about as magic as it gets. fortune(1) has got similar code
+ * for reversing bits in a word. Who thinks up this stuff??
+ *
+ * Yes, it does appear to be consistently faster than:
+ * while (i = ffs(m)) {
+ * m >>= i;
+ * bits++;
+ * }
+ * and
+ * while (lsb = (m & -m)) { // This is magic too
+ * m &= ~lsb; // or: m ^= lsb
+ * bits++;
+ * }
+ * Both of these latter forms do some very strange things on gcc-3.1 with
+ * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
+ * There is probably an SSE or MMX popcnt instruction.
*
- * XXX: Needs to handshake and wait for completion before proceding.
+ * I wonder if this should be in libkern?
+ *
+ * XXX Stop the presses! Another one:
+ * static __inline u_int32_t
+ * popcnt1(u_int32_t v)
+ * {
+ * v -= ((v >> 1) & 0x55555555);
+ * v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+ * v = (v + (v >> 4)) & 0x0F0F0F0F;
+ * return (v * 0x01010101) >> 24;
+ * }
+ * The downside is that it has a multiply. With a pentium3 with
+ * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
+ * an imull, and in that case it is faster. In most other cases
+ * it appears slightly slower.
*/
+static __inline u_int32_t
+popcnt(u_int32_t m)
+{
+
+ m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1);
+ m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2);
+ m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4);
+ m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8);
+ m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16);
+ return m;
+}
+
+static void
+smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ int ncpu, othercpus;
+ register_t eflags;
+
+ othercpus = mp_ncpus - 1;
+ if (mask == (u_int)-1) {
+ ncpu = othercpus;
+ if (ncpu < 1)
+ return;
+ } else {
+ /* XXX there should be a pcpu self mask */
+ mask &= ~(1 << PCPU_GET(cpuid));
+ if (mask == 0)
+ return;
+ ncpu = popcnt(mask);
+ if (ncpu > othercpus) {
+ /* XXX this should be a panic offence */
+ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
+ ncpu, othercpus);
+ ncpu = othercpus;
+ }
+ /* XXX should be a panic, implied by mask == 0 above */
+ if (ncpu < 1)
+ return;
+ }
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ if (mask == (u_int)-1)
+ ipi_all_but_self(vector);
+ else
+ ipi_selected(mask, vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+#endif
+
void
smp_invltlb(void)
{
#if defined(APIC_IO)
- if (smp_started)
- ipi_all_but_self(IPI_INVLTLB);
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_global++;
+#endif
+ }
#endif /* APIC_IO */
}
void
-invlpg(u_int addr)
+smp_invlpg(vm_offset_t addr)
{
- __asm __volatile("invlpg (%0)"::"r"(addr):"memory");
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_range++;
+ ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
void
-invltlb(void)
+smp_masked_invltlb(u_int mask)
{
- u_long temp;
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_global++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3() is
- * inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
+void
+smp_masked_invlpg(u_int mask, vm_offset_t addr)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_range++;
+ ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
@@ -2251,7 +2455,7 @@ ap_init(void)
/* spin */ ;
/* BSP may have changed PTD while we were waiting */
- cpu_invltlb();
+ invltlb();
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
lidt(&r_idt);
@@ -2290,6 +2494,9 @@ ap_init(void)
/* Build our map of 'other' CPUs. */
PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
+ if (bootverbose)
+ apic_dump("ap_init()");
+
printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
if (smp_cpus == mp_ncpus) {
@@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame)
{
mtx_lock_spin(&sched_lock);
- statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame));
+ statclock_process(curthread->td_kse, TRAPF_PC(&frame),
+ TRAPF_USERMODE(&frame));
mtx_unlock_spin(&sched_lock);
}
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 87cd8b9ceb94..5de170703aef 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -68,7 +68,6 @@
* and to when physical maps must be made correct.
*/
-#include "opt_disable_pse.h"
#include "opt_pmap.h"
#include "opt_msgbuf.h"
#include "opt_kstack_pages.h"
@@ -85,6 +84,9 @@
#include <sys/user.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
+#ifdef SMP
+#include <sys/smp.h>
+#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -97,6 +99,7 @@
#include <vm/vm_pager.h>
#include <vm/uma.h>
+#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
@@ -162,6 +165,7 @@ static vm_object_t kptobj;
static int nkpt;
vm_offset_t kernel_vm_end;
+extern u_int32_t KERNend;
/*
* Data for the pv entry allocation mechanism
@@ -257,10 +261,10 @@ static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
vm_offset_t newaddr = addr;
+
#ifndef DISABLE_PSE
- if (cpu_feature & CPUID_PSE) {
+ if (cpu_feature & CPUID_PSE)
newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
- }
#endif
return newaddr;
}
@@ -362,10 +366,9 @@ pmap_bootstrap(firstaddr, loadaddr)
PTD[i] = 0;
pgeflag = 0;
-#if !defined(SMP) /* XXX - see also mp_machdep.c */
- if (cpu_feature & CPUID_PGE) {
+#ifndef DISABLE_PG_G
+ if (cpu_feature & CPUID_PGE)
pgeflag = PG_G;
- }
#endif
/*
@@ -378,7 +381,7 @@ pmap_bootstrap(firstaddr, loadaddr)
*/
pdir4mb = 0;
-#if !defined(DISABLE_PSE)
+#ifndef DISABLE_PSE
if (cpu_feature & CPUID_PSE) {
pd_entry_t ptditmp;
/*
@@ -389,29 +392,16 @@ pmap_bootstrap(firstaddr, loadaddr)
ptditmp &= ~(NBPDR - 1);
ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
pdir4mb = ptditmp;
-
-#if !defined(SMP)
- /*
- * Enable the PSE mode.
- */
- load_cr4(rcr4() | CR4_PSE);
-
- /*
- * We can do the mapping here for the single processor
- * case. We simply ignore the old page table page from
- * now on.
- */
- /*
- * For SMP, we still need 4K pages to bootstrap APs,
- * PSE will be enabled as soon as all APs are up.
- */
- PTD[KPTDI] = (pd_entry_t) ptditmp;
- kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp;
- invltlb();
-#endif
}
#endif
-
+#ifndef SMP
+ /*
+ * Turn on PGE/PSE. SMP does this later on since the
+ * 4K page tables are required for AP boot (for now).
+ * XXX fixme.
+ */
+ pmap_set_opt();
+#endif
#ifdef SMP
if (cpu_apic_address == 0)
panic("pmap_bootstrap: no local apic! (non-SMP hardware?)");
@@ -420,26 +410,55 @@ pmap_bootstrap(firstaddr, loadaddr)
SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag |
(cpu_apic_address & PG_FRAME));
#endif
-
invltlb();
}
-#ifdef SMP
/*
- * Set 4mb pdir for mp startup
+ * Enable 4MB page mode for MP startup. Turn on PG_G support.
+ * BSP will run this after all the AP's have started up.
*/
void
pmap_set_opt(void)
{
+ pt_entry_t *pte;
+ vm_offset_t va, endva;
+
+ if (pgeflag && (cpu_feature & CPUID_PGE)) {
+ load_cr4(rcr4() | CR4_PGE);
+ invltlb(); /* Insurance */
+ }
+#ifndef DISABLE_PSE
if (pseflag && (cpu_feature & CPUID_PSE)) {
load_cr4(rcr4() | CR4_PSE);
- if (pdir4mb && PCPU_GET(cpuid) == 0) { /* only on BSP */
+ invltlb(); /* Insurance */
+ }
+#endif
+ if (PCPU_GET(cpuid) == 0) {
+#ifndef DISABLE_PSE
+ if (pdir4mb) {
kernel_pmap->pm_pdir[KPTDI] = PTD[KPTDI] = pdir4mb;
- cpu_invltlb();
+ invltlb(); /* Insurance */
}
+#endif
+ if (pgeflag) {
+ /* Turn on PG_G for text, data, bss pages. */
+ va = (vm_offset_t)btext;
+ endva = KERNBASE + KERNend;
+ while (va < endva) {
+ pte = vtopte(va);
+ if (*pte)
+ *pte |= pgeflag;
+ va += PAGE_SIZE;
+ }
+ invltlb(); /* Insurance */
+ }
+ /*
+ * We do not need to broadcast the invltlb here, because
+ * each AP does it the moment it is released from the boot
+ * lock. See ap_init().
+ */
}
}
-#endif
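The point of the PG_G pass above is that global mappings survive the CR3 reload done on every context switch, so kernel text/data/bss stays resident in the TLB; the flip side, and the reason the DISABLE_PG_G option exists, is that such entries can no longer be flushed by reloading CR3. A hedged fragment (pte and va are hypothetical) to illustrate:

	*pte = 0;		/* hypothetical kernel PTE teardown */
	load_cr3(rcr3());	/* does not evict a PG_G translation */
	invlpg(va);		/* this (plus an IPI for remote CPUs) does */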
void *
pmap_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
@@ -553,43 +572,151 @@ pmap_track_modified(vm_offset_t va)
return 0;
}
-static PMAP_INLINE void
-invltlb_1pg(vm_offset_t va)
-{
#ifdef I386_CPU
- invltlb();
-#else
- invlpg(va);
-#endif
+/*
+ * i386 only has "invalidate everything" and no SMP to worry about.
+ */
+PMAP_INLINE void
+pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
+{
+
+ if (pmap == kernel_pmap || pmap->pm_active)
+ invltlb();
}
-static __inline void
+PMAP_INLINE void
+pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+
+ if (pmap == kernel_pmap || pmap->pm_active)
+ invltlb();
+}
+
+PMAP_INLINE void
+pmap_invalidate_all(pmap_t pmap)
+{
+
+ if (pmap == kernel_pmap || pmap->pm_active)
+ invltlb();
+}
+#else /* !I386_CPU */
+#ifdef SMP
+/*
+ * For SMP, these functions have to use the IPI mechanism for coherence.
+ */
+void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
-#if defined(SMP)
- if (pmap->pm_active & PCPU_GET(cpumask))
- cpu_invlpg((void *)va);
- if (pmap->pm_active & PCPU_GET(other_cpus))
- smp_invltlb();
-#else
- if (pmap->pm_active)
- invltlb_1pg(va);
-#endif
+ u_int cpumask;
+ u_int other_cpus;
+
+ critical_enter();
+ /*
+ * We need to disable interrupt preemption but MUST NOT have
+ * interrupts disabled here.
+ * XXX we may need to hold schedlock to get a coherent pm_active
+ */
+ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
+ invlpg(va);
+ smp_invlpg(va);
+ } else {
+ cpumask = PCPU_GET(cpumask);
+ other_cpus = PCPU_GET(other_cpus);
+ if (pmap->pm_active & cpumask)
+ invlpg(va);
+ if (pmap->pm_active & other_cpus)
+ smp_masked_invlpg(pmap->pm_active & other_cpus, va);
+ }
+ critical_exit();
}
-static __inline void
+void
+pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ u_int cpumask;
+ u_int other_cpus;
+ vm_offset_t addr;
+
+ critical_enter();
+ /*
+ * We need to disable interrupt preemption but MUST NOT have
+ * interrupts disabled here.
+ * XXX we may need to hold schedlock to get a coherent pm_active
+ */
+ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ smp_invlpg_range(sva, eva);
+ } else {
+ cpumask = PCPU_GET(cpumask);
+ other_cpus = PCPU_GET(other_cpus);
+ if (pmap->pm_active & cpumask)
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ if (pmap->pm_active & other_cpus)
+ smp_masked_invlpg_range(pmap->pm_active & other_cpus,
+ sva, eva);
+ }
+ critical_exit();
+}
+
+void
pmap_invalidate_all(pmap_t pmap)
{
-#if defined(SMP)
- if (pmap->pm_active & PCPU_GET(cpumask))
- cpu_invltlb();
- if (pmap->pm_active & PCPU_GET(other_cpus))
+ u_int cpumask;
+ u_int other_cpus;
+
+ critical_enter();
+ /*
+ * We need to disable interrupt preemption but MUST NOT have
+ * interrupts disabled here.
+ * XXX we may need to hold schedlock to get a coherent pm_active
+ */
+ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
+ invltlb();
smp_invltlb();
-#else
- if (pmap->pm_active)
+ } else {
+ cpumask = PCPU_GET(cpumask);
+ other_cpus = PCPU_GET(other_cpus);
+ if (pmap->pm_active & cpumask)
+ invltlb();
+ if (pmap->pm_active & other_cpus)
+ smp_masked_invltlb(pmap->pm_active & other_cpus);
+ }
+ critical_exit();
+}
+#else /* !SMP */
+/*
+ * Normal, non-SMP, 486+ invalidation functions.
+ * We inline these within pmap.c for speed.
+ */
+PMAP_INLINE void
+pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
+{
+
+ if (pmap == kernel_pmap || pmap->pm_active)
+ invlpg(va);
+}
+
+PMAP_INLINE void
+pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ vm_offset_t addr;
+
+ if (pmap == kernel_pmap || pmap->pm_active)
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+}
+
+PMAP_INLINE void
+pmap_invalidate_all(pmap_t pmap)
+{
+
+ if (pmap == kernel_pmap || pmap->pm_active)
invltlb();
-#endif
}
+#endif /* !SMP */
+#endif /* !I386_CPU */
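Callers elsewhere in the patch now use the same three entry points regardless of I386_CPU or SMP, and only CPUs that actually have the pmap active are sent an IPI. A minimal, hypothetical caller in the style of pmap_remove_pte():

	*pte = 0;				/* drop the mapping */
	pmap_invalidate_page(pmap, va);		/* local invlpg plus masked IPI as needed */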
/*
* Return an address which is the base of the Virtual mapping of
@@ -613,12 +740,7 @@ get_ptbase(pmap)
/* otherwise, we are alternate address space */
if (frame != (APTDpde & PG_FRAME)) {
APTDpde = (pd_entry_t) (frame | PG_RW | PG_V);
-#if defined(SMP)
- /* The page directory is not shared between CPUs */
- cpu_invltlb();
-#else
invltlb();
-#endif
}
return APTmap;
}
@@ -647,7 +769,7 @@ pmap_pte_quick(pmap, va)
newpf = pde & PG_FRAME;
if (((*PMAP1) & PG_FRAME) != newpf) {
*PMAP1 = newpf | PG_RW | PG_V;
- invltlb_1pg((vm_offset_t) PADDR1);
+ pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR1);
}
return PADDR1 + (index & (NPTEPG - 1));
}
@@ -692,34 +814,29 @@ pmap_extract(pmap, va)
***************************************************/
/*
- * add a wired page to the kva
- * note that in order for the mapping to take effect -- you
- * should do a invltlb after doing the pmap_kenter...
+ * Add a wired page to the kva.
+ * Note: not SMP coherent.
*/
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_offset_t pa)
{
pt_entry_t *pte;
- pt_entry_t npte, opte;
- npte = pa | PG_RW | PG_V | pgeflag;
pte = vtopte(va);
- opte = *pte;
- *pte = npte;
- invltlb_1pg(va);
+ *pte = pa | PG_RW | PG_V | pgeflag;
}
/*
- * remove a page from the kernel pagetables
+ * Remove a page from the kernel pagetables.
+ * Note: not SMP coherent.
*/
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
- register pt_entry_t *pte;
+ pt_entry_t *pte;
pte = vtopte(va);
*pte = 0;
- invltlb_1pg(va);
}
/*
@@ -737,13 +854,15 @@ pmap_kremove(vm_offset_t va)
vm_offset_t
pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
{
- vm_offset_t sva = *virt;
- vm_offset_t va = sva;
+ vm_offset_t va, sva;
+
+ va = sva = *virt;
while (start < end) {
pmap_kenter(va, start);
va += PAGE_SIZE;
start += PAGE_SIZE;
}
+ pmap_invalidate_range(kernel_pmap, sva, va);
*virt = va;
return (sva);
}
@@ -756,64 +875,45 @@ pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
* page modification or references recorded.
* Note that old mappings are simply written
* over. The page *must* be wired.
+ * Note: SMP coherent. Uses a ranged shootdown IPI.
*/
void
-pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
+pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
{
- vm_offset_t end_va;
-
- end_va = va + count * PAGE_SIZE;
-
- while (va < end_va) {
- pt_entry_t *pte;
+ vm_offset_t va;
- pte = vtopte(va);
- *pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag;
-#ifdef SMP
- cpu_invlpg((void *)va);
-#else
- invltlb_1pg(va);
-#endif
+ va = sva;
+ while (count-- > 0) {
+ pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
va += PAGE_SIZE;
m++;
}
-#ifdef SMP
- smp_invltlb();
-#endif
+ pmap_invalidate_range(kernel_pmap, sva, va);
}
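The rewritten pmap_qenter()/pmap_qremove() batch the PTE writes and then issue a single ranged shootdown instead of one IPI per page; pmap_new_thread() later in this patch uses exactly this pattern for kernel stacks. A minimal sketch, assuming kva and ma[] are set up by the caller:

	/* Hypothetical kernel-stack style usage: */
	pmap_qenter(kva, ma, KSTACK_PAGES);	/* map: one ranged IPI total */
	pmap_qremove(kva, KSTACK_PAGES);	/* unmap: likewise */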
/*
- * this routine jerks page mappings from the
+ * This routine tears out page mappings from the
* kernel -- it is meant only for temporary mappings.
+ * Note: SMP coherent. Uses a ranged shootdown IPI.
*/
void
-pmap_qremove(vm_offset_t va, int count)
+pmap_qremove(vm_offset_t sva, int count)
{
- vm_offset_t end_va;
-
- end_va = va + count*PAGE_SIZE;
-
- while (va < end_va) {
- pt_entry_t *pte;
+ vm_offset_t va;
- pte = vtopte(va);
- *pte = 0;
-#ifdef SMP
- cpu_invlpg((void *)va);
-#else
- invltlb_1pg(va);
-#endif
+ va = sva;
+ while (count-- > 0) {
+ pmap_kremove(va);
va += PAGE_SIZE;
}
-#ifdef SMP
- smp_invltlb();
-#endif
+ pmap_invalidate_range(kernel_pmap, sva, va);
}
static vm_page_t
pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
vm_page_t m;
+
retry:
m = vm_page_lookup(object, pindex);
if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))
@@ -829,14 +929,11 @@ retry:
void
pmap_new_thread(struct thread *td)
{
-#ifdef I386_CPU
- int updateneeded = 0;
-#endif
int i;
+ vm_page_t ma[KSTACK_PAGES];
vm_object_t ksobj;
vm_page_t m;
vm_offset_t ks;
- pt_entry_t *ptek, oldpte;
/*
* allocate object for the kstack
@@ -844,39 +941,21 @@ pmap_new_thread(struct thread *td)
ksobj = vm_object_allocate(OBJT_DEFAULT, KSTACK_PAGES);
td->td_kstack_obj = ksobj;
-#ifdef KSTACK_GUARD
/* get a kernel virtual address for the kstack for this thread */
+#ifdef KSTACK_GUARD
ks = kmem_alloc_nofault(kernel_map, (KSTACK_PAGES + 1) * PAGE_SIZE);
if (ks == 0)
panic("pmap_new_thread: kstack allocation failed");
-
- /*
- * Set the first page to be the unmapped guard page.
- */
- ptek = vtopte(ks);
- oldpte = *ptek;
- *ptek = 0;
- if (oldpte) {
-#ifdef I386_CPU
- updateneeded = 1;
-#else
- invlpg(ks);
-#endif
- }
-
- /*
- * move to the next page, which is where the real stack starts.
- */
+ if (*vtopte(ks) != 0)
+ pmap_qremove(ks, 1);
ks += PAGE_SIZE;
td->td_kstack = ks;
- ptek++;
#else
/* get a kernel virtual address for the kstack for this thread */
ks = kmem_alloc_nofault(kernel_map, KSTACK_PAGES * PAGE_SIZE);
if (ks == 0)
panic("pmap_new_thread: kstack allocation failed");
td->td_kstack = ks;
- ptek = vtopte(ks);
#endif
/*
* For the length of the stack, link in a real page of ram for each
@@ -887,6 +966,7 @@ pmap_new_thread(struct thread *td)
* Get a kernel stack page
*/
m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+ ma[i] = m;
/*
* Wire the page
@@ -894,28 +974,12 @@ pmap_new_thread(struct thread *td)
m->wire_count++;
cnt.v_wire_count++;
- /*
- * Enter the page into the kernel address space.
- */
- oldpte = ptek[i];
- ptek[i] = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
- if (oldpte) {
-#ifdef I386_CPU
- updateneeded = 1;
-#else
- invlpg(ks + (i * PAGE_SIZE));
-#endif
- }
-
vm_page_wakeup(m);
vm_page_flag_clear(m, PG_ZERO);
vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
m->valid = VM_PAGE_BITS_ALL;
}
-#ifdef I386_CPU
- if (updateneeded)
- invltlb();
-#endif
+ pmap_qenter(ks, ma, KSTACK_PAGES);
}
/*
@@ -930,26 +994,18 @@ pmap_dispose_thread(td)
vm_object_t ksobj;
vm_offset_t ks;
vm_page_t m;
- pt_entry_t *ptek;
ksobj = td->td_kstack_obj;
ks = td->td_kstack;
- ptek = vtopte(ks);
+ pmap_qremove(ks, KSTACK_PAGES);
for (i = 0; i < KSTACK_PAGES; i++) {
m = vm_page_lookup(ksobj, i);
if (m == NULL)
panic("pmap_dispose_thread: kstack already missing?");
vm_page_busy(m);
- ptek[i] = 0;
-#ifndef I386_CPU
- invlpg(ks + (i * PAGE_SIZE));
-#endif
vm_page_unwire(m, 0);
vm_page_free(m);
}
-#ifdef I386_CPU
- invltlb();
-#endif
/*
* Free the space that this stack was mapped to in the kernel
* address map.
@@ -976,13 +1032,13 @@ pmap_swapout_thread(td)
ksobj = td->td_kstack_obj;
ks = td->td_kstack;
+ pmap_qremove(ks, KSTACK_PAGES);
for (i = 0; i < KSTACK_PAGES; i++) {
m = vm_page_lookup(ksobj, i);
if (m == NULL)
panic("pmap_swapout_thread: kstack already missing?");
vm_page_dirty(m);
vm_page_unwire(m, 0);
- pmap_kremove(ks + i * PAGE_SIZE);
}
}
@@ -994,6 +1050,7 @@ pmap_swapin_thread(td)
struct thread *td;
{
int i, rv;
+ vm_page_t ma[KSTACK_PAGES];
vm_object_t ksobj;
vm_offset_t ks;
vm_page_t m;
@@ -1002,7 +1059,6 @@ pmap_swapin_thread(td)
ks = td->td_kstack;
for (i = 0; i < KSTACK_PAGES; i++) {
m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
- pmap_kenter(ks + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m));
if (m->valid != VM_PAGE_BITS_ALL) {
rv = vm_pager_get_pages(ksobj, &m, 1, 0);
if (rv != VM_PAGER_OK)
@@ -1010,10 +1066,12 @@ pmap_swapin_thread(td)
m = vm_page_lookup(ksobj, i);
m->valid = VM_PAGE_BITS_ALL;
}
+ ma[i] = m;
vm_page_wire(m);
vm_page_wakeup(m);
vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
}
+ pmap_qenter(ks, ma, KSTACK_PAGES);
}
/***************************************************
@@ -1108,7 +1166,8 @@ pmap_pinit0(pmap)
{
pmap->pm_pdir =
(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
- pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD);
+ pmap_kenter((vm_offset_t)pmap->pm_pdir, (vm_offset_t)IdlePTD);
+ invlpg((vm_offset_t)pmap->pm_pdir);
pmap->pm_ptphint = NULL;
pmap->pm_active = 0;
TAILQ_INIT(&pmap->pm_pvlist);
@@ -1153,7 +1212,7 @@ pmap_pinit(pmap)
vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/
ptdpg->valid = VM_PAGE_BITS_ALL;
- pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
+ pmap_qenter((vm_offset_t) pmap->pm_pdir, &ptdpg, 1);
if ((ptdpg->flags & PG_ZERO) == 0)
bzero(pmap->pm_pdir, PAGE_SIZE);
@@ -1616,7 +1675,7 @@ pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
* PG_G.
*/
if (oldpte & PG_G)
- invlpg(va);
+ pmap_invalidate_page(kernel_pmap, va);
pmap->pm_stats.resident_count -= 1;
if (oldpte & PG_MANAGED) {
m = PHYS_TO_VM_PAGE(oldpte);
@@ -2028,13 +2087,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
if ((origpte & PG_RW) == 0) {
*pte |= PG_RW;
-#ifdef SMP
- cpu_invlpg((void *)va);
- if (pmap->pm_active & PCPU_GET(other_cpus))
- smp_invltlb();
-#else
- invltlb_1pg(va);
-#endif
+ pmap_invalidate_page(pmap, va);
}
return;
}
@@ -2102,13 +2155,7 @@ validate:
if ((origpte & ~(PG_M|PG_A)) != newpte) {
*pte = newpte | PG_A;
/*if (origpte)*/ {
-#ifdef SMP
- cpu_invlpg((void *)va);
- if (pmap->pm_active & PCPU_GET(other_cpus))
- smp_invltlb();
-#else
- invltlb_1pg(va);
-#endif
+ pmap_invalidate_page(pmap, va);
}
}
}
@@ -2222,7 +2269,11 @@ retry:
void *
pmap_kenter_temporary(vm_offset_t pa, int i)
{
- pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
+ vm_offset_t va;
+
+ va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
+ pmap_kenter(va, pa);
+ invlpg(va);
return ((void *)crashdumpmap);
}
@@ -2527,7 +2578,6 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
vm_offset_t pdnxt;
pd_entry_t src_frame, dst_frame;
vm_page_t m;
- pd_entry_t saved_pde;
if (dst_addr != src_addr)
return;
@@ -2537,17 +2587,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
return;
dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
- if (dst_frame != (APTDpde & PG_FRAME)) {
- APTDpde = dst_frame | PG_RW | PG_V;
-#if defined(SMP)
- /* The page directory is not shared between CPUs */
- cpu_invltlb();
-#else
- invltlb();
-#endif
- }
- saved_pde = APTDpde & (PG_FRAME | PG_RW | PG_V);
- for(addr = src_addr; addr < end_addr; addr = pdnxt) {
+ for (addr = src_addr; addr < end_addr; addr = pdnxt) {
pt_entry_t *src_pte, *dst_pte;
vm_page_t dstmpte, srcmpte;
pd_entry_t srcptepaddr;
@@ -2588,6 +2628,14 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
if (pdnxt > end_addr)
pdnxt = end_addr;
+ /*
+ * Have to recheck this before every avtopte() call below
+ * in case we have blocked and something else used APTDpde.
+ */
+ if (dst_frame != (APTDpde & PG_FRAME)) {
+ APTDpde = dst_frame | PG_RW | PG_V;
+ invltlb();
+ }
src_pte = vtopte(addr);
dst_pte = avtopte(addr);
while (addr < pdnxt) {
@@ -2603,16 +2651,6 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
* block.
*/
dstmpte = pmap_allocpte(dst_pmap, addr);
- if ((APTDpde & PG_FRAME) !=
- (saved_pde & PG_FRAME)) {
- APTDpde = saved_pde;
-printf ("IT HAPPENNED!");
-#if defined(SMP)
- cpu_invltlb();
-#else
- invltlb();
-#endif
- }
if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
/*
* Clear the modified and
@@ -2644,14 +2682,13 @@ printf ("IT HAPPENNED!");
void
pmap_zero_page(vm_page_t m)
{
- vm_offset_t phys = VM_PAGE_TO_PHYS(m);
+ vm_offset_t phys;
+ phys = VM_PAGE_TO_PHYS(m);
if (*CMAP2)
panic("pmap_zero_page: CMAP2 busy");
-
*CMAP2 = PG_V | PG_RW | phys | PG_A | PG_M;
- invltlb_1pg((vm_offset_t)CADDR2);
-
+ pmap_invalidate_page(kernel_pmap, (vm_offset_t)CADDR2);
#if defined(I686_CPU)
if (cpu_class == CPUCLASS_686)
i686_pagezero(CADDR2);
@@ -2670,14 +2707,13 @@ pmap_zero_page(vm_page_t m)
void
pmap_zero_page_area(vm_page_t m, int off, int size)
{
- vm_offset_t phys = VM_PAGE_TO_PHYS(m);
+ vm_offset_t phys;
+ phys = VM_PAGE_TO_PHYS(m);
if (*CMAP2)
panic("pmap_zero_page: CMAP2 busy");
-
*CMAP2 = PG_V | PG_RW | phys | PG_A | PG_M;
- invltlb_1pg((vm_offset_t)CADDR2);
-
+ pmap_invalidate_page(kernel_pmap, (vm_offset_t)CADDR2);
#if defined(I686_CPU)
if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE)
i686_pagezero(CADDR2);
@@ -2696,20 +2732,13 @@ pmap_zero_page_area(vm_page_t m, int off, int size)
void
pmap_zero_page_idle(vm_page_t m)
{
- vm_offset_t phys = VM_PAGE_TO_PHYS(m);
+ vm_offset_t phys;
+ phys = VM_PAGE_TO_PHYS(m);
if (*CMAP3)
panic("pmap_zero_page: CMAP3 busy");
-
*CMAP3 = PG_V | PG_RW | phys | PG_A | PG_M;
-#ifdef SMP
- mtx_lock(&Giant); /* IPI sender not MPSAFE */
-#endif
- invltlb_1pg((vm_offset_t)CADDR3);
-#ifdef SMP
- mtx_unlock(&Giant);
-#endif
-
+ invlpg((vm_offset_t)CADDR3); /* SMP: local cpu only */
#if defined(I686_CPU)
if (cpu_class == CPUCLASS_686)
i686_pagezero(CADDR3);
@@ -2733,18 +2762,15 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
panic("pmap_copy_page: CMAP1 busy");
if (*CMAP2)
panic("pmap_copy_page: CMAP2 busy");
-
*CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
-#ifdef I386_CPU
- invltlb();
-#else
- invlpg((u_int)CADDR1);
- invlpg((u_int)CADDR2);
-#endif
-
+ /*
+ * XXX we "know" that CADDR2 immediately follows CADDR1 and use
+ * that to save an IPI on SMP systems.
+ */
+ pmap_invalidate_range(kernel_pmap, (vm_offset_t)CADDR1,
+ (vm_offset_t)CADDR2 + PAGE_SIZE);
bcopy(CADDR1, CADDR2, PAGE_SIZE);
-
*CMAP1 = 0;
*CMAP2 = 0;
}
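The "XXX we know" comment relies on CADDR1 and CADDR2 being adjacent 4K windows, so the two per-page flushes collapse into one ranged call covering [CADDR1, CADDR1 + 2*PAGE_SIZE); on SMP that saves an IPI. An equivalent spelling of the call above, given that adjacency:

	pmap_invalidate_range(kernel_pmap, (vm_offset_t)CADDR1,
	    (vm_offset_t)CADDR1 + 2 * PAGE_SIZE);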
@@ -3176,18 +3202,11 @@ pmap_mapdev(pa, size)
for (tmpva = va; size > 0; ) {
pte = vtopte(tmpva);
*pte = pa | PG_RW | PG_V | pgeflag;
-#ifdef SMP
- cpu_invlpg((void *)tmpva);
-#else
- invltlb_1pg(tmpva);
-#endif
size -= PAGE_SIZE;
tmpva += PAGE_SIZE;
pa += PAGE_SIZE;
}
-#ifdef SMP
- smp_invltlb();
-#endif
+ pmap_invalidate_range(kernel_pmap, va, tmpva);
return ((void *)(va + offset));
}
@@ -3205,15 +3224,8 @@ pmap_unmapdev(va, size)
for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
pte = vtopte(tmpva);
*pte = 0;
-#ifdef SMP
- cpu_invlpg((void *)tmpva);
-#else
- invltlb_1pg(tmpva);
-#endif
}
-#ifdef SMP
- smp_invltlb();
-#endif
+ pmap_invalidate_range(kernel_pmap, va, tmpva);
kmem_free(kernel_map, base, size);
}
diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s
index c1f38995f135..23c611cfbd25 100644
--- a/sys/i386/i386/support.s
+++ b/sys/i386/i386/support.s
@@ -1596,42 +1596,6 @@ ENTRY(ssdtosd)
popl %ebx
ret
-/* load_cr0(cr0) */
-ENTRY(load_cr0)
- movl 4(%esp),%eax
- movl %eax,%cr0
- ret
-
-/* rcr0() */
-ENTRY(rcr0)
- movl %cr0,%eax
- ret
-
-/* rcr3() */
-ENTRY(rcr3)
- movl %cr3,%eax
- ret
-
-/* void load_cr3(caddr_t cr3) */
-ENTRY(load_cr3)
-#ifdef SWTCH_OPTIM_STATS
- incl tlb_flush_count
-#endif
- movl 4(%esp),%eax
- movl %eax,%cr3
- ret
-
-/* rcr4() */
-ENTRY(rcr4)
- movl %cr4,%eax
- ret
-
-/* void load_cr4(caddr_t cr4) */
-ENTRY(load_cr4)
- movl 4(%esp),%eax
- movl %eax,%cr4
- ret
-
/* void reset_dbregs() */
ENTRY(reset_dbregs)
movl $0,%eax
diff --git a/sys/i386/i386/vm86.c b/sys/i386/i386/vm86.c
index eb0c98bff831..c03757fbfb72 100644
--- a/sys/i386/i386/vm86.c
+++ b/sys/i386/i386/vm86.c
@@ -603,6 +603,7 @@ vm86_datacall(intnum, vmf, vmc)
entry = vmc->pmap[i].pte_num;
vmc->pmap[i].old_pte = pte[entry];
pte[entry] = page | PG_V | PG_RW | PG_U;
+ pmap_invalidate_page(kernel_pmap, vmc->pmap[i].kva);
}
vmf->vmf_trapno = intnum;
@@ -611,6 +612,7 @@ vm86_datacall(intnum, vmf, vmc)
for (i = 0; i < vmc->npages; i++) {
entry = vmc->pmap[i].pte_num;
pte[entry] = vmc->pmap[i].old_pte;
+ pmap_invalidate_page(kernel_pmap, vmc->pmap[i].kva);
}
mtx_unlock(&vm86_lock);
diff --git a/sys/i386/include/cpufunc.h b/sys/i386/include/cpufunc.h
index 2e64138de4c7..0896659c864e 100644
--- a/sys/i386/include/cpufunc.h
+++ b/sys/i386/include/cpufunc.h
@@ -237,62 +237,6 @@ invd(void)
__asm __volatile("invd");
}
-#if defined(SMP) && defined(_KERNEL)
-
-/*
- * When using APIC IPI's, invlpg() is not simply the invlpg instruction
- * (this is a bug) and the inlining cost is prohibitive since the call
- * executes into the IPI transmission system.
- */
-void invlpg(u_int addr);
-void invltlb(void);
-
-static __inline void
-cpu_invlpg(void *addr)
-{
- __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
-}
-
-static __inline void
-cpu_invltlb(void)
-{
- u_int temp;
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3()
- * is inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp)
- : : "memory");
-#if defined(SWTCH_OPTIM_STATS)
- ++tlb_flush_count;
-#endif
-}
-
-#else /* !(SMP && _KERNEL) */
-
-static __inline void
-invlpg(u_int addr)
-{
- __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
-}
-
-static __inline void
-invltlb(void)
-{
- u_int temp;
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3()
- * is inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp)
- : : "memory");
-#ifdef SWTCH_OPTIM_STATS
- ++tlb_flush_count;
-#endif
-}
-
-#endif /* SMP && _KERNEL */
-
static __inline u_short
inw(u_int port)
{
@@ -364,15 +308,6 @@ ia32_pause(void)
}
static __inline u_int
-rcr2(void)
-{
- u_int data;
-
- __asm __volatile("movl %%cr2,%0" : "=r" (data));
- return (data);
-}
-
-static __inline u_int
read_eflags(void)
{
u_int ef;
@@ -426,6 +361,86 @@ wrmsr(u_int msr, u_int64_t newval)
__asm __volatile("wrmsr" : : "A" (newval), "c" (msr));
}
+static __inline void
+load_cr0(u_int data)
+{
+
+ __asm __volatile("movl %0,%%cr0" : : "r" (data));
+}
+
+static __inline u_int
+rcr0(void)
+{
+ u_int data;
+
+ __asm __volatile("movl %%cr0,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline u_int
+rcr2(void)
+{
+ u_int data;
+
+ __asm __volatile("movl %%cr2,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_cr3(u_int data)
+{
+
+ __asm __volatile("movl %0,%%cr3" : : "r" (data) : "memory");
+#if defined(SWTCH_OPTIM_STATS)
+ ++tlb_flush_count;
+#endif
+}
+
+static __inline u_int
+rcr3(void)
+{
+ u_int data;
+
+ __asm __volatile("movl %%cr3,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_cr4(u_int data)
+{
+ __asm __volatile("movl %0,%%cr4" : : "r" (data));
+}
+
+static __inline u_int
+rcr4(void)
+{
+ u_int data;
+
+ __asm __volatile("movl %%cr4,%0" : "=r" (data));
+ return (data);
+}
+
+/*
+ * Global TLB flush (except for those for pages marked PG_G)
+ */
+static __inline void
+invltlb(void)
+{
+
+ load_cr3(rcr3());
+}
+
+/*
+ * TLB flush for an individual page (even if it has PG_G).
+ * Only works on 486+ CPUs (i386 does not have PG_G).
+ */
+static __inline void
+invlpg(u_int addr)
+{
+
+ __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
+}
+
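
invltlb() above leaves PG_G entries in place; flushing those requires either per-page invlpg() or briefly clearing CR4.PGE. A minimal sketch of the heavyweight fallback, assuming CR4_PGE from <machine/specialreg.h> (not part of this change):

	static __inline void
	invltlb_globals_sketch(void)
	{
		u_int cr4;

		cr4 = rcr4();
		load_cr4(cr4 & ~CR4_PGE);	/* dropping PGE flushes global entries too */
		load_cr4(cr4);			/* restore PGE */
	}
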
static __inline u_int
rfs(void)
{
@@ -587,6 +602,8 @@ intr_restore(register_t eflags)
int breakpoint(void);
u_int bsfl(u_int mask);
u_int bsrl(u_int mask);
+void cpu_invlpg(u_int addr);
+void cpu_invlpg_range(u_int start, u_int end);
void disable_intr(void);
void do_cpuid(u_int ax, u_int *p);
void enable_intr(void);
@@ -597,8 +614,14 @@ void insl(u_int port, void *addr, size_t cnt);
void insw(u_int port, void *addr, size_t cnt);
void invd(void);
void invlpg(u_int addr);
+void invlpg_range(u_int start, u_int end);
void invltlb(void);
u_short inw(u_int port);
+void load_cr0(u_int cr0);
+void load_cr3(u_int cr3);
+void load_cr4(u_int cr4);
+void load_fs(u_int sel);
+void load_gs(u_int sel);
void outb(u_int port, u_char data);
void outl(u_int port, u_int data);
void outsb(u_int port, void *addr, size_t cnt);
@@ -606,7 +629,12 @@ void outsl(u_int port, void *addr, size_t cnt);
void outsw(u_int port, void *addr, size_t cnt);
void outw(u_int port, u_short data);
void ia32_pause(void);
+u_int rcr0(void);
u_int rcr2(void);
+u_int rcr3(void);
+u_int rcr4(void);
+u_int rfs(void);
+u_int rgs(void);
u_int64_t rdmsr(u_int msr);
u_int64_t rdpmc(u_int pmc);
u_int64_t rdtsc(void);
@@ -614,10 +642,6 @@ u_int read_eflags(void);
void wbinvd(void);
void write_eflags(u_int ef);
void wrmsr(u_int msr, u_int64_t newval);
-u_int rfs(void);
-u_int rgs(void);
-void load_fs(u_int sel);
-void load_gs(u_int sel);
u_int rdr0(void);
void load_dr0(u_int dr0);
u_int rdr1(void);
@@ -639,13 +663,7 @@ void intr_restore(register_t ef);
#endif /* __GNUC__ */
-void load_cr0(u_int cr0);
-void load_cr3(u_int cr3);
-void load_cr4(u_int cr4);
void ltr(u_short sel);
-u_int rcr0(void);
-u_int rcr3(void);
-u_int rcr4(void);
void reset_dbregs(void);
__END_DECLS
diff --git a/sys/i386/include/mptable.h b/sys/i386/include/mptable.h
index 63fec0e9e9c9..29e9c6eb56fb 100644
--- a/sys/i386/include/mptable.h
+++ b/sys/i386/include/mptable.h
@@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt;
struct pcb stoppcbs[MAXCPU];
+#ifdef APIC_IO
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1;
+vm_offset_t smp_tlb_addr2;
+volatile int smp_tlb_wait;
+static struct mtx smp_tlb_mtx;
+#endif
+
/*
* Local data and functions.
*/
@@ -336,6 +344,9 @@ init_locks(void)
#ifdef USE_COMLOCK
mtx_init(&com_mtx, "com", NULL, MTX_SPIN);
#endif /* USE_COMLOCK */
+#ifdef APIC_IO
+ mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN);
+#endif
}
/*
@@ -605,6 +616,10 @@ mp_enable(u_int boot_addr)
/* install an inter-CPU IPI for TLB invalidation */
setidt(XINVLTLB_OFFSET, Xinvltlb,
SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLPG_OFFSET, Xinvlpg,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(XINVLRNG_OFFSET, Xinvlrng,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
/* install an inter-CPU IPI for forwarding hardclock() */
setidt(XHARDCLOCK_OFFSET, Xhardclock,
@@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr)
return 0; /* return FAILURE */
}
-#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS)
-u_int xhits[MAXCPU];
-SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits),
- "IU", "");
+#if defined(APIC_IO)
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+ sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+ sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+ sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+ 0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+ &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+ &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+ &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+ &ipi_masked_range_size, 0, "");
#endif
/*
* Flush the TLB on all other CPU's
+ */
+static void
+smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ u_int ncpu;
+ register_t eflags;
+
+ ncpu = mp_ncpus - 1; /* does not shootdown self */
+ if (ncpu < 1)
+ return; /* no other cpus */
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ ipi_all_but_self(vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+
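
Each CPU that receives the IPI completes this handshake in its Xinvltlb/Xinvlpg/Xinvlrng handler (apic_vector.s, further down in this diff): do the invalidation, acknowledge the local APIC, then bump smp_tlb_wait so the initiator's spin loop can exit. Roughly, in C, assuming the lapic register struct and atomic_add_int() available in this tree (a sketch, not the committed handler):

	static void
	invlpg_ipi_receiver_sketch(void)
	{

		invlpg(smp_tlb_addr1);		/* flush the page the initiator published */
		lapic.eoi = 0;			/* ack, as the asm's "movl $0, lapic+LA_EOI" */
		/* the asm uses "lock; incl smp_tlb_wait" */
		atomic_add_int((volatile u_int *)&smp_tlb_wait, 1);
	}
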
+/*
+ * This is about as magic as it gets. fortune(1) has got similar code
+ * for reversing bits in a word. Who thinks up this stuff??
+ *
+ * Yes, it does appear to be consistently faster than:
+ * while (i = ffs(m)) {
+ * m >>= i;
+ * bits++;
+ * }
+ * and
+ * while (lsb = (m & -m)) { // This is magic too
+ * m &= ~lsb; // or: m ^= lsb
+ * bits++;
+ * }
+ * Both of these latter forms do some very strange things on gcc-3.1 with
+ * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
+ * There is probably an SSE or MMX popcnt instruction.
*
- * XXX: Needs to handshake and wait for completion before proceding.
+ * I wonder if this should be in libkern?
+ *
+ * XXX Stop the presses! Another one:
+ * static __inline u_int32_t
+ * popcnt1(u_int32_t v)
+ * {
+ * v -= ((v >> 1) & 0x55555555);
+ * v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+ * v = (v + (v >> 4)) & 0x0F0F0F0F;
+ * return (v * 0x01010101) >> 24;
+ * }
+ * The downside is that it has a multiply. With a pentium3 with
+ * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
+ * an imull, and in that case it is faster. In most other cases
+ * it appears slightly slower.
*/
+static __inline u_int32_t
+popcnt(u_int32_t m)
+{
+
+ m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1);
+ m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2);
+ m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4);
+ m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8);
+ m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16);
+ return m;
+}
+
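
As a quick cross-check of the bit-twiddling above (a standalone userland sketch, not part of the commit): for m = 0xf0 the pairwise sums collapse 0xf0 -> 0xa0 -> 0x40 -> 0x04, i.e. 4 bits set, which matches a naive count of a 4-cpu shootdown mask.

	#include <sys/types.h>
	#include <stdio.h>

	static __inline u_int32_t
	popcnt(u_int32_t m)
	{

		m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1);
		m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2);
		m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4);
		m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8);
		m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16);
		return m;
	}

	int
	main(void)
	{
		u_int32_t mask = 0xf0;	/* e.g. a shootdown mask for cpus 4-7 */
		u_int32_t m, naive;

		for (naive = 0, m = mask; m != 0; m &= m - 1)
			naive++;
		printf("popcnt(0x%x) = %u, naive count = %u\n",
		    mask, popcnt(mask), naive);
		return (0);
	}
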
+static void
+smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ int ncpu, othercpus;
+ register_t eflags;
+
+ othercpus = mp_ncpus - 1;
+ if (mask == (u_int)-1) {
+ ncpu = othercpus;
+ if (ncpu < 1)
+ return;
+ } else {
+ /* XXX there should be a pcpu self mask */
+ mask &= ~(1 << PCPU_GET(cpuid));
+ if (mask == 0)
+ return;
+ ncpu = popcnt(mask);
+ if (ncpu > othercpus) {
+ /* XXX this should be a panic offence */
+ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
+ ncpu, othercpus);
+ ncpu = othercpus;
+ }
+ /* XXX should be a panic, implied by mask == 0 above */
+ if (ncpu < 1)
+ return;
+ }
+ eflags = read_eflags();
+ if ((eflags & PSL_I) == 0)
+ panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled");
+ mtx_lock_spin(&smp_tlb_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ if (mask == (u_int)-1)
+ ipi_all_but_self(vector);
+ else
+ ipi_selected(mask, vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_tlb_mtx);
+}
+#endif
+
void
smp_invltlb(void)
{
#if defined(APIC_IO)
- if (smp_started)
- ipi_all_but_self(IPI_INVLTLB);
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_global++;
+#endif
+ }
#endif /* APIC_IO */
}
void
-invlpg(u_int addr)
+smp_invlpg(vm_offset_t addr)
{
- __asm __volatile("invlpg (%0)"::"r"(addr):"memory");
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_range++;
+ ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
void
-invltlb(void)
+smp_masked_invltlb(u_int mask)
{
- u_long temp;
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_global++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /*
- * This should be implemented as load_cr3(rcr3()) when load_cr3() is
- * inlined.
- */
- __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
+void
+smp_masked_invlpg(u_int mask, vm_offset_t addr)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_page++;
+#endif
+ }
+#endif /* APIC_IO */
+}
- /* send a message to the other CPUs */
- smp_invltlb();
+void
+smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_range++;
+ ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+#endif /* APIC_IO */
}
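
The masked variants exist so the pmap layer can skip CPUs that have never activated a given pmap. A minimal sketch of that pattern, assuming the pm_active bitmask and the cpumask/other_cpus per-cpu fields referenced elsewhere in this change (the pmap_invalidate_page() added to pmap.c in this commit is the authoritative version):

	static void
	pmap_invalidate_page_sketch(pmap_t pmap, vm_offset_t va)
	{
		u_int others;

		if (pmap->pm_active & PCPU_GET(cpumask))
			invlpg(va);			/* flush our own TLB entry */
		others = pmap->pm_active & PCPU_GET(other_cpus);
		if (others != 0)
			smp_masked_invlpg(others, va);	/* IPI only cpus using this pmap */
	}
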
@@ -2251,7 +2455,7 @@ ap_init(void)
/* spin */ ;
/* BSP may have changed PTD while we were waiting */
- cpu_invltlb();
+ invltlb();
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
lidt(&r_idt);
@@ -2290,6 +2494,9 @@ ap_init(void)
/* Build our map of 'other' CPUs. */
PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
+ if (bootverbose)
+ apic_dump("ap_init()");
+
printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
if (smp_cpus == mp_ncpus) {
@@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame)
{
mtx_lock_spin(&sched_lock);
- statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame));
+ statclock_process(curthread->td_kse, TRAPF_PC(&frame),
+ TRAPF_USERMODE(&frame));
mtx_unlock_spin(&sched_lock);
}
diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h
index e6ac669aeaa8..e0789fc68f43 100644
--- a/sys/i386/include/pmap.h
+++ b/sys/i386/include/pmap.h
@@ -151,7 +151,7 @@ extern pt_entry_t PTmap[], APTmap[];
extern pd_entry_t PTD[], APTD[];
extern pd_entry_t PTDpde, APTDpde;
-extern pd_entry_t IdlePTD; /* physical address of "Idle" state directory */
+extern pd_entry_t *IdlePTD; /* physical address of "Idle" state directory */
#endif
#ifdef _KERNEL
@@ -253,14 +253,15 @@ extern char *ptvmmap; /* poor name! */
extern vm_offset_t virtual_avail;
extern vm_offset_t virtual_end;
-void pmap_bootstrap( vm_offset_t, vm_offset_t);
+void pmap_bootstrap(vm_offset_t, vm_offset_t);
void *pmap_mapdev(vm_offset_t, vm_size_t);
void pmap_unmapdev(vm_offset_t, vm_size_t);
pt_entry_t *pmap_pte(pmap_t, vm_offset_t) __pure2;
vm_page_t pmap_use_pt(pmap_t, vm_offset_t);
-#ifdef SMP
void pmap_set_opt(void);
-#endif
+void pmap_invalidate_page(pmap_t, vm_offset_t);
+void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t);
+void pmap_invalidate_all(pmap_t);
#endif /* _KERNEL */
diff --git a/sys/i386/include/smp.h b/sys/i386/include/smp.h
index 872c5eca1615..d669c51056b7 100644
--- a/sys/i386/include/smp.h
+++ b/sys/i386/include/smp.h
@@ -51,6 +51,8 @@ extern int current_postcode; /** XXX currently in mp_machdep.c */
* Interprocessor interrupts for SMP.
*/
#define IPI_INVLTLB XINVLTLB_OFFSET
+#define IPI_INVLPG XINVLPG_OFFSET
+#define IPI_INVLRNG XINVLRNG_OFFSET
#define IPI_RENDEZVOUS XRENDEZVOUS_OFFSET
#define IPI_AST XCPUAST_OFFSET
#define IPI_STOP XCPUSTOP_OFFSET
@@ -107,7 +109,6 @@ void assign_apic_irq(int apic, int intpin, int irq);
void revoke_apic_irq(int irq);
void bsp_apic_configure(void);
void init_secondary(void);
-void smp_invltlb(void);
void forward_statclock(void);
void forwarded_statclock(struct trapframe frame);
void forward_hardclock(void);
@@ -119,6 +120,13 @@ void ipi_self(u_int ipi);
#ifdef APIC_INTR_REORDER
void set_lapic_isrloc(int, int);
#endif /* APIC_INTR_REORDER */
+void smp_invlpg(vm_offset_t addr);
+void smp_masked_invlpg(u_int mask, vm_offset_t addr);
+void smp_invlpg_range(vm_offset_t startva, vm_offset_t endva);
+void smp_masked_invlpg_range(u_int mask, vm_offset_t startva,
+ vm_offset_t endva);
+void smp_invltlb(void);
+void smp_masked_invltlb(u_int mask);
/* global data in mpapic.c */
extern volatile lapic_t lapic;
diff --git a/sys/i386/isa/apic_vector.s b/sys/i386/isa/apic_vector.s
index 8490b1b14ca5..569ed501a468 100644
--- a/sys/i386/isa/apic_vector.s
+++ b/sys/i386/isa/apic_vector.s
@@ -260,30 +260,107 @@ Xspuriousint:
iret
/*
- * Handle TLB shootdowns.
+ * Global address space TLB shootdown.
*/
.text
SUPERALIGN_TEXT
.globl Xinvltlb
Xinvltlb:
pushl %eax
+ pushl %ds
+ movl $KDSEL, %eax /* Kernel data selector */
+ mov %ax, %ds
#ifdef COUNT_XINVLTLB_HITS
pushl %fs
- movl $KPSEL, %eax
+ movl $KPSEL, %eax /* Private space selector */
mov %ax, %fs
movl PCPU(CPUID), %eax
popl %fs
- ss
- incl xhits(,%eax,4)
+ incl xhits_gbl(,%eax,4)
#endif /* COUNT_XINVLTLB_HITS */
movl %cr3, %eax /* invalidate the TLB */
movl %eax, %cr3
- ss /* stack segment, avoid %ds load */
movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */
+ lock
+ incl smp_tlb_wait
+
+ popl %ds
+ popl %eax
+ iret
+
+/*
+ * Single page TLB shootdown
+ */
+ .text
+ SUPERALIGN_TEXT
+ .globl Xinvlpg
+Xinvlpg:
+ pushl %eax
+ pushl %ds
+ movl $KDSEL, %eax /* Kernel data selector */
+ mov %ax, %ds
+
+#ifdef COUNT_XINVLTLB_HITS
+ pushl %fs
+ movl $KPSEL, %eax /* Private space selector */
+ mov %ax, %fs
+ movl PCPU(CPUID), %eax
+ popl %fs
+ incl xhits_pg(,%eax,4)
+#endif /* COUNT_XINVLTLB_HITS */
+
+ movl smp_tlb_addr1, %eax
+ invlpg (%eax) /* invalidate single page */
+
+ movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */
+
+ lock
+ incl smp_tlb_wait
+
+ popl %ds
+ popl %eax
+ iret
+
+/*
+ * Page range TLB shootdown.
+ */
+ .text
+ SUPERALIGN_TEXT
+ .globl Xinvlrng
+Xinvlrng:
+ pushl %eax
+ pushl %edx
+ pushl %ds
+ movl $KDSEL, %eax /* Kernel data selector */
+ mov %ax, %ds
+
+#ifdef COUNT_XINVLTLB_HITS
+ pushl %fs
+ movl $KPSEL, %eax /* Private space selector */
+ mov %ax, %fs
+ movl PCPU(CPUID), %eax
+ popl %fs
+ incl xhits_rng(,%eax,4)
+#endif /* COUNT_XINVLTLB_HITS */
+
+ movl smp_tlb_addr1, %edx
+ movl smp_tlb_addr2, %eax
+1: invlpg (%edx) /* invalidate single page */
+ addl $PAGE_SIZE, %edx
+ cmpl %eax, %edx /* more pages below smp_tlb_addr2? */
+ jb 1b
+
+ movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */
+
+ lock
+ incl smp_tlb_wait
+
+ popl %ds
+ popl %edx
popl %eax
iret
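
The invlpg loop in Xinvlrng walks the half-open interval [smp_tlb_addr1, smp_tlb_addr2), where addr2 is the first address past the range. Its C equivalent (a sketch, not generated code):

	static void
	invlrng_loop_sketch(void)
	{
		vm_offset_t va;

		for (va = smp_tlb_addr1; va < smp_tlb_addr2; va += PAGE_SIZE)
			invlpg(va);	/* one page at a time, PG_G included */
	}
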
diff --git a/sys/i386/isa/intr_machdep.h b/sys/i386/isa/intr_machdep.h
index 41542d0bb34c..7179268ba6a4 100644
--- a/sys/i386/isa/intr_machdep.h
+++ b/sys/i386/isa/intr_machdep.h
@@ -88,6 +88,7 @@
/* IDT vector base for regular (aka. slow) and fast interrupts */
#define TPR_SLOW_INTS 0x20
#define TPR_FAST_INTS 0x60
+/* XXX note that the AST interrupt is at 0x50 */
/* blocking values for local APIC Task Priority Register */
#define TPR_BLOCK_HWI 0x4f /* hardware INTs */
@@ -104,20 +105,23 @@
#endif /** TEST_TEST1 */
/* TLB shootdowns */
-#define XINVLTLB_OFFSET (ICU_OFFSET + 112)
+#define XINVLTLB_OFFSET (ICU_OFFSET + 112) /* 0x90 */
+#define XINVLPG_OFFSET (ICU_OFFSET + 113) /* 0x91 */
+#define XINVLRNG_OFFSET (ICU_OFFSET + 114) /* 0x92 */
/* inter-cpu clock handling */
-#define XHARDCLOCK_OFFSET (ICU_OFFSET + 113)
-#define XSTATCLOCK_OFFSET (ICU_OFFSET + 114)
+#define XHARDCLOCK_OFFSET (ICU_OFFSET + 120) /* 0x98 */
+#define XSTATCLOCK_OFFSET (ICU_OFFSET + 121) /* 0x99 */
/* inter-CPU rendezvous */
-#define XRENDEZVOUS_OFFSET (ICU_OFFSET + 115)
+#define XRENDEZVOUS_OFFSET (ICU_OFFSET + 122) /* 0x9A */
/* IPI to generate an additional software trap at the target CPU */
-#define XCPUAST_OFFSET (ICU_OFFSET + 48)
+/* XXX in the middle of the interrupt range, overlapping IRQ48 */
+#define XCPUAST_OFFSET (ICU_OFFSET + 48) /* 0x50 */
/* IPI to signal CPUs to stop and wait for another CPU to restart them */
-#define XCPUSTOP_OFFSET (ICU_OFFSET + 128)
+#define XCPUSTOP_OFFSET (ICU_OFFSET + 128) /* 0xA0 */
/*
* Note: this vector MUST be xxxx1111, 32 + 223 = 255 = 0xff:
@@ -194,7 +198,9 @@ inthand_t
IDTVEC(intr28), IDTVEC(intr29), IDTVEC(intr30), IDTVEC(intr31);
inthand_t
- Xinvltlb, /* TLB shootdowns */
+ Xinvltlb, /* TLB shootdowns - global */
+ Xinvlpg, /* TLB shootdowns - 1 page */
+ Xinvlrng, /* TLB shootdowns - page range */
Xhardclock, /* Forward hardclock() */
Xstatclock, /* Forward statclock() */
Xcpuast, /* Additional software trap on other cpu */
diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c
index 73934cb2e3a7..444b087a2880 100644
--- a/sys/kern/subr_witness.c
+++ b/sys/kern/subr_witness.c
@@ -223,6 +223,9 @@ static struct witness_order_list_entry order_lists[] = {
{ "icu", &lock_class_mtx_spin },
#ifdef SMP
{ "smp rendezvous", &lock_class_mtx_spin },
+#if defined(__i386__) && defined(APIC_IO)
+ { "tlb", &lock_class_mtx_spin },
+#endif
#endif
{ "clk", &lock_class_mtx_spin },
{ "mutex profiling lock", &lock_class_mtx_spin },