diff options
author | Konstantin Belousov <kib@FreeBSD.org> | 2016-10-04 17:01:24 +0000 |
---|---|---|
committer | Konstantin Belousov <kib@FreeBSD.org> | 2016-10-04 17:01:24 +0000 |
commit | 83c001d3c29d98363d10394e51913a7fc118def3 (patch) | |
tree | 9ea73ab5a922a17426c5b7c848e98b430874b119 /sys/x86 | |
parent | 37d0ac15e64daaac7e908b85d3852444782645d7 (diff) | |
download | src-83c001d3c29d98363d10394e51913a7fc118def3.tar.gz src-83c001d3c29d98363d10394e51913a7fc118def3.zip |
Re-apply r306516 (by cem):
Reduce the cost of TLB invalidation on x86 by using per-CPU completion flags
Reduce contention during TLB invalidation operations by using a per-CPU
completion flag, rather than a single atomically-updated variable.
On a Westmere system (2 sockets x 4 cores x 1 threads), dtrace measurements
show that smp_tlb_shootdown is about 50% faster with this patch; observations
with VTune show that the percentage of time spent in invlrng_single_page on an
interrupt (actually doing invalidation, rather than synchronization) increases
from 31% with the old mechanism to 71% with the new one. (Running a basic file
server workload.)
Submitted by: Anton Rang <rang at acm.org>
Reviewed by: cem (earlier version)
Sponsored by: Dell EMC Isilon
Differential Revision: https://reviews.freebsd.org/D8041
Notes
Notes:
svn path=/head/; revision=306680
Diffstat (limited to 'sys/x86')
-rw-r--r-- | sys/x86/include/x86_smp.h | 2 | ||||
-rw-r--r-- | sys/x86/x86/mp_x86.c | 63 |
2 files changed, 47 insertions, 18 deletions
diff --git a/sys/x86/include/x86_smp.h b/sys/x86/include/x86_smp.h index 38d762546e12..7c906dd6ad08 100644 --- a/sys/x86/include/x86_smp.h +++ b/sys/x86/include/x86_smp.h @@ -35,7 +35,7 @@ extern volatile int aps_ready; extern struct mtx ap_boot_mtx; extern int cpu_logical; extern int cpu_cores; -extern volatile int smp_tlb_wait; +extern volatile uint32_t smp_tlb_generation; extern struct pmap *smp_tlb_pmap; extern u_int xhits_gbl[]; extern u_int xhits_pg[]; diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c index 91c119adf0b5..19071f60061b 100644 --- a/sys/x86/x86/mp_x86.c +++ b/sys/x86/x86/mp_x86.c @@ -1304,12 +1304,22 @@ cpususpend_handler(void) void invlcache_handler(void) { + uint32_t generation; + #ifdef COUNT_IPIS (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ + /* + * Reading the generation here allows greater parallelism + * since wbinvd is a serializing instruction. Without the + * temporary, we'd wait for wbinvd to complete, then the read + * would execute, then the dependent write, whuch must then + * complete before return from interrupt. + */ + generation = smp_tlb_generation; wbinvd(); - atomic_add_int(&smp_tlb_wait, 1); + PCPU_SET(smp_tlb_done, generation); } /* @@ -1367,7 +1377,7 @@ SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); /* Variables needed for SMP tlb shootdown. */ static vm_offset_t smp_tlb_addr1, smp_tlb_addr2; pmap_t smp_tlb_pmap; -volatile int smp_tlb_wait; +volatile uint32_t smp_tlb_generation; #ifdef __amd64__ #define read_eflags() read_rflags() @@ -1377,15 +1387,16 @@ static void smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2) { - int cpu, ncpu, othercpus; - - othercpus = mp_ncpus - 1; /* does not shootdown self */ + cpuset_t other_cpus; + volatile uint32_t *p_cpudone; + uint32_t generation; + int cpu; /* * Check for other cpus. Return if none. */ if (CPU_ISFULLSET(&mask)) { - if (othercpus < 1) + if (mp_ncpus <= 1) return; } else { CPU_CLR(PCPU_GET(cpuid), &mask); @@ -1399,23 +1410,28 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; smp_tlb_pmap = pmap; - smp_tlb_wait = 0; + generation = ++smp_tlb_generation; if (CPU_ISFULLSET(&mask)) { - ncpu = othercpus; ipi_all_but_self(vector); + other_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &other_cpus); } else { - ncpu = 0; + other_cpus = mask; while ((cpu = CPU_FFS(&mask)) != 0) { cpu--; CPU_CLR(cpu, &mask); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, vector); ipi_send_cpu(cpu, vector); - ncpu++; } } - while (smp_tlb_wait < ncpu) - ia32_pause(); + while ((cpu = CPU_FFS(&other_cpus)) != 0) { + cpu--; + CPU_CLR(cpu, &other_cpus); + p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done; + while (*p_cpudone != generation) + ia32_pause(); + } mtx_unlock_spin(&smp_ipi_mtx); } @@ -1473,6 +1489,8 @@ smp_cache_flush(void) void invltlb_handler(void) { + uint32_t generation; + #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ @@ -1480,16 +1498,23 @@ invltlb_handler(void) (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ + /* + * Reading the generation here allows greater parallelism + * since invalidating the TLB is a serializing operation. + */ + generation = smp_tlb_generation; if (smp_tlb_pmap == kernel_pmap) invltlb_glob(); else invltlb(); - atomic_add_int(&smp_tlb_wait, 1); + PCPU_SET(smp_tlb_done, generation); } void invlpg_handler(void) { + uint32_t generation; + #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ @@ -1497,14 +1522,16 @@ invlpg_handler(void) (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ + generation = smp_tlb_generation; /* Overlap with serialization */ invlpg(smp_tlb_addr1); - atomic_add_int(&smp_tlb_wait, 1); + PCPU_SET(smp_tlb_done, generation); } void invlrng_handler(void) { - vm_offset_t addr; + vm_offset_t addr, addr2; + uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_rng[PCPU_GET(cpuid)]++; @@ -1514,10 +1541,12 @@ invlrng_handler(void) #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; + addr2 = smp_tlb_addr2; + generation = smp_tlb_generation; /* Overlap with serialization */ do { invlpg(addr); addr += PAGE_SIZE; - } while (addr < smp_tlb_addr2); + } while (addr < addr2); - atomic_add_int(&smp_tlb_wait, 1); + PCPU_SET(smp_tlb_done, generation); } |