author:    Jeff Roberson <jeff@FreeBSD.org>  2018-01-12 22:48:23 +0000
committer: Jeff Roberson <jeff@FreeBSD.org>  2018-01-12 22:48:23 +0000
commit:    3f289c3fcf39b200550e2702068014cdd801d4da (patch)
tree:      9105f2f717d4e6d5f5dc06d92b4b158e3c831d9e /sys
parent:    fe8be58826d91f5b80b50bac161b727bd4ea9846 (diff)
Implement 'domainset', a cpuset-based NUMA policy mechanism.  This allows
userspace to control NUMA policy administratively and programmatically.

Implement domainset-based iterators in the page layer.

Remove the now-legacy numa_* syscalls.

Clean up some header pollution created by having seq.h in proc.h.

Reviewed by:            markj, kib
Discussed with:         alc
Tested by:              pho
Sponsored by:           Netflix, Dell/EMC Isilon
Differential Revision:  https://reviews.freebsd.org/D13403
Notes:
    svn path=/head/; revision=327895
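
For readers coming from the removed interface: numa_getaffinity()/numa_setaffinity()
are replaced by cpuset_getdomain()/cpuset_setdomain(), which follow the calling
convention of cpuset_getaffinity(2)/cpuset_setaffinity(2) but operate on a
domainset_t plus an allocation policy.  A minimal userland sketch of the new
calls, assuming the wrappers and constants introduced by this change
(<sys/domainset.h>, DOMAINSET_POLICY_PREFER, and the DOMAINSET_* bit macros):

    #include <sys/param.h>
    #include <sys/cpuset.h>
    #include <sys/domainset.h>

    #include <err.h>
    #include <stdio.h>

    int
    main(void)
    {
            domainset_t mask;
            int policy;

            /* Prefer NUMA domain 0 for this process; with PREFER the
             * kernel keeps the existing mask as the fallback set. */
            DOMAINSET_ZERO(&mask);
            DOMAINSET_SET(0, &mask);
            if (cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
                sizeof(mask), &mask, DOMAINSET_POLICY_PREFER) != 0)
                    err(1, "cpuset_setdomain");

            /* Read it back; PREFER is reported as a one-domain mask. */
            if (cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
                sizeof(mask), &mask, &policy) != 0)
                    err(1, "cpuset_getdomain");
            printf("policy %d\n", policy);
            return (0);
    }

Note how kern_cpuset_setdomain() in the diff below fills the mask for PREFER
policies and kern_cpuset_getdomain() translates them back to a single-domain
mask, so a round trip behaves as shown.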
Diffstat (limited to 'sys')
-rw-r--r--  sys/arm/arm/machdep_ptrace.c           |    1
-rw-r--r--  sys/compat/freebsd32/freebsd32_misc.c  |   18
-rw-r--r--  sys/compat/freebsd32/syscalls.master   |   17
-rw-r--r--  sys/conf/files                         |    3
-rw-r--r--  sys/ddb/db_run.c                       |    1
-rw-r--r--  sys/kern/init_main.c                   |    6
-rw-r--r--  sys/kern/init_sysent.c                 |    6
-rw-r--r--  sys/kern/kern_cpuset.c                 | 1139
-rw-r--r--  sys/kern/kern_exit.c                   |    5
-rw-r--r--  sys/kern/kern_fork.c                   |    9
-rw-r--r--  sys/kern/kern_numa.c                   |  169
-rw-r--r--  sys/kern/kern_thr.c                    |    8
-rw-r--r--  sys/kern/kern_thread.c                 |   19
-rw-r--r--  sys/kern/makesyscalls.sh               |    1
-rw-r--r--  sys/kern/sched_4bsd.c                  |    1
-rw-r--r--  sys/kern/sched_ule.c                   |    1
-rw-r--r--  sys/kern/subr_kdb.c                    |    1
-rw-r--r--  sys/kern/syscalls.master               |   16
-rw-r--r--  sys/netpfil/ipfw/dn_sched_fq_codel.c   |    1
-rw-r--r--  sys/sys/_vm_domain.h                   |   63
-rw-r--r--  sys/sys/cpuset.h                       |    1
-rw-r--r--  sys/sys/proc.h                         |   14
-rw-r--r--  sys/sys/syscallsubr.h                  |    7
-rw-r--r--  sys/vm/vm_domain.c                     |  514
-rw-r--r--  sys/vm/vm_domain.h                     |   71
-rw-r--r--  sys/vm/vm_fault.c                      |    1
-rw-r--r--  sys/vm/vm_object.c                     |    3
-rw-r--r--  sys/vm/vm_object.h                     |    2
-rw-r--r--  sys/vm/vm_page.c                       |   85
-rw-r--r--  sys/vm/vm_page.h                       |    3
-rw-r--r--  sys/vm/vm_phys.c                       |    6
-rw-r--r--  sys/vm/vm_phys.h                       |    4
-rw-r--r--  sys/x86/acpica/srat.c                  |    3
33 files changed, 1130 insertions, 1069 deletions
diff --git a/sys/arm/arm/machdep_ptrace.c b/sys/arm/arm/machdep_ptrace.c
index 5bd07d28eca7..2a64f89441cc 100644
--- a/sys/arm/arm/machdep_ptrace.c
+++ b/sys/arm/arm/machdep_ptrace.c
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
+#include <sys/lock.h>
#include <sys/mutex.h>
#include <machine/machdep.h>
diff --git a/sys/compat/freebsd32/freebsd32_misc.c b/sys/compat/freebsd32/freebsd32_misc.c
index 4eaa06e8ec71..4849b80f421f 100644
--- a/sys/compat/freebsd32/freebsd32_misc.c
+++ b/sys/compat/freebsd32/freebsd32_misc.c
@@ -3017,6 +3017,24 @@ freebsd32_cpuset_setaffinity(struct thread *td,
}
int
+freebsd32_cpuset_getdomain(struct thread *td,
+ struct freebsd32_cpuset_getdomain_args *uap)
+{
+
+ return (kern_cpuset_getdomain(td, uap->level, uap->which,
+ PAIR32TO64(id_t,uap->id), uap->domainsetsize, uap->mask, uap->policy));
+}
+
+int
+freebsd32_cpuset_setdomain(struct thread *td,
+ struct freebsd32_cpuset_setdomain_args *uap)
+{
+
+ return (kern_cpuset_setdomain(td, uap->level, uap->which,
+ PAIR32TO64(id_t,uap->id), uap->domainsetsize, uap->mask, uap->policy));
+}
+
+int
freebsd32_nmount(struct thread *td,
struct freebsd32_nmount_args /* {
struct iovec *iovp;
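
An aside on the two compat shims above: the freebsd32 ABI passes the 64-bit
id_t as a pair of 32-bit words (id1, id2), and PAIR32TO64() stitches them back
together.  Below is a stand-alone model of that reassembly, written here for a
little-endian layout; it mirrors, but is not copied from, the kernel's actual
macro in sys/compat/freebsd32/freebsd32.h, which also handles big-endian:

    #include <inttypes.h>
    #include <stdio.h>

    /* Illustrative only: the low word travels in the first argument
     * slot, the high word in the second. */
    #define PAIR32TO64(type, name)  ((name##1) | ((type)(name##2) << 32))

    int
    main(void)
    {
            uint32_t id1 = 0x89abcdefU;     /* low 32 bits */
            uint32_t id2 = 0x01234567U;     /* high 32 bits */
            int64_t id = PAIR32TO64(int64_t, id);

            /* Prints id = 0x0123456789abcdef. */
            printf("id = 0x%016" PRIx64 "\n", (uint64_t)id);
            return (0);
    }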
diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master
index 0c94148ad461..707d80824727 100644
--- a/sys/compat/freebsd32/syscalls.master
+++ b/sys/compat/freebsd32/syscalls.master
@@ -1086,12 +1086,8 @@
547 AUE_FUTIMESAT STD { int freebsd32_utimensat(int fd, \
char *path, \
struct timespec *times, int flag); }
-548 AUE_NULL NOPROTO { int numa_getaffinity(cpuwhich_t which, \
- id_t id, \
- struct vm_domain_policy *policy); }
-549 AUE_NULL NOPROTO { int numa_setaffinity(cpuwhich_t which, \
- id_t id, \
- const struct vm_domain_policy *policy); }
+548 AUE_NULL UNIMPL numa_getaffinity
+549 AUE_NULL UNIMPL numa_setaffinity
550 AUE_FSYNC NOPROTO { int fdatasync(int fd); }
551 AUE_FSTAT STD { int freebsd32_fstat(int fd, \
struct stat32 *ub); }
@@ -1119,4 +1115,13 @@
struct kevent32 *eventlist, \
int nevents, \
const struct timespec32 *timeout); }
+561 AUE_NULL STD { int freebsd32_cpuset_getdomain(cpulevel_t level, \
+ cpuwhich_t which, uint32_t id1, uint32_t id2, \
+ size_t domainsetsize, domainset_t *mask, \
+ int *policy); }
+562 AUE_NULL STD { int freebsd32_cpuset_setdomain(cpulevel_t level, \
+ cpuwhich_t which, uint32_t id1, uint32_t id2, \
+ size_t domainsetsize, domainset_t *mask, \
+ int policy); }
+
; vim: syntax=off
diff --git a/sys/conf/files b/sys/conf/files
index b723aa52f691..da73f12201ff 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3787,7 +3787,6 @@ kern/kern_module.c standard
kern/kern_mtxpool.c standard
kern/kern_mutex.c standard
kern/kern_ntptime.c standard
-kern/kern_numa.c standard
kern/kern_osd.c standard
kern/kern_physio.c standard
kern/kern_pmc.c standard
@@ -4837,7 +4836,7 @@ vm/swap_pager.c standard
vm/uma_core.c standard
vm/uma_dbg.c standard
vm/memguard.c optional DEBUG_MEMGUARD
-vm/vm_domain.c standard
+vm/vm_domainset.c standard
vm/vm_fault.c standard
vm/vm_glue.c standard
vm/vm_init.c standard
diff --git a/sys/ddb/db_run.c b/sys/ddb/db_run.c
index a55fcea9e632..bf38a4d2f726 100644
--- a/sys/ddb/db_run.c
+++ b/sys/ddb/db_run.c
@@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kdb.h>
#include <sys/proc.h>
+#include <sys/systm.h>
#include <machine/kdb.h>
#include <machine/pcb.h>
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 261006199120..397bc99452eb 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -89,7 +89,6 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
-#include <vm/vm_domain.h>
#include <sys/copyright.h>
#include <ddb/ddb.h>
@@ -497,10 +496,7 @@ proc0_init(void *dummy __unused)
td->td_flags = TDF_INMEM;
td->td_pflags = TDP_KTHREAD;
td->td_cpuset = cpuset_thread0();
- vm_domain_policy_init(&td->td_vm_dom_policy);
- vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1);
- vm_domain_policy_init(&p->p_vm_dom_policy);
- vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1);
+ td->td_domain.dr_policy = td->td_cpuset->cs_domain;
prison0_init();
p->p_peers = 0;
p->p_leader = p;
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index 58eedb2a058a..700feabaa1f6 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -599,8 +599,8 @@ struct sysent sysent[] = {
{ AS(ppoll_args), (sy_call_t *)sys_ppoll, AUE_POLL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 545 = ppoll */
{ AS(futimens_args), (sy_call_t *)sys_futimens, AUE_FUTIMES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 546 = futimens */
{ AS(utimensat_args), (sy_call_t *)sys_utimensat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 547 = utimensat */
- { AS(numa_getaffinity_args), (sy_call_t *)sys_numa_getaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 548 = numa_getaffinity */
- { AS(numa_setaffinity_args), (sy_call_t *)sys_numa_setaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 549 = numa_setaffinity */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 548 = numa_getaffinity */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 549 = numa_setaffinity */
{ AS(fdatasync_args), (sy_call_t *)sys_fdatasync, AUE_FSYNC, NULL, 0, 0, 0, SY_THR_STATIC }, /* 550 = fdatasync */
{ AS(fstat_args), (sy_call_t *)sys_fstat, AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 551 = fstat */
{ AS(fstatat_args), (sy_call_t *)sys_fstatat, AUE_FSTATAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 552 = fstatat */
@@ -612,4 +612,6 @@ struct sysent sysent[] = {
{ AS(fhstatfs_args), (sy_call_t *)sys_fhstatfs, AUE_FHSTATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 558 = fhstatfs */
{ AS(mknodat_args), (sy_call_t *)sys_mknodat, AUE_MKNODAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 559 = mknodat */
{ AS(kevent_args), (sy_call_t *)sys_kevent, AUE_KEVENT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 560 = kevent */
+ { AS(cpuset_getdomain_args), (sy_call_t *)sys_cpuset_getdomain, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 561 = cpuset_getdomain */
+ { AS(cpuset_setdomain_args), (sy_call_t *)sys_cpuset_setdomain, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 562 = cpuset_setdomain */
};
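
Marking slots 548/549 SY_THR_ABSENT retires the old numa syscall numbers
instead of recycling them, so stale binaries fail cleanly rather than hitting
an unrelated call.  A quick hypothetical userland check of that behavior on a
post-change kernel:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
            /* 548 was numa_getaffinity before this commit; the slot is
             * now absent, so the raw syscall should fail with ENOSYS. */
            if (syscall(548, 0L, 0L, 0L) == -1)
                    printf("errno %d (%s)\n", errno, strerror(errno));
            return (0);
    }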
diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c
index c913647cda2c..cf51560e8e7f 100644
--- a/sys/kern/kern_cpuset.c
+++ b/sys/kern/kern_cpuset.c
@@ -51,17 +51,21 @@ __FBSDID("$FreeBSD$");
#include <sys/syscallsubr.h>
#include <sys/capsicum.h>
#include <sys/cpuset.h>
+#include <sys/domainset.h>
#include <sys/sx.h>
#include <sys/queue.h>
#include <sys/libkern.h>
#include <sys/limits.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
+#include <sys/vmmeter.h>
#include <vm/uma.h>
#include <vm/vm.h>
+#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_param.h>
+#include <vm/vm_phys.h>
#ifdef DDB
#include <ddb/ddb.h>
@@ -109,8 +113,10 @@ __FBSDID("$FreeBSD$");
* getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
*/
static uma_zone_t cpuset_zone;
+static uma_zone_t domainset_zone;
static struct mtx cpuset_lock;
static struct setlist cpuset_ids;
+static struct domainlist cpuset_domains;
static struct unrhdr *cpuset_unr;
static struct cpuset *cpuset_zero, *cpuset_default;
@@ -121,6 +127,32 @@ SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
cpuset_t *cpuset_root;
cpuset_t cpuset_domain[MAXMEMDOM];
+static int domainset_valid(const struct domainset *, const struct domainset *);
+
+/*
+ * Find the first non-anonymous set starting from 'set'.
+ */
+static struct cpuset *
+cpuset_getbase(struct cpuset *set)
+{
+
+ if (set->cs_id == CPUSET_INVALID)
+ set = set->cs_parent;
+ return (set);
+}
+
+/*
+ * Walks up the tree from 'set' to find the root.
+ */
+static struct cpuset *
+cpuset_getroot(struct cpuset *set)
+{
+
+ while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL)
+ set = set->cs_parent;
+ return (set);
+}
+
/*
* Acquire a reference to a cpuset, all pointers must be tracked with refs.
*/
@@ -140,12 +172,7 @@ static struct cpuset *
cpuset_refroot(struct cpuset *set)
{
- for (; set->cs_parent != NULL; set = set->cs_parent)
- if (set->cs_flags & CPU_SET_ROOT)
- break;
- cpuset_ref(set);
-
- return (set);
+ return (cpuset_ref(cpuset_getroot(set)));
}
/*
@@ -157,11 +184,7 @@ static struct cpuset *
cpuset_refbase(struct cpuset *set)
{
- if (set->cs_id == CPUSET_INVALID)
- set = set->cs_parent;
- cpuset_ref(set);
-
- return (set);
+ return (cpuset_ref(cpuset_getbase(set)));
}
/*
@@ -257,17 +280,25 @@ cpuset_lookup(cpusetid_t setid, struct thread *td)
* will have no valid cpu based on restrictions from the parent.
*/
static int
-_cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
- cpusetid_t id)
+_cpuset_create(struct cpuset *set, struct cpuset *parent,
+ const cpuset_t *mask, struct domainset *domain, cpusetid_t id)
{
+ if (domain == NULL)
+ domain = parent->cs_domain;
+ if (mask == NULL)
+ mask = &parent->cs_mask;
if (!CPU_OVERLAP(&parent->cs_mask, mask))
return (EDEADLK);
+ /* The domain must be prepared ahead of time. */
+ if (!domainset_valid(parent->cs_domain, domain))
+ return (EDEADLK);
CPU_COPY(mask, &set->cs_mask);
LIST_INIT(&set->cs_children);
refcount_init(&set->cs_ref, 1);
set->cs_flags = 0;
mtx_lock_spin(&cpuset_lock);
+ set->cs_domain = domain;
CPU_AND(&set->cs_mask, &parent->cs_mask);
set->cs_id = id;
set->cs_parent = cpuset_ref(parent);
@@ -294,8 +325,8 @@ cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
id = alloc_unr(cpuset_unr);
if (id == -1)
return (ENFILE);
- *setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
- error = _cpuset_create(set, parent, mask, id);
+ *setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
+ error = _cpuset_create(set, parent, mask, NULL, id);
if (error == 0)
return (0);
free_unr(cpuset_unr, id);
@@ -304,6 +335,206 @@ cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
return (error);
}
+static void
+cpuset_freelist_add(struct setlist *list, int count)
+{
+ struct cpuset *set;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK);
+ LIST_INSERT_HEAD(list, set, cs_link);
+ }
+}
+
+static void
+cpuset_freelist_init(struct setlist *list, int count)
+{
+
+ LIST_INIT(list);
+ cpuset_freelist_add(list, count);
+}
+
+static void
+cpuset_freelist_free(struct setlist *list)
+{
+ struct cpuset *set;
+
+ while ((set = LIST_FIRST(list)) != NULL) {
+ LIST_REMOVE(set, cs_link);
+ uma_zfree(cpuset_zone, set);
+ }
+}
+
+static void
+domainset_freelist_add(struct domainlist *list, int count)
+{
+ struct domainset *set;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK);
+ LIST_INSERT_HEAD(list, set, ds_link);
+ }
+}
+
+static void
+domainset_freelist_init(struct domainlist *list, int count)
+{
+
+ LIST_INIT(list);
+ domainset_freelist_add(list, count);
+}
+
+static void
+domainset_freelist_free(struct domainlist *list)
+{
+ struct domainset *set;
+
+ while ((set = LIST_FIRST(list)) != NULL) {
+ LIST_REMOVE(set, ds_link);
+ uma_zfree(domainset_zone, set);
+ }
+}
+
+/* Copy a domainset preserving mask and policy. */
+static void
+domainset_copy(const struct domainset *from, struct domainset *to)
+{
+
+ DOMAINSET_COPY(&from->ds_mask, &to->ds_mask);
+ to->ds_policy = from->ds_policy;
+ to->ds_prefer = from->ds_prefer;
+}
+
+/* Return 1 if mask and policy are equal, otherwise 0. */
+static int
+domainset_equal(const struct domainset *one, const struct domainset *two)
+{
+
+ return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 &&
+ one->ds_policy == two->ds_policy &&
+ one->ds_prefer == two->ds_prefer);
+}
+
+/* Return 1 if child is a valid subset of parent. */
+static int
+domainset_valid(const struct domainset *parent, const struct domainset *child)
+{
+ if (child->ds_policy != DOMAINSET_POLICY_PREFER)
+ return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask));
+ return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
+}
+
+static int
+domainset_restrict(const struct domainset *parent,
+ const struct domainset *child)
+{
+ if (child->ds_policy != DOMAINSET_POLICY_PREFER)
+ return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask));
+ return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
+}
+
+/*
+ * Lookup or create a domainset. The key is provided in ds_mask and
+ * ds_policy. If the domainset does not yet exist the storage in
+ * 'domain' is used to insert. Otherwise this storage is freed to the
+ * domainset_zone and the existing domainset is returned.
+ */
+static struct domainset *
+_domainset_create(struct domainset *domain, struct domainlist *freelist)
+{
+ struct domainset *ndomain;
+
+ mtx_lock_spin(&cpuset_lock);
+ LIST_FOREACH(ndomain, &cpuset_domains, ds_link)
+ if (domainset_equal(ndomain, domain))
+ break;
+ /*
+ * If the domain does not yet exist we insert it and initialize
+ * various iteration helpers which are not part of the key.
+ */
+ if (ndomain == NULL) {
+ LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link);
+ domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
+ domain->ds_max = DOMAINSET_FLS(&domain->ds_mask) + 1;
+ }
+ mtx_unlock_spin(&cpuset_lock);
+ if (ndomain == NULL)
+ return (domain);
+ if (freelist != NULL)
+ LIST_INSERT_HEAD(freelist, domain, ds_link);
+ else
+ uma_zfree(domainset_zone, domain);
+ return (ndomain);
+
+}
+
+/*
+ * Create or lookup a domainset based on the key held in 'domain'.
+ */
+static struct domainset *
+domainset_create(const struct domainset *domain)
+{
+ struct domainset *ndomain;
+
+ ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
+ domainset_copy(domain, ndomain);
+ return _domainset_create(ndomain, NULL);
+}
+
+/*
+ * Update thread domainset pointers.
+ */
+static void
+domainset_notify(void)
+{
+ struct thread *td;
+ struct proc *p;
+
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ td->td_domain.dr_policy = td->td_cpuset->cs_domain;
+ thread_unlock(td);
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ kernel_object->domain.dr_policy = cpuset_default->cs_domain;
+}
+
+/*
+ * Create a new set that is a subset of a parent.
+ */
+static struct domainset *
+domainset_shadow(const struct domainset *pdomain,
+ const struct domainset *domain, struct domainlist *freelist)
+{
+ struct domainset *ndomain;
+
+ ndomain = LIST_FIRST(freelist);
+ LIST_REMOVE(ndomain, ds_link);
+
+ /*
+ * Initialize the key from the request.
+ */
+ domainset_copy(domain, ndomain);
+
+ /*
+ * Restrict the key by the parent.
+ */
+ DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask);
+
+ return _domainset_create(ndomain, freelist);
+}
+
/*
* Recursively check for errors that would occur from applying mask to
* the tree of sets starting at 'set'. Checks for sets that would become
@@ -376,10 +607,12 @@ cpuset_modify(struct cpuset *set, cpuset_t *mask)
* Verify that we have access to this set of
* cpus.
*/
- root = set->cs_parent;
- if (root && !CPU_SUBSET(&root->cs_mask, mask))
- return (EINVAL);
+ root = cpuset_getroot(set);
mtx_lock_spin(&cpuset_lock);
+ if (root && !CPU_SUBSET(&root->cs_mask, mask)) {
+ error = EINVAL;
+ goto out;
+ }
error = cpuset_testupdate(set, mask, 0);
if (error)
goto out;
@@ -392,6 +625,141 @@ out:
}
/*
+ * Recursively check for errors that would occur from applying mask to
+ * the tree of sets starting at 'set'. Checks for sets that would become
+ * empty as well as RDONLY flags.
+ */
+static int
+cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset,
+ struct domainset *orig, int *count, int check_mask)
+{
+ struct cpuset *nset;
+ struct domainset *domain;
+ struct domainset newset;
+ int error;
+
+ mtx_assert(&cpuset_lock, MA_OWNED);
+ if (set->cs_flags & CPU_SET_RDONLY)
+ return (EPERM);
+ domain = set->cs_domain;
+ domainset_copy(domain, &newset);
+ if (!domainset_equal(domain, orig)) {
+ if (!domainset_restrict(domain, dset))
+ return (EDEADLK);
+ DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask);
+ /* Count the number of domains that are changing. */
+ (*count)++;
+ }
+ error = 0;
+ LIST_FOREACH(nset, &set->cs_children, cs_siblings)
+ if ((error = cpuset_testupdate_domain(nset, &newset, domain,
+ count, 1)) != 0)
+ break;
+ return (error);
+}
+
+/*
+ * Applies the mask 'mask' without checking for empty sets or permissions.
+ */
+static void
+cpuset_update_domain(struct cpuset *set, struct domainset *domain,
+ struct domainset *orig, struct domainlist *domains)
+{
+ struct cpuset *nset;
+
+ mtx_assert(&cpuset_lock, MA_OWNED);
+ /*
+ * If this domainset has changed from the parent we must calculate
+ * a new set. Otherwise it simply inherits from the parent. When
+ * we inherit from the parent we get a new mask and policy. If the
+ * set is modified from the parent we keep the policy and only
+ * update the mask.
+ */
+ if (set->cs_domain != orig) {
+ orig = set->cs_domain;
+ set->cs_domain = domainset_shadow(domain, orig, domains);
+ } else
+ set->cs_domain = domain;
+ LIST_FOREACH(nset, &set->cs_children, cs_siblings)
+ cpuset_update_domain(nset, set->cs_domain, orig, domains);
+
+ return;
+}
+
+/*
+ * Modify the set 'set' to use a copy of the domainset provided. Apply this new
+ * mask to restrict all children in the tree. Checks for validity before
+ * applying the changes.
+ */
+static int
+cpuset_modify_domain(struct cpuset *set, struct domainset *domain)
+{
+ struct domainlist domains;
+ struct domainset temp;
+ struct domainset *dset;
+ struct cpuset *root;
+ int ndomains, needed;
+ int error;
+
+ error = priv_check(curthread, PRIV_SCHED_CPUSET);
+ if (error)
+ return (error);
+ /*
+ * In case we are called from within the jail
+ * we do not allow modifying the dedicated root
+ * cpuset of the jail but may still allow to
+ * change child sets.
+ */
+ if (jailed(curthread->td_ucred) &&
+ set->cs_flags & CPU_SET_ROOT)
+ return (EPERM);
+ domainset_freelist_init(&domains, 0);
+ domain = domainset_create(domain);
+ ndomains = needed = 0;
+ do {
+ if (ndomains < needed) {
+ domainset_freelist_add(&domains, needed - ndomains);
+ ndomains = needed;
+ }
+ root = cpuset_getroot(set);
+ mtx_lock_spin(&cpuset_lock);
+ dset = root->cs_domain;
+ /*
+ * Verify that we have access to this set of domains.
+ */
+ if (root && !domainset_valid(dset, domain)) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * If applying prefer we keep the current set as the fallback.
+ */
+ if (domain->ds_policy == DOMAINSET_POLICY_PREFER)
+ DOMAINSET_COPY(&set->cs_domain->ds_mask,
+ &domain->ds_mask);
+ /*
+ * Determine whether we can apply this set of domains and
+ * how many new domain structures it will require.
+ */
+ domainset_copy(domain, &temp);
+ needed = 0;
+ error = cpuset_testupdate_domain(set, &temp, set->cs_domain,
+ &needed, 0);
+ if (error)
+ goto out;
+ } while (ndomains < needed);
+ dset = set->cs_domain;
+ cpuset_update_domain(set, domain, dset, &domains);
+out:
+ mtx_unlock_spin(&cpuset_lock);
+ domainset_freelist_free(&domains);
+ if (error == 0)
+ domainset_notify();
+
+ return (error);
+}
+
+/*
* Resolve the 'which' parameter of several cpuset apis.
*
* For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also
@@ -481,44 +849,203 @@ cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
return (0);
}
+static int
+cpuset_testshadow(struct cpuset *set, const cpuset_t *mask,
+ const struct domainset *domain)
+{
+ struct cpuset *parent;
+ struct domainset *dset;
+
+ parent = cpuset_getbase(set);
+ /*
+ * If we are restricting a cpu mask it must be a subset of the
+ * parent or invalid CPUs have been specified.
+ */
+ if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask))
+ return (EINVAL);
+
+ /*
+ * If we are restricting a domain mask it must be a subset of the
+ * parent or invalid domains have been specified.
+ */
+ dset = parent->cs_domain;
+ if (domain != NULL && !domainset_valid(dset, domain))
+ return (EINVAL);
+
+ return (0);
+}
+
/*
* Create an anonymous set with the provided mask in the space provided by
- * 'fset'. If the passed in set is anonymous we use its parent otherwise
+ * 'nset'. If the passed in set is anonymous we use its parent otherwise
* the new set is a child of 'set'.
*/
static int
-cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
+cpuset_shadow(struct cpuset *set, struct cpuset **nsetp,
+ const cpuset_t *mask, const struct domainset *domain,
+ struct setlist *cpusets, struct domainlist *domains)
{
struct cpuset *parent;
+ struct cpuset *nset;
+ struct domainset *dset;
+ struct domainset *d;
+ int error;
- if (set->cs_id == CPUSET_INVALID)
- parent = set->cs_parent;
+ error = cpuset_testshadow(set, mask, domain);
+ if (error)
+ return (error);
+
+ parent = cpuset_getbase(set);
+ dset = parent->cs_domain;
+ if (mask == NULL)
+ mask = &set->cs_mask;
+ if (domain != NULL)
+ d = domainset_shadow(dset, domain, domains);
else
- parent = set;
- if (!CPU_SUBSET(&parent->cs_mask, mask))
+ d = set->cs_domain;
+ nset = LIST_FIRST(cpusets);
+ error = _cpuset_create(nset, parent, mask, d, CPUSET_INVALID);
+ if (error == 0) {
+ LIST_REMOVE(nset, cs_link);
+ *nsetp = nset;
+ }
+ return (error);
+}
+
+static struct cpuset *
+cpuset_update_thread(struct thread *td, struct cpuset *nset)
+{
+ struct cpuset *tdset;
+
+ tdset = td->td_cpuset;
+ td->td_cpuset = nset;
+ td->td_domain.dr_policy = nset->cs_domain;
+ sched_affinity(td);
+
+ return (tdset);
+}
+
+static int
+cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask,
+ struct domainset *domain)
+{
+ struct cpuset *parent;
+
+ parent = cpuset_getbase(tdset);
+ if (mask == NULL)
+ mask = &tdset->cs_mask;
+ if (domain == NULL)
+ domain = tdset->cs_domain;
+ return cpuset_testshadow(parent, mask, domain);
+}
+
+static int
+cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask,
+ struct domainset *domain, struct cpuset **nsetp,
+ struct setlist *freelist, struct domainlist *domainlist)
+{
+ struct cpuset *parent;
+
+ parent = cpuset_getbase(tdset);
+ if (mask == NULL)
+ mask = &tdset->cs_mask;
+ if (domain == NULL)
+ domain = tdset->cs_domain;
+ return cpuset_shadow(parent, nsetp, mask, domain, freelist,
+ domainlist);
+}
+
+static int
+cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set,
+ cpuset_t *mask, struct domainset *domain)
+{
+ struct cpuset *parent;
+
+ parent = cpuset_getbase(tdset);
+
+ /*
+ * If the thread restricted its mask then apply that same
+ * restriction to the new set, otherwise take it wholesale.
+ */
+ if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) {
+ CPU_COPY(&tdset->cs_mask, mask);
+ CPU_AND(mask, &set->cs_mask);
+ } else
+ CPU_COPY(&set->cs_mask, mask);
+
+ /*
+ * If the thread restricted the domain then we apply the
+ * restriction to the new set but retain the policy.
+ */
+ if (tdset->cs_domain != parent->cs_domain) {
+ domainset_copy(tdset->cs_domain, domain);
+ DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask);
+ } else
+ domainset_copy(set->cs_domain, domain);
+
+ if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask))
return (EDEADLK);
- return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
+
+ return (0);
+}
+
+static int
+cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set)
+{
+ struct domainset domain;
+ cpuset_t mask;
+
+ if (tdset->cs_id != CPUSET_INVALID)
+ return (0);
+ return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
+}
+
+static int
+cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
+ struct cpuset **nsetp, struct setlist *freelist,
+ struct domainlist *domainlist)
+{
+ struct domainset domain;
+ cpuset_t mask;
+ int error;
+
+ /*
+ * If we're replacing on a thread that has not constrained the
+ * original set we can simply accept the new set.
+ */
+ if (tdset->cs_id != CPUSET_INVALID) {
+ *nsetp = cpuset_ref(set);
+ return (0);
+ }
+ error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
+ if (error)
+ return (error);
+
+ return cpuset_shadow(tdset, nsetp, &mask, &domain, freelist,
+ domainlist);
}
/*
- * Handle two cases for replacing the base set or mask of an entire process.
+ * Handle three cases for updating an entire process.
*
- * 1) Set is non-null and mask is null. This reparents all anonymous sets
- * to the provided set and replaces all non-anonymous td_cpusets with the
- * provided set.
- * 2) Mask is non-null and set is null. This replaces or creates anonymous
- * sets for every thread with the existing base as a parent.
+ * 1) Set is non-null. This reparents all anonymous sets to the provided
+ * set and replaces all non-anonymous td_cpusets with the provided set.
+ * 2) Mask is non-null. This replaces or creates anonymous sets for every
+ * thread with the existing base as a parent.
+ * 3) domain is non-null. This creates anonymous sets for every thread
+ * and replaces the domain set.
*
* This is overly complicated because we can't allocate while holding a
* spinlock and spinlocks must be held while changing and examining thread
* state.
*/
static int
-cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
+cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
+ struct domainset *domain)
{
struct setlist freelist;
struct setlist droplist;
- struct cpuset *tdset;
+ struct domainlist domainlist;
struct cpuset *nset;
struct thread *td;
struct proc *p;
@@ -533,7 +1060,9 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
* 2) If enough cpusets have not been allocated release the locks and
* allocate them. Loop.
*/
- LIST_INIT(&freelist);
+ cpuset_freelist_init(&freelist, 1);
+ domainset_freelist_init(&domainlist, 1);
+ nfree = 1;
LIST_INIT(&droplist);
nfree = 0;
for (;;) {
@@ -544,39 +1073,27 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
break;
threads = p->p_numthreads;
PROC_UNLOCK(p);
- for (; nfree < threads; nfree++) {
- nset = uma_zalloc(cpuset_zone, M_WAITOK);
- LIST_INSERT_HEAD(&freelist, nset, cs_link);
+ if (nfree < threads) {
+ cpuset_freelist_add(&freelist, threads - nfree);
+ domainset_freelist_add(&domainlist, threads - nfree);
+ nfree = threads;
}
}
PROC_LOCK_ASSERT(p, MA_OWNED);
/*
* Now that the appropriate locks are held and we have enough cpusets,
- * make sure the operation will succeed before applying changes. The
+ * make sure the operation will succeed before applying changes. The
* proc lock prevents td_cpuset from changing between calls.
*/
error = 0;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
- tdset = td->td_cpuset;
- /*
- * Verify that a new mask doesn't specify cpus outside of
- * the set the thread is a member of.
- */
- if (mask) {
- if (tdset->cs_id == CPUSET_INVALID)
- tdset = tdset->cs_parent;
- if (!CPU_SUBSET(&tdset->cs_mask, mask))
- error = EDEADLK;
- /*
- * Verify that a new set won't leave an existing thread
- * mask without a cpu to run on. It can, however, restrict
- * the set.
- */
- } else if (tdset->cs_id == CPUSET_INVALID) {
- if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
- error = EDEADLK;
- }
+ if (set != NULL)
+ error = cpuset_setproc_test_setthread(td->td_cpuset,
+ set);
+ else
+ error = cpuset_setproc_test_maskthread(td->td_cpuset,
+ mask, domain);
thread_unlock(td);
if (error)
goto unlock_out;
@@ -588,33 +1105,17 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
*/
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
- /*
- * If we presently have an anonymous set or are applying a
- * mask we must create an anonymous shadow set. That is
- * either parented to our existing base or the supplied set.
- *
- * If we have a base set with no anonymous shadow we simply
- * replace it outright.
- */
- tdset = td->td_cpuset;
- if (tdset->cs_id == CPUSET_INVALID || mask) {
- nset = LIST_FIRST(&freelist);
- LIST_REMOVE(nset, cs_link);
- if (mask)
- error = cpuset_shadow(tdset, nset, mask);
- else
- error = _cpuset_create(nset, set,
- &tdset->cs_mask, CPUSET_INVALID);
- if (error) {
- LIST_INSERT_HEAD(&freelist, nset, cs_link);
- thread_unlock(td);
- break;
- }
- } else
- nset = cpuset_ref(set);
- cpuset_rel_defer(&droplist, tdset);
- td->td_cpuset = nset;
- sched_affinity(td);
+ if (set != NULL)
+ error = cpuset_setproc_setthread(td->td_cpuset, set,
+ &nset, &freelist, &domainlist);
+ else
+ error = cpuset_setproc_maskthread(td->td_cpuset, mask,
+ domain, &nset, &freelist, &domainlist);
+ if (error) {
+ thread_unlock(td);
+ break;
+ }
+ cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset));
thread_unlock(td);
}
unlock_out:
@@ -622,10 +1123,8 @@ unlock_out:
out:
while ((nset = LIST_FIRST(&droplist)) != NULL)
cpuset_rel_complete(nset);
- while ((nset = LIST_FIRST(&freelist)) != NULL) {
- LIST_REMOVE(nset, cs_link);
- uma_zfree(cpuset_zone, nset);
- }
+ cpuset_freelist_free(&freelist);
+ domainset_freelist_free(&domainlist);
return (error);
}
@@ -690,46 +1189,57 @@ cpusetobj_strscan(cpuset_t *set, const char *buf)
}
/*
- * Apply an anonymous mask to a single thread.
+ * Apply an anonymous mask or a domain to a single thread.
*/
-int
-cpuset_setthread(lwpid_t id, cpuset_t *mask)
+static int
+_cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain)
{
+ struct setlist cpusets;
+ struct domainlist domainlist;
struct cpuset *nset;
struct cpuset *set;
struct thread *td;
struct proc *p;
int error;
- nset = uma_zalloc(cpuset_zone, M_WAITOK);
+ cpuset_freelist_init(&cpusets, 1);
+ domainset_freelist_init(&domainlist, domain != NULL);
error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
if (error)
goto out;
set = NULL;
thread_lock(td);
- error = cpuset_shadow(td->td_cpuset, nset, mask);
- if (error == 0) {
- set = td->td_cpuset;
- td->td_cpuset = nset;
- sched_affinity(td);
- nset = NULL;
- }
+ error = cpuset_shadow(td->td_cpuset, &nset, mask, domain,
+ &cpusets, &domainlist);
+ if (error == 0)
+ set = cpuset_update_thread(td, nset);
thread_unlock(td);
PROC_UNLOCK(p);
if (set)
cpuset_rel(set);
out:
- if (nset)
- uma_zfree(cpuset_zone, nset);
+ cpuset_freelist_free(&cpusets);
+ domainset_freelist_free(&domainlist);
return (error);
}
/*
+ * Apply an anonymous mask to a single thread.
+ */
+int
+cpuset_setthread(lwpid_t id, cpuset_t *mask)
+{
+
+ return _cpuset_setthread(id, mask, NULL);
+}
+
+/*
* Apply new cpumask to the ithread.
*/
int
cpuset_setithread(lwpid_t id, int cpu)
{
+ struct setlist cpusets;
struct cpuset *nset, *rset;
struct cpuset *parent, *old_set;
struct thread *td;
@@ -738,8 +1248,8 @@ cpuset_setithread(lwpid_t id, int cpu)
cpuset_t mask;
int error;
- nset = uma_zalloc(cpuset_zone, M_WAITOK);
- rset = uma_zalloc(cpuset_zone, M_WAITOK);
+ cpuset_freelist_init(&cpusets, 1);
+ rset = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
cs_id = CPUSET_INVALID;
CPU_ZERO(&mask);
@@ -756,13 +1266,15 @@ cpuset_setithread(lwpid_t id, int cpu)
old_set = td->td_cpuset;
if (cpu == NOCPU) {
+ nset = LIST_FIRST(&cpusets);
+ LIST_REMOVE(nset, cs_link);
/*
* roll back to default set. We're not using cpuset_shadow()
* here because we can fail CPU_SUBSET() check. This can happen
* if default set does not contain all CPUs.
*/
- error = _cpuset_create(nset, cpuset_default, &mask,
+ error = _cpuset_create(nset, cpuset_default, &mask, NULL,
CPUSET_INVALID);
goto applyset;
@@ -779,7 +1291,7 @@ cpuset_setithread(lwpid_t id, int cpu)
* with any mask.
*/
error = _cpuset_create(rset, cpuset_zero,
- &cpuset_zero->cs_mask, cs_id);
+ &cpuset_zero->cs_mask, NULL, cs_id);
if (error != 0) {
PROC_UNLOCK(p);
goto out;
@@ -794,22 +1306,19 @@ cpuset_setithread(lwpid_t id, int cpu)
old_set = NULL;
}
- error = cpuset_shadow(parent, nset, &mask);
+ error = cpuset_shadow(parent, &nset, &mask, NULL, &cpusets, NULL);
applyset:
if (error == 0) {
thread_lock(td);
- td->td_cpuset = nset;
- sched_affinity(td);
+ old_set = cpuset_update_thread(td, nset);
thread_unlock(td);
- nset = NULL;
} else
old_set = NULL;
PROC_UNLOCK(p);
if (old_set != NULL)
cpuset_rel(old_set);
out:
- if (nset != NULL)
- uma_zfree(cpuset_zone, nset);
+ cpuset_freelist_free(&cpusets);
if (rset != NULL)
uma_zfree(cpuset_zone, rset);
if (cs_id != CPUSET_INVALID)
@@ -817,6 +1326,25 @@ out:
return (error);
}
+static struct domainset domainset0;
+
+void
+domainset_zero(void)
+{
+ struct domainset *dset;
+ int i;
+
+ mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
+
+ dset = &domainset0;
+ DOMAINSET_ZERO(&dset->ds_mask);
+ for (i = 0; i < vm_ndomains; i++)
+ DOMAINSET_SET(i, &dset->ds_mask);
+ dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
+ dset->ds_prefer = -1;
+ curthread->td_domain.dr_policy = _domainset_create(dset, NULL);
+ kernel_object->domain.dr_policy = curthread->td_domain.dr_policy;
+}
/*
* Creates system-wide cpusets and the cpuset for thread0 including two
@@ -834,11 +1362,12 @@ struct cpuset *
cpuset_thread0(void)
{
struct cpuset *set;
- int error, i;
+ int error;
cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
- mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
+ domainset_zone = uma_zcreate("domainset", sizeof(struct domainset),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
/*
* Create the root system set for the whole machine. Doesn't use
@@ -850,14 +1379,15 @@ cpuset_thread0(void)
LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
set->cs_ref = 1;
set->cs_flags = CPU_SET_ROOT;
+ set->cs_domain = &domainset0;
cpuset_zero = set;
cpuset_root = &set->cs_mask;
/*
* Now derive a default, modifiable set from that to give out.
*/
- set = uma_zalloc(cpuset_zone, M_WAITOK);
- error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
+ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
+ error = _cpuset_create(set, cpuset_zero, NULL, NULL, 1);
KASSERT(error == 0, ("Error creating default set: %d\n", error));
cpuset_default = set;
@@ -866,16 +1396,6 @@ cpuset_thread0(void)
*/
cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
- /*
- * If MD code has not initialized per-domain cpusets, place all
- * CPUs in domain 0.
- */
- for (i = 0; i < MAXMEMDOM; i++)
- if (!CPU_EMPTY(&cpuset_domain[i]))
- goto domains_set;
- CPU_COPY(&all_cpus, &cpuset_domain[0]);
-domains_set:
-
return (set);
}
@@ -920,7 +1440,7 @@ cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
cpuset_ref(set);
- error = cpuset_setproc(p->p_pid, set, NULL);
+ error = cpuset_setproc(p->p_pid, set, NULL, NULL);
if (error)
return (error);
cpuset_rel(set);
@@ -935,11 +1455,23 @@ static void
cpuset_init(void *arg)
{
cpuset_t mask;
+ int i;
mask = all_cpus;
if (cpuset_modify(cpuset_zero, &mask))
panic("Can't set initial cpuset mask.\n");
cpuset_zero->cs_flags |= CPU_SET_RDONLY;
+
+ /*
+ * If MD code has not initialized per-domain cpusets, place all
+ * CPUs in domain 0.
+ */
+ for (i = 0; i < MAXMEMDOM; i++)
+ if (!CPU_EMPTY(&cpuset_domain[i]))
+ goto domains_set;
+ CPU_COPY(&all_cpus, &cpuset_domain[0]);
+domains_set:
+ return;
}
SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
@@ -964,7 +1496,7 @@ sys_cpuset(struct thread *td, struct cpuset_args *uap)
return (error);
error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
if (error == 0)
- error = cpuset_setproc(-1, set, NULL);
+ error = cpuset_setproc(-1, set, NULL, NULL);
cpuset_rel(set);
return (error);
}
@@ -998,7 +1530,7 @@ kern_cpuset_setid(struct thread *td, cpuwhich_t which,
set = cpuset_lookup(setid, td);
if (set == NULL)
return (ESRCH);
- error = cpuset_setproc(id, set, NULL);
+ error = cpuset_setproc(id, set, NULL, NULL);
cpuset_rel(set);
return (error);
}
@@ -1102,12 +1634,12 @@ kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
return (ERANGE);
/* In Capability mode, you can only get your own CPU set. */
if (IN_CAPABILITY_MODE(td)) {
- if (level != CPU_LEVEL_WHICH)
- return (ECAPMODE);
- if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
- return (ECAPMODE);
- if (id != -1)
- return (ECAPMODE);
+ if (level != CPU_LEVEL_WHICH)
+ return (ECAPMODE);
+ if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
+ return (ECAPMODE);
+ if (id != -1)
+ return (ECAPMODE);
}
size = cpusetsize;
mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
@@ -1219,12 +1751,12 @@ kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
return (ERANGE);
/* In Capability mode, you can only set your own CPU set. */
if (IN_CAPABILITY_MODE(td)) {
- if (level != CPU_LEVEL_WHICH)
- return (ECAPMODE);
- if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
- return (ECAPMODE);
- if (id != -1)
- return (ECAPMODE);
+ if (level != CPU_LEVEL_WHICH)
+ return (ECAPMODE);
+ if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
+ return (ECAPMODE);
+ if (id != -1)
+ return (ECAPMODE);
}
mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
error = copyin(maskp, mask, cpusetsize);
@@ -1285,7 +1817,7 @@ kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
error = cpuset_setthread(id, mask);
break;
case CPU_WHICH_PID:
- error = cpuset_setproc(id, NULL, mask);
+ error = cpuset_setproc(id, NULL, mask, NULL);
break;
case CPU_WHICH_CPUSET:
case CPU_WHICH_JAIL:
@@ -1314,25 +1846,316 @@ out:
return (error);
}
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_getdomain_args {
+ cpulevel_t level;
+ cpuwhich_t which;
+ id_t id;
+ size_t domainsetsize;
+ domainset_t *mask;
+ int *policy;
+};
+#endif
+int
+sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap)
+{
+
+ return (kern_cpuset_getdomain(td, uap->level, uap->which,
+ uap->id, uap->domainsetsize, uap->mask, uap->policy));
+}
+
+int
+kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
+ id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp)
+{
+ struct domainset outset;
+ struct thread *ttd;
+ struct cpuset *nset;
+ struct cpuset *set;
+ struct domainset *dset;
+ struct proc *p;
+ domainset_t *mask;
+ int error;
+
+ if (domainsetsize < sizeof(domainset_t) ||
+ domainsetsize > DOMAINSET_MAXSIZE / NBBY)
+ return (ERANGE);
+ /* In Capability mode, you can only get your own domain set. */
+ if (IN_CAPABILITY_MODE(td)) {
+ if (level != CPU_LEVEL_WHICH)
+ return (ECAPMODE);
+ if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
+ return (ECAPMODE);
+ if (id != -1)
+ return (ECAPMODE);
+ }
+ mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
+ bzero(&outset, sizeof(outset));
+ error = cpuset_which(which, id, &p, &ttd, &set);
+ if (error)
+ goto out;
+ switch (level) {
+ case CPU_LEVEL_ROOT:
+ case CPU_LEVEL_CPUSET:
+ switch (which) {
+ case CPU_WHICH_TID:
+ case CPU_WHICH_PID:
+ thread_lock(ttd);
+ set = cpuset_ref(ttd->td_cpuset);
+ thread_unlock(ttd);
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ break;
+ case CPU_WHICH_IRQ:
+ case CPU_WHICH_INTRHANDLER:
+ case CPU_WHICH_ITHREAD:
+ case CPU_WHICH_DOMAIN:
+ error = EINVAL;
+ goto out;
+ }
+ if (level == CPU_LEVEL_ROOT)
+ nset = cpuset_refroot(set);
+ else
+ nset = cpuset_refbase(set);
+ domainset_copy(nset->cs_domain, &outset);
+ cpuset_rel(nset);
+ break;
+ case CPU_LEVEL_WHICH:
+ switch (which) {
+ case CPU_WHICH_TID:
+ thread_lock(ttd);
+ domainset_copy(ttd->td_cpuset->cs_domain, &outset);
+ thread_unlock(ttd);
+ break;
+ case CPU_WHICH_PID:
+ FOREACH_THREAD_IN_PROC(p, ttd) {
+ thread_lock(ttd);
+ dset = ttd->td_cpuset->cs_domain;
+ /* Show all domains in the proc. */
+ DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask);
+ /* Last policy wins. */
+ outset.ds_policy = dset->ds_policy;
+ outset.ds_prefer = dset->ds_prefer;
+ thread_unlock(ttd);
+ }
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ domainset_copy(set->cs_domain, &outset);
+ break;
+ case CPU_WHICH_IRQ:
+ case CPU_WHICH_INTRHANDLER:
+ case CPU_WHICH_ITHREAD:
+ case CPU_WHICH_DOMAIN:
+ error = EINVAL;
+ break;
+ }
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (set)
+ cpuset_rel(set);
+ if (p)
+ PROC_UNLOCK(p);
+ /*
+ * Translate prefer into a set containing only the preferred domain,
+ * not the entire fallback set.
+ */
+ if (outset.ds_policy == DOMAINSET_POLICY_PREFER) {
+ DOMAINSET_ZERO(&outset.ds_mask);
+ DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask);
+ }
+ DOMAINSET_COPY(&outset.ds_mask, mask);
+ if (error == 0)
+ error = copyout(mask, maskp, domainsetsize);
+ if (error == 0)
+ error = copyout(&outset.ds_policy, policyp,
+ sizeof(outset.ds_policy));
+out:
+ free(mask, M_TEMP);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_setdomain_args {
+ cpulevel_t level;
+ cpuwhich_t which;
+ id_t id;
+ size_t domainsetsize;
+ domainset_t *mask;
+ int policy;
+};
+#endif
+int
+sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap)
+{
+
+ return (kern_cpuset_setdomain(td, uap->level, uap->which,
+ uap->id, uap->domainsetsize, uap->mask, uap->policy));
+}
+
+int
+kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
+ id_t id, size_t domainsetsize, const domainset_t *maskp, int policy)
+{
+ struct cpuset *nset;
+ struct cpuset *set;
+ struct thread *ttd;
+ struct proc *p;
+ struct domainset domain;
+ domainset_t *mask;
+ int error;
+
+ if (domainsetsize < sizeof(domainset_t) ||
+ domainsetsize > DOMAINSET_MAXSIZE / NBBY)
+ return (ERANGE);
+ /* In Capability mode, you can only set your own CPU set. */
+ if (IN_CAPABILITY_MODE(td)) {
+ if (level != CPU_LEVEL_WHICH)
+ return (ECAPMODE);
+ if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
+ return (ECAPMODE);
+ if (id != -1)
+ return (ECAPMODE);
+ }
+ memset(&domain, 0, sizeof(domain));
+ mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
+ error = copyin(maskp, mask, domainsetsize);
+ if (error)
+ goto out;
+ /*
+ * Verify that no high bits are set.
+ */
+ if (domainsetsize > sizeof(domainset_t)) {
+ char *end;
+ char *cp;
+
+ end = cp = (char *)&mask->__bits;
+ end += domainsetsize;
+ cp += sizeof(domainset_t);
+ while (cp != end)
+ if (*cp++ != 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ }
+ DOMAINSET_COPY(mask, &domain.ds_mask);
+ domain.ds_policy = policy;
+ if (policy <= DOMAINSET_POLICY_INVALID ||
+ policy > DOMAINSET_POLICY_MAX)
+ return (EINVAL);
+
+ /* Translate preferred policy into a mask and fallback. */
+ if (policy == DOMAINSET_POLICY_PREFER) {
+ /* Only support a single preferred domain. */
+ if (DOMAINSET_COUNT(&domain.ds_mask) != 1)
+ return (EINVAL);
+ domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1;
+ /* This will be constrained by domainset_shadow(). */
+ DOMAINSET_FILL(&domain.ds_mask);
+ }
+
+ switch (level) {
+ case CPU_LEVEL_ROOT:
+ case CPU_LEVEL_CPUSET:
+ error = cpuset_which(which, id, &p, &ttd, &set);
+ if (error)
+ break;
+ switch (which) {
+ case CPU_WHICH_TID:
+ case CPU_WHICH_PID:
+ thread_lock(ttd);
+ set = cpuset_ref(ttd->td_cpuset);
+ thread_unlock(ttd);
+ PROC_UNLOCK(p);
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ break;
+ case CPU_WHICH_IRQ:
+ case CPU_WHICH_INTRHANDLER:
+ case CPU_WHICH_ITHREAD:
+ case CPU_WHICH_DOMAIN:
+ error = EINVAL;
+ goto out;
+ }
+ if (level == CPU_LEVEL_ROOT)
+ nset = cpuset_refroot(set);
+ else
+ nset = cpuset_refbase(set);
+ error = cpuset_modify_domain(nset, &domain);
+ cpuset_rel(nset);
+ cpuset_rel(set);
+ break;
+ case CPU_LEVEL_WHICH:
+ switch (which) {
+ case CPU_WHICH_TID:
+ error = _cpuset_setthread(id, NULL, &domain);
+ break;
+ case CPU_WHICH_PID:
+ error = cpuset_setproc(id, NULL, NULL, &domain);
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ error = cpuset_which(which, id, &p, &ttd, &set);
+ if (error == 0) {
+ error = cpuset_modify_domain(set, &domain);
+ cpuset_rel(set);
+ }
+ break;
+ case CPU_WHICH_IRQ:
+ case CPU_WHICH_INTRHANDLER:
+ case CPU_WHICH_ITHREAD:
+ default:
+ error = EINVAL;
+ break;
+ }
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+out:
+ free(mask, M_TEMP);
+ return (error);
+}
+
#ifdef DDB
-void
-ddb_display_cpuset(const cpuset_t *set)
+BITSET_DEFINE(bitset, 1);
+static void
+ddb_display_bitset(const struct bitset *set, int size)
{
- int cpu, once;
+ int bit, once;
- for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
- if (CPU_ISSET(cpu, set)) {
+ for (once = 0, bit = 0; bit < size; bit++) {
+ if (CPU_ISSET(bit, set)) {
if (once == 0) {
- db_printf("%d", cpu);
+ db_printf("%d", bit);
once = 1;
} else
- db_printf(",%d", cpu);
+ db_printf(",%d", bit);
}
}
if (once == 0)
db_printf("<none>");
}
+void
+ddb_display_cpuset(const cpuset_t *set)
+{
+ ddb_display_bitset((const struct bitset *)set, CPU_SETSIZE);
+}
+
+static void
+ddb_display_domainset(const domainset_t *set)
+{
+ ddb_display_bitset((const struct bitset *)set, DOMAINSET_SETSIZE);
+}
+
DB_SHOW_COMMAND(cpusets, db_show_cpusets)
{
struct cpuset *set;
@@ -1341,11 +2164,29 @@ DB_SHOW_COMMAND(cpusets, db_show_cpusets)
db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
set, set->cs_id, set->cs_ref, set->cs_flags,
(set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
- db_printf(" mask=");
+ db_printf(" cpu mask=");
ddb_display_cpuset(&set->cs_mask);
db_printf("\n");
+ db_printf(" domain policy %d prefer %d mask=",
+ set->cs_domain->ds_policy, set->cs_domain->ds_prefer);
+ ddb_display_domainset(&set->cs_domain->ds_mask);
+ db_printf("\n");
if (db_pager_quit)
break;
}
}
+
+DB_SHOW_COMMAND(domainsets, db_show_domainsets)
+{
+ struct domainset *set;
+
+ LIST_FOREACH(set, &cpuset_domains, ds_link) {
+ db_printf("set=%p policy %d prefer %d cnt %d max %d\n",
+ set, set->ds_policy, set->ds_prefer, set->ds_cnt,
+ set->ds_max);
+ db_printf(" mask =");
+ ddb_display_domainset(&set->ds_mask);
+ db_printf("\n");
+ }
+}
#endif /* DDB */
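
The linchpin of the file above is _domainset_create(), which interns
domainsets: (ds_mask, ds_policy, ds_prefer) is the lookup key, duplicates are
freed, and every cpuset or thread with the same policy ends up pointing at one
shared, immutable struct domainset (ds_cnt and ds_max are derived iteration
helpers, not part of the key).  Here is a self-contained userland sketch of
that lookup-or-publish pattern; the types and names are illustrative, not the
kernel's:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct dset {
            struct dset     *next;
            unsigned long    mask;
            int              policy;
            int              prefer;
    };

    static struct dset *all_dsets;
    static pthread_mutex_t dsets_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Return the shared instance equal to *cand: publish cand if it is
     * the first of its kind, otherwise free it and reuse the match. */
    static struct dset *
    dset_intern(struct dset *cand)
    {
            struct dset *d;

            pthread_mutex_lock(&dsets_lock);
            for (d = all_dsets; d != NULL; d = d->next)
                    if (d->mask == cand->mask &&
                        d->policy == cand->policy &&
                        d->prefer == cand->prefer)
                            break;
            if (d == NULL) {
                    cand->next = all_dsets;
                    all_dsets = cand;
            }
            pthread_mutex_unlock(&dsets_lock);
            if (d == NULL)
                    return (cand);
            free(cand);
            return (d);
    }

    int
    main(void)
    {
            struct dset *a = calloc(1, sizeof(*a));
            struct dset *b = calloc(1, sizeof(*b));

            a->mask = b->mask = 0x3;        /* domains 0 and 1 */
            printf("%s\n", dset_intern(a) == dset_intern(b) ?
                "shared" : "distinct");     /* prints "shared" */
            return (0);
    }

The kernel version differs mainly in using a spin mutex and an optional
freelist instead of free(), since it cannot allocate while cpuset_lock is
held.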
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index 12a7b5d5a6d5..9db57f58192a 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -88,7 +88,6 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
-#include <vm/vm_domain.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
@@ -931,10 +930,6 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options)
#ifdef MAC
mac_proc_destroy(p);
#endif
- /*
- * Free any domain policy that's still hiding around.
- */
- vm_domain_policy_cleanup(&p->p_vm_dom_policy);
KASSERT(FIRST_THREAD_IN_PROC(p),
("proc_reap: no residual thread!"));
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 0367cdd7f187..804409e2f3e5 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -83,7 +83,6 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
-#include <vm/vm_domain.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
@@ -512,14 +511,6 @@ do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *
if (p1->p_flag & P_PROFIL)
startprofclock(p2);
- /*
- * Whilst the proc lock is held, copy the VM domain data out
- * using the VM domain method.
- */
- vm_domain_policy_init(&p2->p_vm_dom_policy);
- vm_domain_policy_localcopy(&p2->p_vm_dom_policy,
- &p1->p_vm_dom_policy);
-
if (fr->fr_flags & RFSIGSHARE) {
p2->p_sigacts = sigacts_hold(p1->p_sigacts);
} else {
diff --git a/sys/kern/kern_numa.c b/sys/kern/kern_numa.c
deleted file mode 100644
index 2d3ec49c15eb..000000000000
--- a/sys/kern/kern_numa.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/*-
- * Copyright (c) 2015, Adrian Chadd <adrian@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice unmodified, this list of conditions, and the following
- * disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysproto.h>
-#include <sys/jail.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
-#include <sys/mutex.h>
-#include <sys/priv.h>
-#include <sys/proc.h>
-#include <sys/refcount.h>
-#include <sys/sched.h>
-#include <sys/smp.h>
-#include <sys/syscallsubr.h>
-#include <sys/cpuset.h>
-#include <sys/sx.h>
-#include <sys/queue.h>
-#include <sys/libkern.h>
-#include <sys/limits.h>
-#include <sys/bus.h>
-#include <sys/interrupt.h>
-
-#include <vm/uma.h>
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_param.h>
-#include <vm/vm_domain.h>
-
-int
-sys_numa_setaffinity(struct thread *td, struct numa_setaffinity_args *uap)
-{
- int error;
- struct vm_domain_policy vp;
- struct thread *ttd;
- struct proc *p;
- struct cpuset *set;
-
- set = NULL;
- p = NULL;
-
- /*
- * Copy in just the policy information into the policy
- * struct. Userland only supplies vm_domain_policy_entry.
- */
- error = copyin(uap->policy, &vp.p, sizeof(vp.p));
- if (error)
- goto out;
-
- /*
- * Ensure the seq number is zero - otherwise seq.h
- * may get very confused.
- */
- vp.seq = 0;
-
- /*
- * Validate policy.
- */
- if (vm_domain_policy_validate(&vp) != 0) {
- error = EINVAL;
- goto out;
- }
-
- /*
- * Go find the desired proc/tid for this operation.
- */
- error = cpuset_which(uap->which, uap->id, &p,
- &ttd, &set);
- if (error)
- goto out;
-
- /* Only handle CPU_WHICH_TID and CPU_WHICH_PID */
- /*
- * XXX if cpuset_which is called with WHICH_CPUSET and NULL cpuset,
- * it'll return ESRCH. We should just return EINVAL.
- */
- switch (uap->which) {
- case CPU_WHICH_TID:
- vm_domain_policy_copy(&ttd->td_vm_dom_policy, &vp);
- break;
- case CPU_WHICH_PID:
- vm_domain_policy_copy(&p->p_vm_dom_policy, &vp);
- break;
- default:
- error = EINVAL;
- break;
- }
-
- PROC_UNLOCK(p);
-out:
- if (set)
- cpuset_rel(set);
- return (error);
-}
-
-int
-sys_numa_getaffinity(struct thread *td, struct numa_getaffinity_args *uap)
-{
- int error;
- struct vm_domain_policy vp;
- struct thread *ttd;
- struct proc *p;
- struct cpuset *set;
-
- set = NULL;
- p = NULL;
-
- error = cpuset_which(uap->which, uap->id, &p,
- &ttd, &set);
- if (error)
- goto out;
-
- /* Only handle CPU_WHICH_TID and CPU_WHICH_PID */
- /*
- * XXX if cpuset_which is called with WHICH_CPUSET and NULL cpuset,
- * it'll return ESRCH. We should just return EINVAL.
- */
- switch (uap->which) {
- case CPU_WHICH_TID:
- vm_domain_policy_localcopy(&vp, &ttd->td_vm_dom_policy);
- break;
- case CPU_WHICH_PID:
- vm_domain_policy_localcopy(&vp, &p->p_vm_dom_policy);
- break;
- default:
- error = EINVAL;
- break;
- }
- if (p)
- PROC_UNLOCK(p);
- /*
- * Copy out only the vm_domain_policy_entry part.
- */
- if (error == 0)
- error = copyout(&vp.p, uap->policy, sizeof(vp.p));
-out:
- if (set)
- cpuset_rel(set);
- return (error);
-}
diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c
index 28ef5242e437..29523d9a0fa4 100644
--- a/sys/kern/kern_thr.c
+++ b/sys/kern/kern_thr.c
@@ -57,8 +57,6 @@ __FBSDID("$FreeBSD$");
#include <sys/umtx.h>
#include <sys/limits.h>
-#include <vm/vm_domain.h>
-
#include <machine/frame.h>
#include <security/audit/audit.h>
@@ -260,12 +258,6 @@ thread_create(struct thread *td, struct rtprio *rtp,
if (p->p_ptevents & PTRACE_LWP)
newtd->td_dbgflags |= TDB_BORN;
- /*
- * Copy the existing thread VM policy into the new thread.
- */
- vm_domain_policy_localcopy(&newtd->td_vm_dom_policy,
- &td->td_vm_dom_policy);
-
PROC_UNLOCK(p);
tidhash_add(newtd);
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index e932dcb9a545..d992f2c242aa 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -64,7 +64,6 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
-#include <vm/vm_domain.h>
#include <sys/eventhandler.h>
/*
@@ -78,13 +77,13 @@ __FBSDID("$FreeBSD$");
* structures.
*/
#ifdef __amd64__
-_Static_assert(offsetof(struct thread, td_flags) == 0xf4,
+_Static_assert(offsetof(struct thread, td_flags) == 0xfc,
"struct thread KBI td_flags");
-_Static_assert(offsetof(struct thread, td_pflags) == 0xfc,
+_Static_assert(offsetof(struct thread, td_pflags) == 0x104,
"struct thread KBI td_pflags");
-_Static_assert(offsetof(struct thread, td_frame) == 0x460,
+_Static_assert(offsetof(struct thread, td_frame) == 0x468,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x508,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x510,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0xb0,
"struct proc KBI p_flag");
@@ -98,13 +97,13 @@ _Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8,
"struct proc KBI p_emuldata");
#endif
#ifdef __i386__
-_Static_assert(offsetof(struct thread, td_flags) == 0x9c,
+_Static_assert(offsetof(struct thread, td_flags) == 0x98,
"struct thread KBI td_flags");
-_Static_assert(offsetof(struct thread, td_pflags) == 0xa4,
+_Static_assert(offsetof(struct thread, td_pflags) == 0xa0,
"struct thread KBI td_pflags");
-_Static_assert(offsetof(struct thread, td_frame) == 0x2ec,
+_Static_assert(offsetof(struct thread, td_frame) == 0x2e4,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x338,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x330,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0x68,
"struct proc KBI p_flag");
@@ -413,7 +412,6 @@ thread_alloc(int pages)
return (NULL);
}
cpu_thread_alloc(td);
- vm_domain_policy_init(&td->td_vm_dom_policy);
return (td);
}
@@ -443,7 +441,6 @@ thread_free(struct thread *td)
cpu_thread_free(td);
if (td->td_kstack != 0)
vm_thread_dispose(td);
- vm_domain_policy_cleanup(&td->td_vm_dom_policy);
callout_drain(&td->td_slpcallout);
uma_zfree(thread_zone, td);
}
diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh
index 7ea37de6e53c..22b6e6a57de2 100644
--- a/sys/kern/makesyscalls.sh
+++ b/sys/kern/makesyscalls.sh
@@ -139,6 +139,7 @@ sed -e '
printf "#include <sys/signal.h>\n" > sysarg
printf "#include <sys/acl.h>\n" > sysarg
printf "#include <sys/cpuset.h>\n" > sysarg
+ printf "#include <sys/domainset.h>\n" > sysarg
printf "#include <sys/_ffcounter.h>\n" > sysarg
printf "#include <sys/_semaphore.h>\n" > sysarg
printf "#include <sys/ucontext.h>\n" > sysarg
diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
index f6b124d865cb..740a5f6626da 100644
--- a/sys/kern/sched_4bsd.c
+++ b/sys/kern/sched_4bsd.c
@@ -781,6 +781,7 @@ sched_fork_thread(struct thread *td, struct thread *childtd)
childtd->td_lastcpu = NOCPU;
childtd->td_lock = &sched_lock;
childtd->td_cpuset = cpuset_ref(td->td_cpuset);
+ childtd->td_domain.dr_policy = td->td_cpuset->cs_domain;
childtd->td_priority = childtd->td_base_pri;
ts = td_get_sched(childtd);
bzero(ts, sizeof(*ts));
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index e1d2b5f34b60..cf861366fbd9 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -2131,6 +2131,7 @@ sched_fork_thread(struct thread *td, struct thread *child)
child->td_lastcpu = NOCPU;
child->td_lock = TDQ_LOCKPTR(tdq);
child->td_cpuset = cpuset_ref(td->td_cpuset);
+ child->td_domain.dr_policy = td->td_cpuset->cs_domain;
ts2->ts_cpu = ts->ts_cpu;
ts2->ts_flags = 0;
/*
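
[Editorial note] The sched_4bsd and sched_ule hunks establish the same invariant: a forked thread inherits not just a reference to its parent's cpuset but a policy pointer into that set. As a standalone sketch (the helper name thread_domain_sync is illustrative, not part of the commit):

	/*
	 * A thread's effective NUMA policy is always the domainset of
	 * the cpuset it is attached to.
	 */
	static inline void
	thread_domain_sync(struct thread *td)
	{
		td->td_domain.dr_policy = td->td_cpuset->cs_domain;
	}
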
diff --git a/sys/kern/subr_kdb.c b/sys/kern/subr_kdb.c
index 122477e9cc66..a41360a06121 100644
--- a/sys/kern/subr_kdb.c
+++ b/sys/kern/subr_kdb.c
@@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$");
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
+#include <sys/lock.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index b3785c6c7432..ade8b9cb58a0 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -997,12 +997,8 @@
547 AUE_FUTIMESAT STD { int utimensat(int fd, \
char *path, \
struct timespec *times, int flag); }
-548 AUE_NULL STD { int numa_getaffinity(cpuwhich_t which, \
- id_t id, \
- struct vm_domain_policy_entry *policy); }
-549 AUE_NULL STD { int numa_setaffinity(cpuwhich_t which, \
- id_t id, const struct \
- vm_domain_policy_entry *policy); }
+548 AUE_NULL UNIMPL numa_getaffinity
+549 AUE_NULL UNIMPL numa_setaffinity
550 AUE_FSYNC STD { int fdatasync(int fd); }
551 AUE_FSTAT STD { int fstat(int fd, struct stat *sb); }
552 AUE_FSTATAT STD { int fstatat(int fd, char *path, \
@@ -1023,6 +1019,14 @@
struct kevent *changelist, int nchanges, \
struct kevent *eventlist, int nevents, \
const struct timespec *timeout); }
+561 AUE_NULL STD { int cpuset_getdomain(cpulevel_t level, \
+ cpuwhich_t which, id_t id, \
+ size_t domainsetsize, domainset_t *mask, \
+ int *policy); }
+562 AUE_NULL STD { int cpuset_setdomain(cpulevel_t level, \
+ cpuwhich_t which, id_t id, \
+ size_t domainsetsize, domainset_t *mask, \
+ int policy); }
; Please copy any additions and changes to the following compatibility tables:
; sys/compat/freebsd32/syscalls.master
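
[Editorial note] Together with the removal of numa_getaffinity()/numa_setaffinity() above, these entries move NUMA policy control onto the cpuset calling convention: a domainset_t bitmask plus an integer policy. A minimal userland sketch, assuming the DOMAINSET_* macros and the DOMAINSET_POLICY_ROUNDROBIN constant from sys/domainset.h (error handling abbreviated):

	#include <sys/param.h>
	#include <sys/cpuset.h>
	#include <sys/domainset.h>
	#include <err.h>

	int
	main(void)
	{
		domainset_t mask;
		int policy;

		/* Read back the current process-level mask and policy. */
		if (cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
		    sizeof(mask), &mask, &policy) != 0)
			err(1, "cpuset_getdomain");

		/* Restrict this process's future allocations to domain 0. */
		DOMAINSET_ZERO(&mask);
		DOMAINSET_SET(0, &mask);
		if (cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
		    sizeof(mask), &mask, DOMAINSET_POLICY_ROUNDROBIN) != 0)
			err(1, "cpuset_setdomain");
		return (0);
	}
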
diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel.c b/sys/netpfil/ipfw/dn_sched_fq_codel.c
index cc5dc5a17203..44610aaf9740 100644
--- a/sys/netpfil/ipfw/dn_sched_fq_codel.c
+++ b/sys/netpfil/ipfw/dn_sched_fq_codel.c
@@ -44,6 +44,7 @@
#include <netinet/ip_fw.h> /* flow_id */
#include <netinet/ip_dummynet.h>
+#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
diff --git a/sys/sys/_vm_domain.h b/sys/sys/_vm_domain.h
deleted file mode 100644
index c34d737c9f6a..000000000000
--- a/sys/sys/_vm_domain.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer,
- * without modification.
- * 2. Redistributions in binary form must reproduce at minimum a disclaimer
- * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
- * redistribution must be conditioned upon including a substantially
- * similar Disclaimer requirement for further binary redistribution.
- *
- * NO WARRANTY
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
- * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
- * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGES.
- *
- * $FreeBSD$
- */
-#ifndef __SYS_VM_DOMAIN_H__
-#define __SYS_VM_DOMAIN_H__
-
-#include <sys/seq.h>
-
-typedef enum {
- VM_POLICY_NONE,
- VM_POLICY_ROUND_ROBIN,
- VM_POLICY_FIXED_DOMAIN,
- VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN,
- VM_POLICY_FIRST_TOUCH,
- VM_POLICY_FIRST_TOUCH_ROUND_ROBIN,
- VM_POLICY_MAX
-} vm_domain_policy_type_t;
-
-struct vm_domain_policy_entry {
- vm_domain_policy_type_t policy;
- int domain;
-};
-
-struct vm_domain_policy {
- seq_t seq;
- struct vm_domain_policy_entry p;
-};
-
-#define VM_DOMAIN_POLICY_STATIC_INITIALISER(vt, vd) \
- { .seq = 0, \
- .p.policy = vt, \
- .p.domain = vd }
-
-#endif /* __SYS_VM_DOMAIN_H__ */
diff --git a/sys/sys/cpuset.h b/sys/sys/cpuset.h
index 6ae989bbbe12..727209be76bf 100644
--- a/sys/sys/cpuset.h
+++ b/sys/sys/cpuset.h
@@ -112,6 +112,7 @@ LIST_HEAD(setlist, cpuset);
*/
struct cpuset {
cpuset_t cs_mask; /* bitmask of valid cpus. */
+ struct domainset *cs_domain; /* (c) NUMA policy. */
volatile u_int cs_ref; /* (a) Reference count. */
int cs_flags; /* (s) Flags from below. */
cpusetid_t cs_id; /* (s) Id or INVALID. */
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index b5f2e7719f4e..9bc75db8591a 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -62,11 +62,18 @@
#include <sys/time.h> /* For structs itimerval, timeval. */
#else
#include <sys/pcpu.h>
+#include <sys/systm.h>
#endif
#include <sys/ucontext.h>
#include <sys/ucred.h>
-#include <sys/_vm_domain.h>
+#include <sys/types.h>
+#include <sys/domainset.h>
+
#include <machine/proc.h> /* Machine-dependent proc substruct. */
+#ifdef _KERNEL
+#include <machine/cpu.h>
+#endif
+
/*
* One structure allocated per session.
@@ -179,6 +186,7 @@ struct procdesc;
struct racct;
struct sbuf;
struct sleepqueue;
+struct socket;
struct syscall_args;
struct td_sched;
struct thread;
@@ -222,12 +230,12 @@ struct thread {
TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */
LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */
struct cpuset *td_cpuset; /* (t) CPU affinity mask. */
+ struct domainset_ref td_domain; /* (a) NUMA policy */
struct seltd *td_sel; /* Select queue/channel. */
struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */
struct turnstile *td_turnstile; /* (k) Associated turnstile. */
struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */
struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */
- struct vm_domain_policy td_vm_dom_policy; /* (c) current numa domain policy */
lwpid_t td_tid; /* (b) Thread ID. */
sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */
#define td_siglist td_sigqueue.sq_signals
@@ -286,7 +294,6 @@ struct thread {
pid_t td_dbg_forked; /* (c) Child pid for debugger. */
u_int td_vp_reserv; /* (k) Count of reserved vnodes. */
int td_no_sleeping; /* (k) Sleeping disabled count. */
- int td_dom_rr_idx; /* (k) RR Numa domain selection. */
void *td_su; /* (k) FFS SU private */
sbintime_t td_sleeptimo; /* (t) Sleep timeout. */
int td_rtcgen; /* (s) rtc_generation of abs. sleep */
@@ -655,7 +662,6 @@ struct proc {
uint64_t p_prev_runtime; /* (c) Resource usage accounting. */
struct racct *p_racct; /* (b) Resource accounting. */
int p_throttled; /* (c) Flag for racct pcpu throttling */
- struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */
/*
 * An orphan is the child that has been re-parented to the
* debugger as a result of attaching to it. Need to keep
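
[Editorial note] td_domain replaces the by-value vm_domain_policy with a reference to a shared domainset. The reference type, declared in sys/sys/_domainset.h, is approximately the following (field comments paraphrased, not verbatim):

	struct domainset;
	struct domainset_ref {
		struct domainset * volatile dr_policy;	/* Shared NUMA policy. */
		int		dr_iterator;		/* Per-thread round-robin cursor. */
	};
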
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
index d95b2dd7d4bf..a0f8fc0be384 100644
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -36,6 +36,7 @@
#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/_cpuset.h>
+#include <sys/_domainset.h>
struct file;
struct filecaps;
@@ -96,6 +97,12 @@ int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level,
int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level,
cpuwhich_t which, id_t id, size_t cpusetsize,
const cpuset_t *maskp);
+int kern_cpuset_getdomain(struct thread *td, cpulevel_t level,
+ cpuwhich_t which, id_t id, size_t domainsetsize,
+ domainset_t *maskp, int *policyp);
+int kern_cpuset_setdomain(struct thread *td, cpulevel_t level,
+ cpuwhich_t which, id_t id, size_t domainsetsize,
+ const domainset_t *maskp, int policy);
int kern_cpuset_getid(struct thread *td, cpulevel_t level,
cpuwhich_t which, id_t id, cpusetid_t *setid);
int kern_cpuset_setid(struct thread *td, cpuwhich_t which,
diff --git a/sys/vm/vm_domain.c b/sys/vm/vm_domain.c
deleted file mode 100644
index 9fe44168cfad..000000000000
--- a/sys/vm/vm_domain.c
+++ /dev/null
@@ -1,514 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer,
- * without modification.
- * 2. Redistributions in binary form must reproduce at minimum a disclaimer
- * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
- * redistribution must be conditioned upon including a substantially
- * similar Disclaimer requirement for further binary redistribution.
- *
- * NO WARRANTY
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
- * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
- * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGES.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include "opt_vm.h"
-#include "opt_ddb.h"
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/lock.h>
-#include <sys/kernel.h>
-#include <sys/malloc.h>
-#include <sys/mutex.h>
-#ifdef VM_NUMA_ALLOC
-#include <sys/proc.h>
-#endif
-#include <sys/queue.h>
-#include <sys/rwlock.h>
-#include <sys/sbuf.h>
-#include <sys/sysctl.h>
-#include <sys/tree.h>
-#include <sys/vmmeter.h>
-#include <sys/seq.h>
-
-#include <ddb/ddb.h>
-
-#include <vm/vm.h>
-#include <vm/vm_param.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_object.h>
-#include <vm/vm_page.h>
-#include <vm/vm_phys.h>
-
-#include <vm/vm_domain.h>
-
-/*
- * Default to first-touch + round-robin.
- */
-static struct mtx vm_default_policy_mtx;
-MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex",
- MTX_DEF);
-#ifdef VM_NUMA_ALLOC
-static struct vm_domain_policy vm_default_policy =
- VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
-#else
-/* Use round-robin so the domain policy code will only try once per allocation */
-static struct vm_domain_policy vm_default_policy =
- VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0);
-#endif
-
-static int
-sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS)
-{
- char policy_name[32];
- int error;
-
- mtx_lock(&vm_default_policy_mtx);
-
- /* Map policy to output string */
- switch (vm_default_policy.p.policy) {
- case VM_POLICY_FIRST_TOUCH:
- strcpy(policy_name, "first-touch");
- break;
- case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
- strcpy(policy_name, "first-touch-rr");
- break;
- case VM_POLICY_ROUND_ROBIN:
- default:
- strcpy(policy_name, "rr");
- break;
- }
- mtx_unlock(&vm_default_policy_mtx);
-
- error = sysctl_handle_string(oidp, &policy_name[0],
- sizeof(policy_name), req);
- if (error != 0 || req->newptr == NULL)
- return (error);
-
- mtx_lock(&vm_default_policy_mtx);
- /* Set: match on the subset of policies that make sense as a default */
- if (strcmp("first-touch-rr", policy_name) == 0) {
- vm_domain_policy_set(&vm_default_policy,
- VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
- } else if (strcmp("first-touch", policy_name) == 0) {
- vm_domain_policy_set(&vm_default_policy,
- VM_POLICY_FIRST_TOUCH, 0);
- } else if (strcmp("rr", policy_name) == 0) {
- vm_domain_policy_set(&vm_default_policy,
- VM_POLICY_ROUND_ROBIN, 0);
- } else {
- error = EINVAL;
- goto finish;
- }
-
- error = 0;
-finish:
- mtx_unlock(&vm_default_policy_mtx);
- return (error);
-}
-
-SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW,
- 0, 0, sysctl_vm_default_policy, "A",
- "Default policy (rr, first-touch, first-touch-rr");
-
-/*
- * Initialise a VM domain iterator.
- *
- * Check the thread policy, then the proc policy,
- * then default to the system policy.
- */
-void
-vm_policy_iterator_init(struct vm_domain_iterator *vi)
-{
-#ifdef VM_NUMA_ALLOC
- struct vm_domain_policy lcl;
-#endif
-
- vm_domain_iterator_init(vi);
-
-#ifdef VM_NUMA_ALLOC
- /* Copy out the thread policy */
- vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy);
- if (lcl.p.policy != VM_POLICY_NONE) {
- /* Thread policy is present; use it */
- vm_domain_iterator_set_policy(vi, &lcl);
- return;
- }
-
- vm_domain_policy_localcopy(&lcl,
- &curthread->td_proc->p_vm_dom_policy);
- if (lcl.p.policy != VM_POLICY_NONE) {
- /* Process policy is present; use it */
- vm_domain_iterator_set_policy(vi, &lcl);
- return;
- }
-#endif
- /* Use system default policy */
- vm_domain_iterator_set_policy(vi, &vm_default_policy);
-}
-
-void
-vm_policy_iterator_finish(struct vm_domain_iterator *vi)
-{
-
- vm_domain_iterator_cleanup(vi);
-}
-
-#ifdef VM_NUMA_ALLOC
-static __inline int
-vm_domain_rr_selectdomain(int skip_domain)
-{
- struct thread *td;
-
- td = curthread;
-
- td->td_dom_rr_idx++;
- td->td_dom_rr_idx %= vm_ndomains;
-
- /*
- * If skip_domain is provided then skip over that
- * domain. This is intended for round robin variants
- * which first try a fixed domain.
- */
- if ((skip_domain > -1) && (td->td_dom_rr_idx == skip_domain)) {
- td->td_dom_rr_idx++;
- td->td_dom_rr_idx %= vm_ndomains;
- }
- return (td->td_dom_rr_idx);
-}
-#endif
-
-/*
- * This implements a very simple set of VM domain memory allocation
- * policies and iterators.
- */
-
-/*
- * A VM domain policy describes the desired domain selection behaviour.
- * Iterators implement searching through VM domains in a specific
- * order.
- */
-
-/*
- * When setting a policy, the caller must establish their own
- * exclusive write protection for the contents of the domain
- * policy.
- */
-int
-vm_domain_policy_init(struct vm_domain_policy *vp)
-{
-
- bzero(vp, sizeof(*vp));
- vp->p.policy = VM_POLICY_NONE;
- vp->p.domain = -1;
- return (0);
-}
-
-int
-vm_domain_policy_set(struct vm_domain_policy *vp,
- vm_domain_policy_type_t vt, int domain)
-{
-
- seq_write_begin(&vp->seq);
- vp->p.policy = vt;
- vp->p.domain = domain;
- seq_write_end(&vp->seq);
- return (0);
-}
-
-/*
- * Take a local copy of a policy.
- *
- * The destination policy isn't write-barriered; this is used
- * for doing local copies into something that isn't shared.
- */
-void
-vm_domain_policy_localcopy(struct vm_domain_policy *dst,
- const struct vm_domain_policy *src)
-{
- seq_t seq;
-
- for (;;) {
- seq = seq_read(&src->seq);
- *dst = *src;
- if (seq_consistent(&src->seq, seq))
- return;
- }
-}
-
-/*
- * Take a write-barrier copy of a policy.
- *
- * The destination policy is write-barriered; this is used
- * for doing copies into policies that may be read by other
- * threads.
- */
-void
-vm_domain_policy_copy(struct vm_domain_policy *dst,
- const struct vm_domain_policy *src)
-{
- seq_t seq;
- struct vm_domain_policy d;
-
- for (;;) {
- seq = seq_read(&src->seq);
- d = *src;
- if (seq_consistent(&src->seq, seq)) {
- seq_write_begin(&dst->seq);
- dst->p.domain = d.p.domain;
- dst->p.policy = d.p.policy;
- seq_write_end(&dst->seq);
- return;
- }
- }
-}
-
-int
-vm_domain_policy_validate(const struct vm_domain_policy *vp)
-{
-
- switch (vp->p.policy) {
- case VM_POLICY_NONE:
- case VM_POLICY_ROUND_ROBIN:
- case VM_POLICY_FIRST_TOUCH:
- case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
- if (vp->p.domain == -1)
- return (0);
- return (-1);
- case VM_POLICY_FIXED_DOMAIN:
- case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
-#ifdef VM_NUMA_ALLOC
- if (vp->p.domain >= 0 && vp->p.domain < vm_ndomains)
- return (0);
-#else
- if (vp->p.domain == 0)
- return (0);
-#endif
- return (-1);
- default:
- return (-1);
- }
- return (-1);
-}
-
-int
-vm_domain_policy_cleanup(struct vm_domain_policy *vp)
-{
-
- /* For now, empty */
- return (0);
-}
-
-int
-vm_domain_iterator_init(struct vm_domain_iterator *vi)
-{
-
- /* Nothing to do for now */
- return (0);
-}
-
-/*
- * Manually setup an iterator with the given details.
- */
-int
-vm_domain_iterator_set(struct vm_domain_iterator *vi,
- vm_domain_policy_type_t vt, int domain)
-{
-
-#ifdef VM_NUMA_ALLOC
- switch (vt) {
- case VM_POLICY_FIXED_DOMAIN:
- vi->policy = VM_POLICY_FIXED_DOMAIN;
- vi->domain = domain;
- vi->n = 1;
- break;
- case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
- vi->policy = VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN;
- vi->domain = domain;
- vi->n = vm_ndomains;
- break;
- case VM_POLICY_FIRST_TOUCH:
- vi->policy = VM_POLICY_FIRST_TOUCH;
- vi->domain = PCPU_GET(domain);
- vi->n = 1;
- break;
- case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
- vi->policy = VM_POLICY_FIRST_TOUCH_ROUND_ROBIN;
- vi->domain = PCPU_GET(domain);
- vi->n = vm_ndomains;
- break;
- case VM_POLICY_ROUND_ROBIN:
- default:
- vi->policy = VM_POLICY_ROUND_ROBIN;
- vi->domain = -1;
- vi->n = vm_ndomains;
- break;
- }
-#else
- vi->domain = 0;
- vi->n = 1;
-#endif
- return (0);
-}
-
-/*
- * Setup an iterator based on the given policy.
- */
-static inline void
-_vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
- const struct vm_domain_policy *vt)
-{
-
-#ifdef VM_NUMA_ALLOC
- /*
- * Initialise the iterator.
- *
- * For first-touch, the initial domain is set
- * via the current thread CPU domain.
- *
- * For fixed-domain, it's assumed that the
- * caller has initialised the specific domain
- * it is after.
- */
- switch (vt->p.policy) {
- case VM_POLICY_FIXED_DOMAIN:
- vi->policy = vt->p.policy;
- vi->domain = vt->p.domain;
- vi->n = 1;
- break;
- case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
- vi->policy = vt->p.policy;
- vi->domain = vt->p.domain;
- vi->n = vm_ndomains;
- break;
- case VM_POLICY_FIRST_TOUCH:
- vi->policy = vt->p.policy;
- vi->domain = PCPU_GET(domain);
- vi->n = 1;
- break;
- case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
- vi->policy = vt->p.policy;
- vi->domain = PCPU_GET(domain);
- vi->n = vm_ndomains;
- break;
- case VM_POLICY_ROUND_ROBIN:
- default:
- /*
- * Default to round-robin policy.
- */
- vi->policy = VM_POLICY_ROUND_ROBIN;
- vi->domain = -1;
- vi->n = vm_ndomains;
- break;
- }
-#else
- vi->domain = 0;
- vi->n = 1;
-#endif
-}
-
-void
-vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
- const struct vm_domain_policy *vt)
-{
- seq_t seq;
- struct vm_domain_policy vt_lcl;
-
- for (;;) {
- seq = seq_read(&vt->seq);
- vt_lcl = *vt;
- if (seq_consistent(&vt->seq, seq)) {
- _vm_domain_iterator_set_policy(vi, &vt_lcl);
- return;
- }
- }
-}
-
-/*
- * Return the next VM domain to use.
- *
- * Returns 0 w/ domain set to the next domain to use, or
- * -1 to indicate no more domains are available.
- */
-int
-vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain)
-{
-
- /* General catch-all */
- if (vi->n <= 0)
- return (-1);
-
-#ifdef VM_NUMA_ALLOC
- switch (vi->policy) {
- case VM_POLICY_FIXED_DOMAIN:
- case VM_POLICY_FIRST_TOUCH:
- *domain = vi->domain;
- vi->n--;
- break;
- case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
- case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
- /*
- * XXX TODO: skip over the rr'ed domain
- * if it equals the one we started with.
- */
- if (vi->n == vm_ndomains)
- *domain = vi->domain;
- else
- *domain = vm_domain_rr_selectdomain(vi->domain);
- vi->n--;
- break;
- case VM_POLICY_ROUND_ROBIN:
- default:
- *domain = vm_domain_rr_selectdomain(-1);
- vi->n--;
- break;
- }
-#else
- *domain = 0;
- vi->n--;
-#endif
-
- return (0);
-}
-
-/*
- * Returns 1 if the iteration is done, or 0 if it is not.
- *
- * This can only be called after at least one loop through
- * the iterator.  I.e., it's designed to be used as a tail
- * check of a loop, not the head check of a loop.
- */
-int
-vm_domain_iterator_isdone(struct vm_domain_iterator *vi)
-{
-
- return (vi->n <= 0);
-}
-
-int
-vm_domain_iterator_cleanup(struct vm_domain_iterator *vi)
-{
-
- return (0);
-}
diff --git a/sys/vm/vm_domain.h b/sys/vm/vm_domain.h
deleted file mode 100644
index 3b99c43c9101..000000000000
--- a/sys/vm/vm_domain.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer,
- * without modification.
- * 2. Redistributions in binary form must reproduce at minimum a disclaimer
- * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
- * redistribution must be conditioned upon including a substantially
- * similar Disclaimer requirement for further binary redistribution.
- *
- * NO WARRANTY
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
- * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
- * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGES.
- *
- * $FreeBSD$
- */
-#ifndef __VM_DOMAIN_H__
-#define __VM_DOMAIN_H__
-
-#include <sys/_vm_domain.h>
-
-struct vm_domain_iterator {
- vm_domain_policy_type_t policy;
- int domain;
- int n;
-};
-
-/*
- * TODO: check to see if these should just become inline functions
- * at some point.
- */
-extern int vm_domain_policy_init(struct vm_domain_policy *vp);
-extern int vm_domain_policy_set(struct vm_domain_policy *vp,
- vm_domain_policy_type_t vt, int domain);
-extern int vm_domain_policy_cleanup(struct vm_domain_policy *vp);
-extern void vm_domain_policy_localcopy(struct vm_domain_policy *dst,
- const struct vm_domain_policy *src);
-extern void vm_domain_policy_copy(struct vm_domain_policy *dst,
- const struct vm_domain_policy *src);
-extern int vm_domain_policy_validate(const struct vm_domain_policy *vp);
-
-extern int vm_domain_iterator_init(struct vm_domain_iterator *vi);
-extern int vm_domain_iterator_set(struct vm_domain_iterator *vi,
- vm_domain_policy_type_t vt, int domain);
-extern void vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
- const struct vm_domain_policy *vt);
-extern int vm_domain_iterator_run(struct vm_domain_iterator *vi,
- int *domain);
-extern int vm_domain_iterator_isdone(struct vm_domain_iterator *vi);
-extern int vm_domain_iterator_cleanup(struct vm_domain_iterator *vi);
-
-extern void vm_policy_iterator_init(struct vm_domain_iterator *vi);
-extern void vm_policy_iterator_finish(struct vm_domain_iterator *vi);
-
-#endif /* __VM_DOMAIN_H__ */
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index ece496407c2c..83e12a588ee7 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -1589,6 +1589,7 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
KASSERT(upgrade || dst_entry->object.vm_object == NULL,
("vm_fault_copy_entry: vm_object not NULL"));
if (src_object != dst_object) {
+ dst_object->domain = src_object->domain;
dst_entry->object.vm_object = dst_object;
dst_entry->offset = 0;
dst_object->charge = dst_entry->end - dst_entry->start;
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 01be7db38a99..a3495f9861b0 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -71,6 +71,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/mount.h>
@@ -1364,6 +1365,7 @@ vm_object_shadow(
result->backing_object_offset = *offset;
if (source != NULL) {
VM_OBJECT_WLOCK(source);
+ result->domain = source->domain;
LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
source->shadow_count++;
#if VM_NRESERVLEVEL > 0
@@ -1419,6 +1421,7 @@ vm_object_split(vm_map_entry_t entry)
*/
VM_OBJECT_WLOCK(new_object);
VM_OBJECT_WLOCK(orig_object);
+ new_object->domain = orig_object->domain;
source = orig_object->backing_object;
if (source != NULL) {
VM_OBJECT_WLOCK(source);
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 1edf2d59868d..1e3744ffe24f 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -74,6 +74,7 @@
#include <sys/_mutex.h>
#include <sys/_pctrie.h>
#include <sys/_rwlock.h>
+#include <sys/_domainset.h>
#include <vm/_vm_radix.h>
@@ -102,6 +103,7 @@ struct vm_object {
struct pglist memq; /* list of resident pages */
struct vm_radix rtree; /* root of the resident page radix trie*/
vm_pindex_t size; /* Object size */
+ struct domainset_ref domain; /* NUMA policy. */
int generation; /* generation ID */
int ref_count; /* How many refs?? */
int shadow_count; /* how many objects that this is a shadow for */
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 18a795ec0de8..6635d17a1ef9 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -91,6 +91,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
+#include <sys/domainset.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/linker.h>
@@ -109,7 +110,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
-#include <vm/vm_domain.h>
+#include <vm/vm_domainset.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
@@ -742,6 +743,12 @@ vm_page_startup(vm_offset_t vaddr)
*/
vm_reserv_init();
#endif
+ /*
+ * Set an initial domain policy for thread0 so that allocations
+ * can work.
+ */
+ domainset_zero();
+
return (vaddr);
}
@@ -1622,23 +1629,17 @@ vm_page_t
vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex,
int req, vm_page_t mpred)
{
- struct vm_domain_iterator vi;
+ struct vm_domainset_iter di;
vm_page_t m;
- int domain, wait;
+ int domain;
- m = NULL;
- vm_policy_iterator_init(&vi);
- wait = req & (VM_ALLOC_WAITFAIL | VM_ALLOC_WAITOK);
- req &= ~wait;
- while (vm_domain_iterator_run(&vi, &domain) == 0) {
- if (vm_domain_iterator_isdone(&vi))
- req |= wait;
+ vm_domainset_iter_page_init(&di, object, &domain, &req);
+ do {
m = vm_page_alloc_domain_after(object, pindex, domain, req,
mpred);
if (m != NULL)
break;
- }
- vm_policy_iterator_finish(&vi);
+ } while (vm_domainset_iter_page(&di, &domain, &req) == 0);
return (m);
}
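
[Editorial note] The wait-flag juggling each caller previously open-coded (strip VM_ALLOC_WAITOK/VM_ALLOC_WAITFAIL, then restore them for the final domain) moves behind the domainset iterator, which is passed &req so it can adjust the flags itself. All three allocator wrappers in this file collapse to the same consumer pattern; condensed into a sketch (alloc_one_domain is a placeholder for the per-domain allocator, e.g. vm_page_alloc_domain_after):

	struct vm_domainset_iter di;
	vm_page_t m;
	int domain;

	/* Seed the iterator from the object's NUMA policy. */
	vm_domainset_iter_page_init(&di, object, &domain, &req);
	do {
		/* Try only the currently selected domain. */
		m = alloc_one_domain(object, domain, req);
		if (m != NULL)
			break;
	} while (vm_domainset_iter_page(&di, &domain, &req) == 0);
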
@@ -1835,23 +1836,17 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
vm_paddr_t boundary, vm_memattr_t memattr)
{
- struct vm_domain_iterator vi;
+ struct vm_domainset_iter di;
vm_page_t m;
- int domain, wait;
+ int domain;
- m = NULL;
- vm_policy_iterator_init(&vi);
- wait = req & (VM_ALLOC_WAITFAIL | VM_ALLOC_WAITOK);
- req &= ~wait;
- while (vm_domain_iterator_run(&vi, &domain) == 0) {
- if (vm_domain_iterator_isdone(&vi))
- req |= wait;
+ vm_domainset_iter_page_init(&di, object, &domain, &req);
+ do {
m = vm_page_alloc_contig_domain(object, pindex, domain, req,
npages, low, high, alignment, boundary, memattr);
if (m != NULL)
break;
- }
- vm_policy_iterator_finish(&vi);
+ } while (vm_domainset_iter_page(&di, &domain, &req) == 0);
return (m);
}
@@ -2045,22 +2040,16 @@ vm_page_alloc_check(vm_page_t m)
vm_page_t
vm_page_alloc_freelist(int freelist, int req)
{
- struct vm_domain_iterator vi;
+ struct vm_domainset_iter di;
vm_page_t m;
- int domain, wait;
+ int domain;
- m = NULL;
- vm_policy_iterator_init(&vi);
- wait = req & (VM_ALLOC_WAITFAIL | VM_ALLOC_WAITOK);
- req &= ~wait;
- while (vm_domain_iterator_run(&vi, &domain) == 0) {
- if (vm_domain_iterator_isdone(&vi))
- req |= wait;
+ vm_domainset_iter_page_init(&di, kernel_object, &domain, &req);
+ do {
m = vm_page_alloc_freelist_domain(domain, freelist, req);
if (m != NULL)
break;
- }
- vm_policy_iterator_finish(&vi);
+ } while (vm_domainset_iter_page(&di, &domain, &req) == 0);
return (m);
}
@@ -2562,8 +2551,8 @@ CTASSERT(powerof2(NRUNS));
* must be a power of two.
*/
bool
-vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
- u_long alignment, vm_paddr_t boundary)
+vm_page_reclaim_contig_domain(int domain, int req, u_long npages,
+ vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
vm_paddr_t curr_low;
vm_page_t m_run, m_runs[NRUNS];
@@ -2603,8 +2592,8 @@ vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
curr_low = low;
count = 0;
for (;;) {
- m_run = vm_phys_scan_contig(npages, curr_low, high,
- alignment, boundary, options);
+ m_run = vm_phys_scan_contig(domain, npages, curr_low,
+ high, alignment, boundary, options);
if (m_run == NULL)
break;
curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages);
@@ -2645,6 +2634,26 @@ vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
}
}
+bool
+vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
+ u_long alignment, vm_paddr_t boundary)
+{
+ struct vm_domainset_iter di;
+ int domain;
+ bool ret;
+
+ vm_domainset_iter_page_init(&di, kernel_object, &domain, &req);
+ do {
+ ret = vm_page_reclaim_contig_domain(domain, req, npages, low,
+ high, alignment, boundary);
+ if (ret)
+ break;
+ } while (vm_domainset_iter_page(&di, &domain, &req) == 0);
+
+ return (ret);
+}
+
+
/*
* vm_wait: (also see VM_WAIT macro)
*
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 70aa20c056d2..2b95b6209d22 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -229,6 +229,7 @@ struct vm_pagequeue {
struct vm_domain {
struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
+ struct vmem *vmd_kernel_arena;
u_int vmd_page_count;
u_int vmd_free_count;
long vmd_segs; /* bitmask of the segments */
@@ -514,7 +515,7 @@ void vm_page_putfake(vm_page_t m);
void vm_page_readahead_finish(vm_page_t m);
bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low,
vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
-bool vm_page_reclaim_contig_domain(int req, u_long npages, int domain,
+bool vm_page_reclaim_contig_domain(int domain, int req, u_long npages,
vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
void vm_page_reference(vm_page_t m);
void vm_page_remove (vm_page_t);
diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c
index 6c796900fc18..26cfd7a77ae2 100644
--- a/sys/vm/vm_phys.c
+++ b/sys/vm/vm_phys.c
@@ -68,8 +68,6 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
-#include <vm/vm_domain.h>
-
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
"Too many physsegs.");
@@ -973,7 +971,7 @@ vm_phys_free_contig(vm_page_t m, u_long npages)
* be a power of two.
*/
vm_page_t
-vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
u_long alignment, vm_paddr_t boundary, int options)
{
vm_paddr_t pa_end;
@@ -988,6 +986,8 @@ vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
return (NULL);
for (segind = 0; segind < vm_phys_nsegs; segind++) {
seg = &vm_phys_segs[segind];
+ if (seg->domain != domain)
+ continue;
if (seg->start >= high)
break;
if (low >= seg->end)
diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h
index f5524af39cac..e2ec2b2a814b 100644
--- a/sys/vm/vm_phys.h
+++ b/sys/vm/vm_phys.h
@@ -86,8 +86,8 @@ void vm_phys_free_contig(vm_page_t m, u_long npages);
void vm_phys_free_pages(vm_page_t m, int order);
void vm_phys_init(void);
vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
-vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
- u_long alignment, vm_paddr_t boundary, int options);
+vm_page_t vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low,
+ vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options);
void vm_phys_set_pool(int pool, vm_page_t m, int order);
boolean_t vm_phys_unfree_page(vm_page_t m);
int vm_phys_mem_affinity(int f, int t);
diff --git a/sys/x86/acpica/srat.c b/sys/x86/acpica/srat.c
index 1726a76eb291..df568cafcb26 100644
--- a/sys/x86/acpica/srat.c
+++ b/sys/x86/acpica/srat.c
@@ -252,7 +252,8 @@ srat_parse_entry(ACPI_SUBTABLE_HEADER *entry, void *arg)
"enabled" : "disabled");
if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED))
break;
- if (!overlaps_phys_avail(mem->BaseAddress,
+ if (mem->BaseAddress >= cpu_getmaxphyaddr() ||
+ !overlaps_phys_avail(mem->BaseAddress,
mem->BaseAddress + mem->Length)) {
printf("SRAT: Ignoring memory at addr 0x%jx\n",
(uintmax_t)mem->BaseAddress);