author | Jeff Roberson <jeff@FreeBSD.org> | 2018-01-12 22:48:23 +0000
committer | Jeff Roberson <jeff@FreeBSD.org> | 2018-01-12 22:48:23 +0000
commit | 3f289c3fcf39b200550e2702068014cdd801d4da (patch)
tree | 9105f2f717d4e6d5f5dc06d92b4b158e3c831d9e /sys
parent | fe8be58826d91f5b80b50bac161b727bd4ea9846 (diff)
download | src-3f289c3fcf39b200550e2702068014cdd801d4da.tar.gz, src-3f289c3fcf39b200550e2702068014cdd801d4da.zip
Implement 'domainset', a cpuset-based NUMA policy mechanism. This allows
userspace to control NUMA policy administratively and programmatically.
Implement domainset-based iterators in the page layer.
Remove the now-legacy numa_* syscalls.
Clean up some header pollution created by having seq.h in proc.h.
Reviewed by: markj, kib
Discussed with: alc
Tested by: pho
Sponsored by: Netflix, Dell/EMC Isilon
Differential Revision: https://reviews.freebsd.org/D13403
Notes:
svn path=/head/; revision=327895
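
For readers who want to try the new interface: cpuset_getdomain() and cpuset_setdomain() deliberately mirror cpuset_getaffinity(2)/cpuset_setaffinity(2), taking a level/which/id triple plus a domainset_t mask and an integer policy. The following is a minimal userspace sketch, assuming the libc wrappers that front the two syscalls added below and the DOMAINSET_* macros from <sys/domainset.h>; error handling is kept to a minimum.

/*
 * Sketch: ask that the current process prefer NUMA domain 0, then read
 * the policy back. DOMAINSET_POLICY_PREFER requires exactly one domain
 * in the mask (see the check in kern_cpuset_setdomain() below); id -1
 * means "the calling process".
 */
#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	domainset_t mask;
	int policy;

	DOMAINSET_ZERO(&mask);
	DOMAINSET_SET(0, &mask);
	if (cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask, DOMAINSET_POLICY_PREFER) != 0)
		err(1, "cpuset_setdomain");

	if (cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask, &policy) != 0)
		err(1, "cpuset_getdomain");
	printf("policy=%d, first domain=%d\n", policy,
	    DOMAINSET_FFS(&mask) - 1);
	return (0);
}

The same policy can be applied administratively, without code changes, through the NUMA options grown by the cpuset(1) utility as part of this work (e.g. a prefer:0 policy on an existing pid).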
Diffstat (limited to 'sys')
33 files changed, 1130 insertions, 1069 deletions
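
One design point that is easy to miss in the diff below: struct domainset instances are interned. _domainset_create() searches a global list under cpuset_lock for a set with an equal mask/policy/prefer tuple and, on a hit, frees the caller's storage and returns the existing record, so every cpuset and thread with the same policy shares one immutable domainset and pointer comparison suffices (cpuset_update_domain() relies on exactly that). A user-space analogue of the lookup-or-insert pattern, with hypothetical types and no locking, purely for illustration:

#include <sys/queue.h>

#include <stdint.h>
#include <stdlib.h>

/* Stand-ins for ds_mask/ds_policy/ds_prefer; not the kernel layout. */
struct policy {
	LIST_ENTRY(policy) link;
	uint64_t mask;
	int type;
	int prefer;
};

static LIST_HEAD(, policy) policies = LIST_HEAD_INITIALIZER(policies);

static struct policy *
policy_intern(struct policy *key)
{
	struct policy *p;

	LIST_FOREACH(p, &policies, link)
		if (p->mask == key->mask && p->type == key->type &&
		    p->prefer == key->prefer)
			break;
	if (p == NULL) {
		/* First use of this tuple: 'key' becomes the canonical copy. */
		LIST_INSERT_HEAD(&policies, key, link);
		return (key);
	}
	/* Duplicate: discard the caller's storage, share the existing one. */
	free(key);
	return (p);
}

int
main(void)
{
	struct policy *a, *b, *key;

	key = calloc(1, sizeof(*key));
	key->mask = 0x3;
	key->prefer = -1;
	a = policy_intern(key);

	key = calloc(1, sizeof(*key));
	key->mask = 0x3;
	key->prefer = -1;
	b = policy_intern(key);		/* frees the duplicate */

	return (a == b ? 0 : 1);	/* exits 0: both point at one record */
}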
diff --git a/sys/arm/arm/machdep_ptrace.c b/sys/arm/arm/machdep_ptrace.c index 5bd07d28eca7..2a64f89441cc 100644 --- a/sys/arm/arm/machdep_ptrace.c +++ b/sys/arm/arm/machdep_ptrace.c @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/proc.h> #include <sys/ptrace.h> +#include <sys/lock.h> #include <sys/mutex.h> #include <machine/machdep.h> diff --git a/sys/compat/freebsd32/freebsd32_misc.c b/sys/compat/freebsd32/freebsd32_misc.c index 4eaa06e8ec71..4849b80f421f 100644 --- a/sys/compat/freebsd32/freebsd32_misc.c +++ b/sys/compat/freebsd32/freebsd32_misc.c @@ -3017,6 +3017,24 @@ freebsd32_cpuset_setaffinity(struct thread *td, } int +freebsd32_cpuset_getdomain(struct thread *td, + struct freebsd32_cpuset_getdomain_args *uap) +{ + + return (kern_cpuset_getdomain(td, uap->level, uap->which, + PAIR32TO64(id_t,uap->id), uap->domainsetsize, uap->mask, uap->policy)); +} + +int +freebsd32_cpuset_setdomain(struct thread *td, + struct freebsd32_cpuset_setdomain_args *uap) +{ + + return (kern_cpuset_setdomain(td, uap->level, uap->which, + PAIR32TO64(id_t,uap->id), uap->domainsetsize, uap->mask, uap->policy)); +} + +int freebsd32_nmount(struct thread *td, struct freebsd32_nmount_args /* { struct iovec *iovp; diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master index 0c94148ad461..707d80824727 100644 --- a/sys/compat/freebsd32/syscalls.master +++ b/sys/compat/freebsd32/syscalls.master @@ -1086,12 +1086,8 @@ 547 AUE_FUTIMESAT STD { int freebsd32_utimensat(int fd, \ char *path, \ struct timespec *times, int flag); } -548 AUE_NULL NOPROTO { int numa_getaffinity(cpuwhich_t which, \ - id_t id, \ - struct vm_domain_policy *policy); } -549 AUE_NULL NOPROTO { int numa_setaffinity(cpuwhich_t which, \ - id_t id, \ - const struct vm_domain_policy *policy); } +548 AUE_NULL UNIMPL numa_getaffinity +549 AUE_NULL UNIMPL numa_setaffinity 550 AUE_FSYNC NOPROTO { int fdatasync(int fd); } 551 AUE_FSTAT STD { int freebsd32_fstat(int fd, \ struct stat32 *ub); } @@ -1119,4 +1115,13 @@ struct kevent32 *eventlist, \ int nevents, \ const struct timespec32 *timeout); } +561 AUE_NULL STD { int freebsd32_cpuset_getdomain(cpulevel_t level, \ + cpuwhich_t which, uint32_t id1, uint32_t id2, \ + size_t domainsetsize, domainset_t *mask, \ + int *policy); } +562 AUE_NULL STD { int freebsd32_cpuset_setdomain(cpulevel_t level, \ + cpuwhich_t which, uint32_t id1, uint32_t id2, \ + size_t domainsetsize, domainset_t *mask, \ + int policy); } + ; vim: syntax=off diff --git a/sys/conf/files b/sys/conf/files index b723aa52f691..da73f12201ff 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3787,7 +3787,6 @@ kern/kern_module.c standard kern/kern_mtxpool.c standard kern/kern_mutex.c standard kern/kern_ntptime.c standard -kern/kern_numa.c standard kern/kern_osd.c standard kern/kern_physio.c standard kern/kern_pmc.c standard @@ -4837,7 +4836,7 @@ vm/swap_pager.c standard vm/uma_core.c standard vm/uma_dbg.c standard vm/memguard.c optional DEBUG_MEMGUARD -vm/vm_domain.c standard +vm/vm_domainset.c standard vm/vm_fault.c standard vm/vm_glue.c standard vm/vm_init.c standard diff --git a/sys/ddb/db_run.c b/sys/ddb/db_run.c index a55fcea9e632..bf38a4d2f726 100644 --- a/sys/ddb/db_run.c +++ b/sys/ddb/db_run.c @@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/kdb.h> #include <sys/proc.h> +#include <sys/systm.h> #include <machine/kdb.h> #include <machine/pcb.h> diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 261006199120..397bc99452eb 100644 --- 
a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -89,7 +89,6 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_param.h> #include <vm/pmap.h> #include <vm/vm_map.h> -#include <vm/vm_domain.h> #include <sys/copyright.h> #include <ddb/ddb.h> @@ -497,10 +496,7 @@ proc0_init(void *dummy __unused) td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); - vm_domain_policy_init(&td->td_vm_dom_policy); - vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1); - vm_domain_policy_init(&p->p_vm_dom_policy); - vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1); + td->td_domain.dr_policy = td->td_cpuset->cs_domain; prison0_init(); p->p_peers = 0; p->p_leader = p; diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index 58eedb2a058a..700feabaa1f6 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -599,8 +599,8 @@ struct sysent sysent[] = { { AS(ppoll_args), (sy_call_t *)sys_ppoll, AUE_POLL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 545 = ppoll */ { AS(futimens_args), (sy_call_t *)sys_futimens, AUE_FUTIMES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 546 = futimens */ { AS(utimensat_args), (sy_call_t *)sys_utimensat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 547 = utimensat */ - { AS(numa_getaffinity_args), (sy_call_t *)sys_numa_getaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 548 = numa_getaffinity */ - { AS(numa_setaffinity_args), (sy_call_t *)sys_numa_setaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 549 = numa_setaffinity */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 548 = numa_getaffinity */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 549 = numa_setaffinity */ { AS(fdatasync_args), (sy_call_t *)sys_fdatasync, AUE_FSYNC, NULL, 0, 0, 0, SY_THR_STATIC }, /* 550 = fdatasync */ { AS(fstat_args), (sy_call_t *)sys_fstat, AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 551 = fstat */ { AS(fstatat_args), (sy_call_t *)sys_fstatat, AUE_FSTATAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 552 = fstatat */ @@ -612,4 +612,6 @@ struct sysent sysent[] = { { AS(fhstatfs_args), (sy_call_t *)sys_fhstatfs, AUE_FHSTATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 558 = fhstatfs */ { AS(mknodat_args), (sy_call_t *)sys_mknodat, AUE_MKNODAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 559 = mknodat */ { AS(kevent_args), (sy_call_t *)sys_kevent, AUE_KEVENT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 560 = kevent */ + { AS(cpuset_getdomain_args), (sy_call_t *)sys_cpuset_getdomain, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 561 = cpuset_getdomain */ + { AS(cpuset_setdomain_args), (sy_call_t *)sys_cpuset_setdomain, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 562 = cpuset_setdomain */ }; diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c index c913647cda2c..cf51560e8e7f 100644 --- a/sys/kern/kern_cpuset.c +++ b/sys/kern/kern_cpuset.c @@ -51,17 +51,21 @@ __FBSDID("$FreeBSD$"); #include <sys/syscallsubr.h> #include <sys/capsicum.h> #include <sys/cpuset.h> +#include <sys/domainset.h> #include <sys/sx.h> #include <sys/queue.h> #include <sys/libkern.h> #include <sys/limits.h> #include <sys/bus.h> #include <sys/interrupt.h> +#include <sys/vmmeter.h> #include <vm/uma.h> #include <vm/vm.h> +#include <vm/vm_object.h> #include <vm/vm_page.h> #include <vm/vm_param.h> +#include <vm/vm_phys.h> #ifdef DDB #include <ddb/ddb.h> @@ -109,8 +113,10 @@ __FBSDID("$FreeBSD$"); * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). 
*/ static uma_zone_t cpuset_zone; +static uma_zone_t domainset_zone; static struct mtx cpuset_lock; static struct setlist cpuset_ids; +static struct domainlist cpuset_domains; static struct unrhdr *cpuset_unr; static struct cpuset *cpuset_zero, *cpuset_default; @@ -121,6 +127,32 @@ SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD, cpuset_t *cpuset_root; cpuset_t cpuset_domain[MAXMEMDOM]; +static int domainset_valid(const struct domainset *, const struct domainset *); + +/* + * Find the first non-anonymous set starting from 'set'. + */ +static struct cpuset * +cpuset_getbase(struct cpuset *set) +{ + + if (set->cs_id == CPUSET_INVALID) + set = set->cs_parent; + return (set); +} + +/* + * Walks up the tree from 'set' to find the root. + */ +static struct cpuset * +cpuset_getroot(struct cpuset *set) +{ + + while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL) + set = set->cs_parent; + return (set); +} + /* * Acquire a reference to a cpuset, all pointers must be tracked with refs. */ @@ -140,12 +172,7 @@ static struct cpuset * cpuset_refroot(struct cpuset *set) { - for (; set->cs_parent != NULL; set = set->cs_parent) - if (set->cs_flags & CPU_SET_ROOT) - break; - cpuset_ref(set); - - return (set); + return (cpuset_ref(cpuset_getroot(set))); } /* @@ -157,11 +184,7 @@ static struct cpuset * cpuset_refbase(struct cpuset *set) { - if (set->cs_id == CPUSET_INVALID) - set = set->cs_parent; - cpuset_ref(set); - - return (set); + return (cpuset_ref(cpuset_getbase(set))); } /* @@ -257,17 +280,25 @@ cpuset_lookup(cpusetid_t setid, struct thread *td) * will have no valid cpu based on restrictions from the parent. */ static int -_cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask, - cpusetid_t id) +_cpuset_create(struct cpuset *set, struct cpuset *parent, + const cpuset_t *mask, struct domainset *domain, cpusetid_t id) { + if (domain == NULL) + domain = parent->cs_domain; + if (mask == NULL) + mask = &parent->cs_mask; if (!CPU_OVERLAP(&parent->cs_mask, mask)) return (EDEADLK); + /* The domain must be prepared ahead of time. 
*/ + if (!domainset_valid(parent->cs_domain, domain)) + return (EDEADLK); CPU_COPY(mask, &set->cs_mask); LIST_INIT(&set->cs_children); refcount_init(&set->cs_ref, 1); set->cs_flags = 0; mtx_lock_spin(&cpuset_lock); + set->cs_domain = domain; CPU_AND(&set->cs_mask, &parent->cs_mask); set->cs_id = id; set->cs_parent = cpuset_ref(parent); @@ -294,8 +325,8 @@ cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask) id = alloc_unr(cpuset_unr); if (id == -1) return (ENFILE); - *setp = set = uma_zalloc(cpuset_zone, M_WAITOK); - error = _cpuset_create(set, parent, mask, id); + *setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); + error = _cpuset_create(set, parent, mask, NULL, id); if (error == 0) return (0); free_unr(cpuset_unr, id); @@ -304,6 +335,206 @@ cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask) return (error); } +static void +cpuset_freelist_add(struct setlist *list, int count) +{ + struct cpuset *set; + int i; + + for (i = 0; i < count; i++) { + set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK); + LIST_INSERT_HEAD(list, set, cs_link); + } +} + +static void +cpuset_freelist_init(struct setlist *list, int count) +{ + + LIST_INIT(list); + cpuset_freelist_add(list, count); +} + +static void +cpuset_freelist_free(struct setlist *list) +{ + struct cpuset *set; + + while ((set = LIST_FIRST(list)) != NULL) { + LIST_REMOVE(set, cs_link); + uma_zfree(cpuset_zone, set); + } +} + +static void +domainset_freelist_add(struct domainlist *list, int count) +{ + struct domainset *set; + int i; + + for (i = 0; i < count; i++) { + set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK); + LIST_INSERT_HEAD(list, set, ds_link); + } +} + +static void +domainset_freelist_init(struct domainlist *list, int count) +{ + + LIST_INIT(list); + domainset_freelist_add(list, count); +} + +static void +domainset_freelist_free(struct domainlist *list) +{ + struct domainset *set; + + while ((set = LIST_FIRST(list)) != NULL) { + LIST_REMOVE(set, ds_link); + uma_zfree(domainset_zone, set); + } +} + +/* Copy a domainset preserving mask and policy. */ +static void +domainset_copy(const struct domainset *from, struct domainset *to) +{ + + DOMAINSET_COPY(&from->ds_mask, &to->ds_mask); + to->ds_policy = from->ds_policy; + to->ds_prefer = from->ds_prefer; +} + +/* Return 1 if mask and policy are equal, otherwise 0. */ +static int +domainset_equal(const struct domainset *one, const struct domainset *two) +{ + + return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 && + one->ds_policy == two->ds_policy && + one->ds_prefer == two->ds_prefer); +} + +/* Return 1 if child is a valid subset of parent. */ +static int +domainset_valid(const struct domainset *parent, const struct domainset *child) +{ + if (child->ds_policy != DOMAINSET_POLICY_PREFER) + return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask)); + return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask)); +} + +static int +domainset_restrict(const struct domainset *parent, + const struct domainset *child) +{ + if (child->ds_policy != DOMAINSET_POLICY_PREFER) + return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask)); + return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask)); +} + +/* + * Lookup or create a domainset. The key is provided in ds_mask and + * ds_policy. If the domainset does not yet exist the storage in + * 'domain' is used to insert. Otherwise this storage is freed to the + * domainset_zone and the existing domainset is returned. 
+ */ +static struct domainset * +_domainset_create(struct domainset *domain, struct domainlist *freelist) +{ + struct domainset *ndomain; + + mtx_lock_spin(&cpuset_lock); + LIST_FOREACH(ndomain, &cpuset_domains, ds_link) + if (domainset_equal(ndomain, domain)) + break; + /* + * If the domain does not yet exist we insert it and initialize + * various iteration helpers which are not part of the key. + */ + if (ndomain == NULL) { + LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link); + domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask); + domain->ds_max = DOMAINSET_FLS(&domain->ds_mask) + 1; + } + mtx_unlock_spin(&cpuset_lock); + if (ndomain == NULL) + return (domain); + if (freelist != NULL) + LIST_INSERT_HEAD(freelist, domain, ds_link); + else + uma_zfree(domainset_zone, domain); + return (ndomain); + +} + +/* + * Create or lookup a domainset based on the key held in 'domain'. + */ +static struct domainset * +domainset_create(const struct domainset *domain) +{ + struct domainset *ndomain; + + ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO); + domainset_copy(domain, ndomain); + return _domainset_create(ndomain, NULL); +} + +/* + * Update thread domainset pointers. + */ +static void +domainset_notify(void) +{ + struct thread *td; + struct proc *p; + + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_state == PRS_NEW) { + PROC_UNLOCK(p); + continue; + } + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + td->td_domain.dr_policy = td->td_cpuset->cs_domain; + thread_unlock(td); + } + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + kernel_object->domain.dr_policy = cpuset_default->cs_domain; +} + +/* + * Create a new set that is a subset of a parent. + */ +static struct domainset * +domainset_shadow(const struct domainset *pdomain, + const struct domainset *domain, struct domainlist *freelist) +{ + struct domainset *ndomain; + + ndomain = LIST_FIRST(freelist); + LIST_REMOVE(ndomain, ds_link); + + /* + * Initialize the key from the request. + */ + domainset_copy(domain, ndomain); + + /* + * Restrict the key by the parent. + */ + DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask); + + return _domainset_create(ndomain, freelist); +} + /* * Recursively check for errors that would occur from applying mask to * the tree of sets starting at 'set'. Checks for sets that would become @@ -376,10 +607,12 @@ cpuset_modify(struct cpuset *set, cpuset_t *mask) * Verify that we have access to this set of * cpus. */ - root = set->cs_parent; - if (root && !CPU_SUBSET(&root->cs_mask, mask)) - return (EINVAL); + root = cpuset_getroot(set); mtx_lock_spin(&cpuset_lock); + if (root && !CPU_SUBSET(&root->cs_mask, mask)) { + error = EINVAL; + goto out; + } error = cpuset_testupdate(set, mask, 0); if (error) goto out; @@ -392,6 +625,141 @@ out: } /* + * Recursively check for errors that would occur from applying mask to + * the tree of sets starting at 'set'. Checks for sets that would become + * empty as well as RDONLY flags. 
+ */ +static int +cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset, + struct domainset *orig, int *count, int check_mask) +{ + struct cpuset *nset; + struct domainset *domain; + struct domainset newset; + int error; + + mtx_assert(&cpuset_lock, MA_OWNED); + if (set->cs_flags & CPU_SET_RDONLY) + return (EPERM); + domain = set->cs_domain; + domainset_copy(domain, &newset); + if (!domainset_equal(domain, orig)) { + if (!domainset_restrict(domain, dset)) + return (EDEADLK); + DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask); + /* Count the number of domains that are changing. */ + (*count)++; + } + error = 0; + LIST_FOREACH(nset, &set->cs_children, cs_siblings) + if ((error = cpuset_testupdate_domain(nset, &newset, domain, + count, 1)) != 0) + break; + return (error); +} + +/* + * Applies the mask 'mask' without checking for empty sets or permissions. + */ +static void +cpuset_update_domain(struct cpuset *set, struct domainset *domain, + struct domainset *orig, struct domainlist *domains) +{ + struct cpuset *nset; + + mtx_assert(&cpuset_lock, MA_OWNED); + /* + * If this domainset has changed from the parent we must calculate + * a new set. Otherwise it simply inherits from the parent. When + * we inherit from the parent we get a new mask and policy. If the + * set is modified from the parent we keep the policy and only + * update the mask. + */ + if (set->cs_domain != orig) { + orig = set->cs_domain; + set->cs_domain = domainset_shadow(domain, orig, domains); + } else + set->cs_domain = domain; + LIST_FOREACH(nset, &set->cs_children, cs_siblings) + cpuset_update_domain(nset, set->cs_domain, orig, domains); + + return; +} + +/* + * Modify the set 'set' to use a copy the domainset provided. Apply this new + * mask to restrict all children in the tree. Checks for validity before + * applying the changes. + */ +static int +cpuset_modify_domain(struct cpuset *set, struct domainset *domain) +{ + struct domainlist domains; + struct domainset temp; + struct domainset *dset; + struct cpuset *root; + int ndomains, needed; + int error; + + error = priv_check(curthread, PRIV_SCHED_CPUSET); + if (error) + return (error); + /* + * In case we are called from within the jail + * we do not allow modifying the dedicated root + * cpuset of the jail but may still allow to + * change child sets. + */ + if (jailed(curthread->td_ucred) && + set->cs_flags & CPU_SET_ROOT) + return (EPERM); + domainset_freelist_init(&domains, 0); + domain = domainset_create(domain); + ndomains = needed = 0; + do { + if (ndomains < needed) { + domainset_freelist_add(&domains, needed - ndomains); + ndomains = needed; + } + root = cpuset_getroot(set); + mtx_lock_spin(&cpuset_lock); + dset = root->cs_domain; + /* + * Verify that we have access to this set of domains. + */ + if (root && !domainset_valid(dset, domain)) { + error = EINVAL; + goto out; + } + /* + * If applying prefer we keep the current set as the fallback. + */ + if (domain->ds_policy == DOMAINSET_POLICY_PREFER) + DOMAINSET_COPY(&set->cs_domain->ds_mask, + &domain->ds_mask); + /* + * Determine whether we can apply this set of domains and + * how many new domain structures it will require. 
+ */ + domainset_copy(domain, &temp); + needed = 0; + error = cpuset_testupdate_domain(set, &temp, set->cs_domain, + &needed, 0); + if (error) + goto out; + } while (ndomains < needed); + dset = set->cs_domain; + cpuset_update_domain(set, domain, dset, &domains); +out: + mtx_unlock_spin(&cpuset_lock); + domainset_freelist_free(&domains); + if (error == 0) + domainset_notify(); + + return (error); +} + +/* * Resolve the 'which' parameter of several cpuset apis. * * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also @@ -481,44 +849,203 @@ cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, return (0); } +static int +cpuset_testshadow(struct cpuset *set, const cpuset_t *mask, + const struct domainset *domain) +{ + struct cpuset *parent; + struct domainset *dset; + + parent = cpuset_getbase(set); + /* + * If we are restricting a cpu mask it must be a subset of the + * parent or invalid CPUs have been specified. + */ + if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask)) + return (EINVAL); + + /* + * If we are restricting a domain mask it must be a subset of the + * parent or invalid domains have been specified. + */ + dset = parent->cs_domain; + if (domain != NULL && !domainset_valid(dset, domain)) + return (EINVAL); + + return (0); +} + /* * Create an anonymous set with the provided mask in the space provided by - * 'fset'. If the passed in set is anonymous we use its parent otherwise + * 'nset'. If the passed in set is anonymous we use its parent otherwise * the new set is a child of 'set'. */ static int -cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask) +cpuset_shadow(struct cpuset *set, struct cpuset **nsetp, + const cpuset_t *mask, const struct domainset *domain, + struct setlist *cpusets, struct domainlist *domains) { struct cpuset *parent; + struct cpuset *nset; + struct domainset *dset; + struct domainset *d; + int error; - if (set->cs_id == CPUSET_INVALID) - parent = set->cs_parent; + error = cpuset_testshadow(set, mask, domain); + if (error) + return (error); + + parent = cpuset_getbase(set); + dset = parent->cs_domain; + if (mask == NULL) + mask = &set->cs_mask; + if (domain != NULL) + d = domainset_shadow(dset, domain, domains); else - parent = set; - if (!CPU_SUBSET(&parent->cs_mask, mask)) + d = set->cs_domain; + nset = LIST_FIRST(cpusets); + error = _cpuset_create(nset, parent, mask, d, CPUSET_INVALID); + if (error == 0) { + LIST_REMOVE(nset, cs_link); + *nsetp = nset; + } + return (error); +} + +static struct cpuset * +cpuset_update_thread(struct thread *td, struct cpuset *nset) +{ + struct cpuset *tdset; + + tdset = td->td_cpuset; + td->td_cpuset = nset; + td->td_domain.dr_policy = nset->cs_domain; + sched_affinity(td); + + return (tdset); +} + +static int +cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask, + struct domainset *domain) +{ + struct cpuset *parent; + + parent = cpuset_getbase(tdset); + if (mask == NULL) + mask = &tdset->cs_mask; + if (domain == NULL) + domain = tdset->cs_domain; + return cpuset_testshadow(parent, mask, domain); +} + +static int +cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask, + struct domainset *domain, struct cpuset **nsetp, + struct setlist *freelist, struct domainlist *domainlist) +{ + struct cpuset *parent; + + parent = cpuset_getbase(tdset); + if (mask == NULL) + mask = &tdset->cs_mask; + if (domain == NULL) + domain = tdset->cs_domain; + return cpuset_shadow(parent, nsetp, mask, domain, freelist, + domainlist); +} + +static int 
+cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set, + cpuset_t *mask, struct domainset *domain) +{ + struct cpuset *parent; + + parent = cpuset_getbase(tdset); + + /* + * If the thread restricted its mask then apply that same + * restriction to the new set, otherwise take it wholesale. + */ + if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) { + CPU_COPY(&tdset->cs_mask, mask); + CPU_AND(mask, &set->cs_mask); + } else + CPU_COPY(&set->cs_mask, mask); + + /* + * If the thread restricted the domain then we apply the + * restriction to the new set but retain the policy. + */ + if (tdset->cs_domain != parent->cs_domain) { + domainset_copy(tdset->cs_domain, domain); + DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask); + } else + domainset_copy(set->cs_domain, domain); + + if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask)) return (EDEADLK); - return (_cpuset_create(fset, parent, mask, CPUSET_INVALID)); + + return (0); +} + +static int +cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set) +{ + struct domainset domain; + cpuset_t mask; + + if (tdset->cs_id != CPUSET_INVALID) + return (0); + return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain); +} + +static int +cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set, + struct cpuset **nsetp, struct setlist *freelist, + struct domainlist *domainlist) +{ + struct domainset domain; + cpuset_t mask; + int error; + + /* + * If we're replacing on a thread that has not constrained the + * original set we can simply accept the new set. + */ + if (tdset->cs_id != CPUSET_INVALID) { + *nsetp = cpuset_ref(set); + return (0); + } + error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain); + if (error) + return (error); + + return cpuset_shadow(tdset, nsetp, &mask, &domain, freelist, + domainlist); } /* - * Handle two cases for replacing the base set or mask of an entire process. + * Handle three cases for updating an entire process. * - * 1) Set is non-null and mask is null. This reparents all anonymous sets - * to the provided set and replaces all non-anonymous td_cpusets with the - * provided set. - * 2) Mask is non-null and set is null. This replaces or creates anonymous - * sets for every thread with the existing base as a parent. + * 1) Set is non-null. This reparents all anonymous sets to the provided + * set and replaces all non-anonymous td_cpusets with the provided set. + * 2) Mask is non-null. This replaces or creates anonymous sets for every + * thread with the existing base as a parent. + * 3) domain is non-null. This creates anonymous sets for every thread + * and replaces the domain set. * * This is overly complicated because we can't allocate while holding a * spinlock and spinlocks must be held while changing and examining thread * state. */ static int -cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask) +cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask, + struct domainset *domain) { struct setlist freelist; struct setlist droplist; - struct cpuset *tdset; + struct domainlist domainlist; struct cpuset *nset; struct thread *td; struct proc *p; @@ -533,7 +1060,9 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask) * 2) If enough cpusets have not been allocated release the locks and * allocate them. Loop. 
*/ - LIST_INIT(&freelist); + cpuset_freelist_init(&freelist, 1); + domainset_freelist_init(&domainlist, 1); + nfree = 1; LIST_INIT(&droplist); nfree = 0; for (;;) { @@ -544,39 +1073,27 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask) break; threads = p->p_numthreads; PROC_UNLOCK(p); - for (; nfree < threads; nfree++) { - nset = uma_zalloc(cpuset_zone, M_WAITOK); - LIST_INSERT_HEAD(&freelist, nset, cs_link); + if (nfree < threads) { + cpuset_freelist_add(&freelist, threads - nfree); + domainset_freelist_add(&domainlist, threads - nfree); + nfree = threads; } } PROC_LOCK_ASSERT(p, MA_OWNED); /* * Now that the appropriate locks are held and we have enough cpusets, - * make sure the operation will succeed before applying changes. The + * make sure the operation will succeed before applying changes. The * proc lock prevents td_cpuset from changing between calls. */ error = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); - tdset = td->td_cpuset; - /* - * Verify that a new mask doesn't specify cpus outside of - * the set the thread is a member of. - */ - if (mask) { - if (tdset->cs_id == CPUSET_INVALID) - tdset = tdset->cs_parent; - if (!CPU_SUBSET(&tdset->cs_mask, mask)) - error = EDEADLK; - /* - * Verify that a new set won't leave an existing thread - * mask without a cpu to run on. It can, however, restrict - * the set. - */ - } else if (tdset->cs_id == CPUSET_INVALID) { - if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask)) - error = EDEADLK; - } + if (set != NULL) + error = cpuset_setproc_test_setthread(td->td_cpuset, + set); + else + error = cpuset_setproc_test_maskthread(td->td_cpuset, + mask, domain); thread_unlock(td); if (error) goto unlock_out; @@ -588,33 +1105,17 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask) */ FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); - /* - * If we presently have an anonymous set or are applying a - * mask we must create an anonymous shadow set. That is - * either parented to our existing base or the supplied set. - * - * If we have a base set with no anonymous shadow we simply - * replace it outright. - */ - tdset = td->td_cpuset; - if (tdset->cs_id == CPUSET_INVALID || mask) { - nset = LIST_FIRST(&freelist); - LIST_REMOVE(nset, cs_link); - if (mask) - error = cpuset_shadow(tdset, nset, mask); - else - error = _cpuset_create(nset, set, - &tdset->cs_mask, CPUSET_INVALID); - if (error) { - LIST_INSERT_HEAD(&freelist, nset, cs_link); - thread_unlock(td); - break; - } - } else - nset = cpuset_ref(set); - cpuset_rel_defer(&droplist, tdset); - td->td_cpuset = nset; - sched_affinity(td); + if (set != NULL) + error = cpuset_setproc_setthread(td->td_cpuset, set, + &nset, &freelist, &domainlist); + else + error = cpuset_setproc_maskthread(td->td_cpuset, mask, + domain, &nset, &freelist, &domainlist); + if (error) { + thread_unlock(td); + break; + } + cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset)); thread_unlock(td); } unlock_out: @@ -622,10 +1123,8 @@ unlock_out: out: while ((nset = LIST_FIRST(&droplist)) != NULL) cpuset_rel_complete(nset); - while ((nset = LIST_FIRST(&freelist)) != NULL) { - LIST_REMOVE(nset, cs_link); - uma_zfree(cpuset_zone, nset); - } + cpuset_freelist_free(&freelist); + domainset_freelist_free(&domainlist); return (error); } @@ -690,46 +1189,57 @@ cpusetobj_strscan(cpuset_t *set, const char *buf) } /* - * Apply an anonymous mask to a single thread. + * Apply an anonymous mask or a domain to a single thread. 
*/ -int -cpuset_setthread(lwpid_t id, cpuset_t *mask) +static int +_cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain) { + struct setlist cpusets; + struct domainlist domainlist; struct cpuset *nset; struct cpuset *set; struct thread *td; struct proc *p; int error; - nset = uma_zalloc(cpuset_zone, M_WAITOK); + cpuset_freelist_init(&cpusets, 1); + domainset_freelist_init(&domainlist, domain != NULL); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set); if (error) goto out; set = NULL; thread_lock(td); - error = cpuset_shadow(td->td_cpuset, nset, mask); - if (error == 0) { - set = td->td_cpuset; - td->td_cpuset = nset; - sched_affinity(td); - nset = NULL; - } + error = cpuset_shadow(td->td_cpuset, &nset, mask, domain, + &cpusets, &domainlist); + if (error == 0) + set = cpuset_update_thread(td, nset); thread_unlock(td); PROC_UNLOCK(p); if (set) cpuset_rel(set); out: - if (nset) - uma_zfree(cpuset_zone, nset); + cpuset_freelist_free(&cpusets); + domainset_freelist_free(&domainlist); return (error); } /* + * Apply an anonymous mask to a single thread. + */ +int +cpuset_setthread(lwpid_t id, cpuset_t *mask) +{ + + return _cpuset_setthread(id, mask, NULL); +} + +/* * Apply new cpumask to the ithread. */ int cpuset_setithread(lwpid_t id, int cpu) { + struct setlist cpusets; struct cpuset *nset, *rset; struct cpuset *parent, *old_set; struct thread *td; @@ -738,8 +1248,8 @@ cpuset_setithread(lwpid_t id, int cpu) cpuset_t mask; int error; - nset = uma_zalloc(cpuset_zone, M_WAITOK); - rset = uma_zalloc(cpuset_zone, M_WAITOK); + cpuset_freelist_init(&cpusets, 1); + rset = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); cs_id = CPUSET_INVALID; CPU_ZERO(&mask); @@ -756,13 +1266,15 @@ cpuset_setithread(lwpid_t id, int cpu) old_set = td->td_cpuset; if (cpu == NOCPU) { + nset = LIST_FIRST(&cpusets); + LIST_REMOVE(nset, cs_link); /* * roll back to default set. We're not using cpuset_shadow() * here because we can fail CPU_SUBSET() check. This can happen * if default set does not contain all CPUs. */ - error = _cpuset_create(nset, cpuset_default, &mask, + error = _cpuset_create(nset, cpuset_default, &mask, NULL, CPUSET_INVALID); goto applyset; @@ -779,7 +1291,7 @@ cpuset_setithread(lwpid_t id, int cpu) * with any mask. 
*/ error = _cpuset_create(rset, cpuset_zero, - &cpuset_zero->cs_mask, cs_id); + &cpuset_zero->cs_mask, NULL, cs_id); if (error != 0) { PROC_UNLOCK(p); goto out; @@ -794,22 +1306,19 @@ cpuset_setithread(lwpid_t id, int cpu) old_set = NULL; } - error = cpuset_shadow(parent, nset, &mask); + error = cpuset_shadow(parent, &nset, &mask, NULL, &cpusets, NULL); applyset: if (error == 0) { thread_lock(td); - td->td_cpuset = nset; - sched_affinity(td); + old_set = cpuset_update_thread(td, nset); thread_unlock(td); - nset = NULL; } else old_set = NULL; PROC_UNLOCK(p); if (old_set != NULL) cpuset_rel(old_set); out: - if (nset != NULL) - uma_zfree(cpuset_zone, nset); + cpuset_freelist_free(&cpusets); if (rset != NULL) uma_zfree(cpuset_zone, rset); if (cs_id != CPUSET_INVALID) @@ -817,6 +1326,25 @@ out: return (error); } +static struct domainset domainset0; + +void +domainset_zero(void) +{ + struct domainset *dset; + int i; + + mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); + + dset = &domainset0; + DOMAINSET_ZERO(&dset->ds_mask); + for (i = 0; i < vm_ndomains; i++) + DOMAINSET_SET(i, &dset->ds_mask); + dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; + dset->ds_prefer = -1; + curthread->td_domain.dr_policy = _domainset_create(dset, NULL); + kernel_object->domain.dr_policy = curthread->td_domain.dr_policy; +} /* * Creates system-wide cpusets and the cpuset for thread0 including two @@ -834,11 +1362,12 @@ struct cpuset * cpuset_thread0(void) { struct cpuset *set; - int error, i; + int error; cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); + domainset_zone = uma_zcreate("domainset", sizeof(struct domainset), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Create the root system set for the whole machine. Doesn't use @@ -850,14 +1379,15 @@ cpuset_thread0(void) LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); set->cs_ref = 1; set->cs_flags = CPU_SET_ROOT; + set->cs_domain = &domainset0; cpuset_zero = set; cpuset_root = &set->cs_mask; /* * Now derive a default, modifiable set from that to give out. */ - set = uma_zalloc(cpuset_zone, M_WAITOK); - error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1); + set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); + error = _cpuset_create(set, cpuset_zero, NULL, NULL, 1); KASSERT(error == 0, ("Error creating default set: %d\n", error)); cpuset_default = set; @@ -866,16 +1396,6 @@ cpuset_thread0(void) */ cpuset_unr = new_unrhdr(2, INT_MAX, NULL); - /* - * If MD code has not initialized per-domain cpusets, place all - * CPUs in domain 0. - */ - for (i = 0; i < MAXMEMDOM; i++) - if (!CPU_EMPTY(&cpuset_domain[i])) - goto domains_set; - CPU_COPY(&all_cpus, &cpuset_domain[0]); -domains_set: - return (set); } @@ -920,7 +1440,7 @@ cpuset_setproc_update_set(struct proc *p, struct cpuset *set) KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__)); cpuset_ref(set); - error = cpuset_setproc(p->p_pid, set, NULL); + error = cpuset_setproc(p->p_pid, set, NULL, NULL); if (error) return (error); cpuset_rel(set); @@ -935,11 +1455,23 @@ static void cpuset_init(void *arg) { cpuset_t mask; + int i; mask = all_cpus; if (cpuset_modify(cpuset_zero, &mask)) panic("Can't set initial cpuset mask.\n"); cpuset_zero->cs_flags |= CPU_SET_RDONLY; + + /* + * If MD code has not initialized per-domain cpusets, place all + * CPUs in domain 0. 
+ */ + for (i = 0; i < MAXMEMDOM; i++) + if (!CPU_EMPTY(&cpuset_domain[i])) + goto domains_set; + CPU_COPY(&all_cpus, &cpuset_domain[0]); +domains_set: + return; } SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL); @@ -964,7 +1496,7 @@ sys_cpuset(struct thread *td, struct cpuset_args *uap) return (error); error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id)); if (error == 0) - error = cpuset_setproc(-1, set, NULL); + error = cpuset_setproc(-1, set, NULL, NULL); cpuset_rel(set); return (error); } @@ -998,7 +1530,7 @@ kern_cpuset_setid(struct thread *td, cpuwhich_t which, set = cpuset_lookup(setid, td); if (set == NULL) return (ESRCH); - error = cpuset_setproc(id, set, NULL); + error = cpuset_setproc(id, set, NULL, NULL); cpuset_rel(set); return (error); } @@ -1102,12 +1634,12 @@ kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, return (ERANGE); /* In Capability mode, you can only get your own CPU set. */ if (IN_CAPABILITY_MODE(td)) { - if (level != CPU_LEVEL_WHICH) - return (ECAPMODE); - if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) - return (ECAPMODE); - if (id != -1) - return (ECAPMODE); + if (level != CPU_LEVEL_WHICH) + return (ECAPMODE); + if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) + return (ECAPMODE); + if (id != -1) + return (ECAPMODE); } size = cpusetsize; mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO); @@ -1219,12 +1751,12 @@ kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, return (ERANGE); /* In Capability mode, you can only set your own CPU set. */ if (IN_CAPABILITY_MODE(td)) { - if (level != CPU_LEVEL_WHICH) - return (ECAPMODE); - if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) - return (ECAPMODE); - if (id != -1) - return (ECAPMODE); + if (level != CPU_LEVEL_WHICH) + return (ECAPMODE); + if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) + return (ECAPMODE); + if (id != -1) + return (ECAPMODE); } mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO); error = copyin(maskp, mask, cpusetsize); @@ -1285,7 +1817,7 @@ kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, error = cpuset_setthread(id, mask); break; case CPU_WHICH_PID: - error = cpuset_setproc(id, NULL, mask); + error = cpuset_setproc(id, NULL, mask, NULL); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: @@ -1314,25 +1846,316 @@ out: return (error); } +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_getdomain_args { + cpulevel_t level; + cpuwhich_t which; + id_t id; + size_t domainsetsize; + domainset_t *mask; + int *policy; +}; +#endif +int +sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap) +{ + + return (kern_cpuset_getdomain(td, uap->level, uap->which, + uap->id, uap->domainsetsize, uap->mask, uap->policy)); +} + +int +kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, + id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp) +{ + struct domainset outset; + struct thread *ttd; + struct cpuset *nset; + struct cpuset *set; + struct domainset *dset; + struct proc *p; + domainset_t *mask; + int error; + + if (domainsetsize < sizeof(domainset_t) || + domainsetsize > DOMAINSET_MAXSIZE / NBBY) + return (ERANGE); + /* In Capability mode, you can only get your own domain set. 
*/ + if (IN_CAPABILITY_MODE(td)) { + if (level != CPU_LEVEL_WHICH) + return (ECAPMODE); + if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) + return (ECAPMODE); + if (id != -1) + return (ECAPMODE); + } + mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); + bzero(&outset, sizeof(outset)); + error = cpuset_which(which, id, &p, &ttd, &set); + if (error) + goto out; + switch (level) { + case CPU_LEVEL_ROOT: + case CPU_LEVEL_CPUSET: + switch (which) { + case CPU_WHICH_TID: + case CPU_WHICH_PID: + thread_lock(ttd); + set = cpuset_ref(ttd->td_cpuset); + thread_unlock(ttd); + break; + case CPU_WHICH_CPUSET: + case CPU_WHICH_JAIL: + break; + case CPU_WHICH_IRQ: + case CPU_WHICH_INTRHANDLER: + case CPU_WHICH_ITHREAD: + case CPU_WHICH_DOMAIN: + error = EINVAL; + goto out; + } + if (level == CPU_LEVEL_ROOT) + nset = cpuset_refroot(set); + else + nset = cpuset_refbase(set); + domainset_copy(nset->cs_domain, &outset); + cpuset_rel(nset); + break; + case CPU_LEVEL_WHICH: + switch (which) { + case CPU_WHICH_TID: + thread_lock(ttd); + domainset_copy(ttd->td_cpuset->cs_domain, &outset); + thread_unlock(ttd); + break; + case CPU_WHICH_PID: + FOREACH_THREAD_IN_PROC(p, ttd) { + thread_lock(ttd); + dset = ttd->td_cpuset->cs_domain; + /* Show all domains in the proc. */ + DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask); + /* Last policy wins. */ + outset.ds_policy = dset->ds_policy; + outset.ds_prefer = dset->ds_prefer; + thread_unlock(ttd); + } + break; + case CPU_WHICH_CPUSET: + case CPU_WHICH_JAIL: + domainset_copy(set->cs_domain, &outset); + break; + case CPU_WHICH_IRQ: + case CPU_WHICH_INTRHANDLER: + case CPU_WHICH_ITHREAD: + case CPU_WHICH_DOMAIN: + error = EINVAL; + break; + } + break; + default: + error = EINVAL; + break; + } + if (set) + cpuset_rel(set); + if (p) + PROC_UNLOCK(p); + /* + * Translate prefer into a set containing only the preferred domain, + * not the entire fallback set. + */ + if (outset.ds_policy == DOMAINSET_POLICY_PREFER) { + DOMAINSET_ZERO(&outset.ds_mask); + DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask); + } + DOMAINSET_COPY(&outset.ds_mask, mask); + if (error == 0) + error = copyout(mask, maskp, domainsetsize); + if (error == 0) + error = copyout(&outset.ds_policy, policyp, + sizeof(outset.ds_policy)); +out: + free(mask, M_TEMP); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_setdomain_args { + cpulevel_t level; + cpuwhich_t which; + id_t id; + size_t domainsetsize; + domainset_t *mask; + int policy; +}; +#endif +int +sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap) +{ + + return (kern_cpuset_setdomain(td, uap->level, uap->which, + uap->id, uap->domainsetsize, uap->mask, uap->policy)); +} + +int +kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, + id_t id, size_t domainsetsize, const domainset_t *maskp, int policy) +{ + struct cpuset *nset; + struct cpuset *set; + struct thread *ttd; + struct proc *p; + struct domainset domain; + domainset_t *mask; + int error; + + if (domainsetsize < sizeof(domainset_t) || + domainsetsize > DOMAINSET_MAXSIZE / NBBY) + return (ERANGE); + /* In Capability mode, you can only set your own CPU set. 
*/ + if (IN_CAPABILITY_MODE(td)) { + if (level != CPU_LEVEL_WHICH) + return (ECAPMODE); + if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) + return (ECAPMODE); + if (id != -1) + return (ECAPMODE); + } + memset(&domain, 0, sizeof(domain)); + mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); + error = copyin(maskp, mask, domainsetsize); + if (error) + goto out; + /* + * Verify that no high bits are set. + */ + if (domainsetsize > sizeof(domainset_t)) { + char *end; + char *cp; + + end = cp = (char *)&mask->__bits; + end += domainsetsize; + cp += sizeof(domainset_t); + while (cp != end) + if (*cp++ != 0) { + error = EINVAL; + goto out; + } + + } + DOMAINSET_COPY(mask, &domain.ds_mask); + domain.ds_policy = policy; + if (policy <= DOMAINSET_POLICY_INVALID || + policy > DOMAINSET_POLICY_MAX) + return (EINVAL); + + /* Translate preferred policy into a mask and fallback. */ + if (policy == DOMAINSET_POLICY_PREFER) { + /* Only support a single preferred domain. */ + if (DOMAINSET_COUNT(&domain.ds_mask) != 1) + return (EINVAL); + domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1; + /* This will be constrained by domainset_shadow(). */ + DOMAINSET_FILL(&domain.ds_mask); + } + + switch (level) { + case CPU_LEVEL_ROOT: + case CPU_LEVEL_CPUSET: + error = cpuset_which(which, id, &p, &ttd, &set); + if (error) + break; + switch (which) { + case CPU_WHICH_TID: + case CPU_WHICH_PID: + thread_lock(ttd); + set = cpuset_ref(ttd->td_cpuset); + thread_unlock(ttd); + PROC_UNLOCK(p); + break; + case CPU_WHICH_CPUSET: + case CPU_WHICH_JAIL: + break; + case CPU_WHICH_IRQ: + case CPU_WHICH_INTRHANDLER: + case CPU_WHICH_ITHREAD: + case CPU_WHICH_DOMAIN: + error = EINVAL; + goto out; + } + if (level == CPU_LEVEL_ROOT) + nset = cpuset_refroot(set); + else + nset = cpuset_refbase(set); + error = cpuset_modify_domain(nset, &domain); + cpuset_rel(nset); + cpuset_rel(set); + break; + case CPU_LEVEL_WHICH: + switch (which) { + case CPU_WHICH_TID: + error = _cpuset_setthread(id, NULL, &domain); + break; + case CPU_WHICH_PID: + error = cpuset_setproc(id, NULL, NULL, &domain); + break; + case CPU_WHICH_CPUSET: + case CPU_WHICH_JAIL: + error = cpuset_which(which, id, &p, &ttd, &set); + if (error == 0) { + error = cpuset_modify_domain(set, &domain); + cpuset_rel(set); + } + break; + case CPU_WHICH_IRQ: + case CPU_WHICH_INTRHANDLER: + case CPU_WHICH_ITHREAD: + default: + error = EINVAL; + break; + } + break; + default: + error = EINVAL; + break; + } +out: + free(mask, M_TEMP); + return (error); +} + #ifdef DDB -void -ddb_display_cpuset(const cpuset_t *set) +BITSET_DEFINE(bitset, 1); +static void +ddb_display_bitset(const struct bitset *set, int size) { - int cpu, once; + int bit, once; - for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) { - if (CPU_ISSET(cpu, set)) { + for (once = 0, bit = 0; bit < size; bit++) { + if (CPU_ISSET(bit, set)) { if (once == 0) { - db_printf("%d", cpu); + db_printf("%d", bit); once = 1; } else - db_printf(",%d", cpu); + db_printf(",%d", bit); } } if (once == 0) db_printf("<none>"); } +void +ddb_display_cpuset(const cpuset_t *set) +{ + ddb_display_bitset((const struct bitset *)set, CPU_SETSIZE); +} + +static void +ddb_display_domainset(const domainset_t *set) +{ + ddb_display_bitset((const struct bitset *)set, DOMAINSET_SETSIZE); +} + DB_SHOW_COMMAND(cpusets, db_show_cpusets) { struct cpuset *set; @@ -1341,11 +2164,29 @@ DB_SHOW_COMMAND(cpusets, db_show_cpusets) db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n", set, set->cs_id, set->cs_ref, set->cs_flags, (set->cs_parent 
!= NULL) ? set->cs_parent->cs_id : 0); - db_printf(" mask="); + db_printf(" cpu mask="); ddb_display_cpuset(&set->cs_mask); db_printf("\n"); + db_printf(" domain policy %d prefer %d mask=", + set->cs_domain->ds_policy, set->cs_domain->ds_prefer); + ddb_display_domainset(&set->cs_domain->ds_mask); + db_printf("\n"); if (db_pager_quit) break; } } + +DB_SHOW_COMMAND(domainsets, db_show_domainsets) +{ + struct domainset *set; + + LIST_FOREACH(set, &cpuset_domains, ds_link) { + db_printf("set=%p policy %d prefer %d cnt %d max %d\n", + set, set->ds_policy, set->ds_prefer, set->ds_cnt, + set->ds_max); + db_printf(" mask ="); + ddb_display_domainset(&set->ds_mask); + db_printf("\n"); + } +} #endif /* DDB */ diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 12a7b5d5a6d5..9db57f58192a 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -88,7 +88,6 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_map.h> #include <vm/vm_page.h> #include <vm/uma.h> -#include <vm/vm_domain.h> #ifdef KDTRACE_HOOKS #include <sys/dtrace_bsd.h> @@ -931,10 +930,6 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options) #ifdef MAC mac_proc_destroy(p); #endif - /* - * Free any domain policy that's still hiding around. - */ - vm_domain_policy_cleanup(&p->p_vm_dom_policy); KASSERT(FIRST_THREAD_IN_PROC(p), ("proc_reap: no residual thread!")); diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 0367cdd7f187..804409e2f3e5 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -83,7 +83,6 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_map.h> #include <vm/vm_extern.h> #include <vm/uma.h> -#include <vm/vm_domain.h> #ifdef KDTRACE_HOOKS #include <sys/dtrace_bsd.h> @@ -512,14 +511,6 @@ do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread * if (p1->p_flag & P_PROFIL) startprofclock(p2); - /* - * Whilst the proc lock is held, copy the VM domain data out - * using the VM domain method. - */ - vm_domain_policy_init(&p2->p_vm_dom_policy); - vm_domain_policy_localcopy(&p2->p_vm_dom_policy, - &p1->p_vm_dom_policy); - if (fr->fr_flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { diff --git a/sys/kern/kern_numa.c b/sys/kern/kern_numa.c deleted file mode 100644 index 2d3ec49c15eb..000000000000 --- a/sys/kern/kern_numa.c +++ /dev/null @@ -1,169 +0,0 @@ -/*- - * Copyright (c) 2015, Adrian Chadd <adrian@FreeBSD.org> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice unmodified, this list of conditions, and the following - * disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/sysproto.h> -#include <sys/jail.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/malloc.h> -#include <sys/mutex.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/refcount.h> -#include <sys/sched.h> -#include <sys/smp.h> -#include <sys/syscallsubr.h> -#include <sys/cpuset.h> -#include <sys/sx.h> -#include <sys/queue.h> -#include <sys/libkern.h> -#include <sys/limits.h> -#include <sys/bus.h> -#include <sys/interrupt.h> - -#include <vm/uma.h> -#include <vm/vm.h> -#include <vm/vm_page.h> -#include <vm/vm_param.h> -#include <vm/vm_domain.h> - -int -sys_numa_setaffinity(struct thread *td, struct numa_setaffinity_args *uap) -{ - int error; - struct vm_domain_policy vp; - struct thread *ttd; - struct proc *p; - struct cpuset *set; - - set = NULL; - p = NULL; - - /* - * Copy in just the policy information into the policy - * struct. Userland only supplies vm_domain_policy_entry. - */ - error = copyin(uap->policy, &vp.p, sizeof(vp.p)); - if (error) - goto out; - - /* - * Ensure the seq number is zero - otherwise seq.h - * may get very confused. - */ - vp.seq = 0; - - /* - * Validate policy. - */ - if (vm_domain_policy_validate(&vp) != 0) { - error = EINVAL; - goto out; - } - - /* - * Go find the desired proc/tid for this operation. - */ - error = cpuset_which(uap->which, uap->id, &p, - &ttd, &set); - if (error) - goto out; - - /* Only handle CPU_WHICH_TID and CPU_WHICH_PID */ - /* - * XXX if cpuset_which is called with WHICH_CPUSET and NULL cpuset, - * it'll return ESRCH. We should just return EINVAL. - */ - switch (uap->which) { - case CPU_WHICH_TID: - vm_domain_policy_copy(&ttd->td_vm_dom_policy, &vp); - break; - case CPU_WHICH_PID: - vm_domain_policy_copy(&p->p_vm_dom_policy, &vp); - break; - default: - error = EINVAL; - break; - } - - PROC_UNLOCK(p); -out: - if (set) - cpuset_rel(set); - return (error); -} - -int -sys_numa_getaffinity(struct thread *td, struct numa_getaffinity_args *uap) -{ - int error; - struct vm_domain_policy vp; - struct thread *ttd; - struct proc *p; - struct cpuset *set; - - set = NULL; - p = NULL; - - error = cpuset_which(uap->which, uap->id, &p, - &ttd, &set); - if (error) - goto out; - - /* Only handle CPU_WHICH_TID and CPU_WHICH_PID */ - /* - * XXX if cpuset_which is called with WHICH_CPUSET and NULL cpuset, - * it'll return ESRCH. We should just return EINVAL. - */ - switch (uap->which) { - case CPU_WHICH_TID: - vm_domain_policy_localcopy(&vp, &ttd->td_vm_dom_policy); - break; - case CPU_WHICH_PID: - vm_domain_policy_localcopy(&vp, &p->p_vm_dom_policy); - break; - default: - error = EINVAL; - break; - } - if (p) - PROC_UNLOCK(p); - /* - * Copy out only the vm_domain_policy_entry part. 
- */ - if (error == 0) - error = copyout(&vp.p, uap->policy, sizeof(vp.p)); -out: - if (set) - cpuset_rel(set); - return (error); -} diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c index 28ef5242e437..29523d9a0fa4 100644 --- a/sys/kern/kern_thr.c +++ b/sys/kern/kern_thr.c @@ -57,8 +57,6 @@ __FBSDID("$FreeBSD$"); #include <sys/umtx.h> #include <sys/limits.h> -#include <vm/vm_domain.h> - #include <machine/frame.h> #include <security/audit/audit.h> @@ -260,12 +258,6 @@ thread_create(struct thread *td, struct rtprio *rtp, if (p->p_ptevents & PTRACE_LWP) newtd->td_dbgflags |= TDB_BORN; - /* - * Copy the existing thread VM policy into the new thread. - */ - vm_domain_policy_localcopy(&newtd->td_vm_dom_policy, - &td->td_vm_dom_policy); - PROC_UNLOCK(p); tidhash_add(newtd); diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index e932dcb9a545..d992f2c242aa 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -64,7 +64,6 @@ __FBSDID("$FreeBSD$"); #include <vm/vm.h> #include <vm/vm_extern.h> #include <vm/uma.h> -#include <vm/vm_domain.h> #include <sys/eventhandler.h> /* @@ -78,13 +77,13 @@ __FBSDID("$FreeBSD$"); * structures. */ #ifdef __amd64__ -_Static_assert(offsetof(struct thread, td_flags) == 0xf4, +_Static_assert(offsetof(struct thread, td_flags) == 0xfc, "struct thread KBI td_flags"); -_Static_assert(offsetof(struct thread, td_pflags) == 0xfc, +_Static_assert(offsetof(struct thread, td_pflags) == 0x104, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x460, +_Static_assert(offsetof(struct thread, td_frame) == 0x468, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x508, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x510, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb0, "struct proc KBI p_flag"); @@ -98,13 +97,13 @@ _Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8, "struct proc KBI p_emuldata"); #endif #ifdef __i386__ -_Static_assert(offsetof(struct thread, td_flags) == 0x9c, +_Static_assert(offsetof(struct thread, td_flags) == 0x98, "struct thread KBI td_flags"); -_Static_assert(offsetof(struct thread, td_pflags) == 0xa4, +_Static_assert(offsetof(struct thread, td_pflags) == 0xa0, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x2ec, +_Static_assert(offsetof(struct thread, td_frame) == 0x2e4, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x338, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x330, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x68, "struct proc KBI p_flag"); @@ -413,7 +412,6 @@ thread_alloc(int pages) return (NULL); } cpu_thread_alloc(td); - vm_domain_policy_init(&td->td_vm_dom_policy); return (td); } @@ -443,7 +441,6 @@ thread_free(struct thread *td) cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); - vm_domain_policy_cleanup(&td->td_vm_dom_policy); callout_drain(&td->td_slpcallout); uma_zfree(thread_zone, td); } diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh index 7ea37de6e53c..22b6e6a57de2 100644 --- a/sys/kern/makesyscalls.sh +++ b/sys/kern/makesyscalls.sh @@ -139,6 +139,7 @@ sed -e ' printf "#include <sys/signal.h>\n" > sysarg printf "#include <sys/acl.h>\n" > sysarg printf "#include <sys/cpuset.h>\n" > sysarg + printf "#include <sys/domainset.h>\n" > sysarg printf "#include <sys/_ffcounter.h>\n" > sysarg printf "#include <sys/_semaphore.h>\n" > 
sysarg printf "#include <sys/ucontext.h>\n" > sysarg diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c index f6b124d865cb..740a5f6626da 100644 --- a/sys/kern/sched_4bsd.c +++ b/sys/kern/sched_4bsd.c @@ -781,6 +781,7 @@ sched_fork_thread(struct thread *td, struct thread *childtd) childtd->td_lastcpu = NOCPU; childtd->td_lock = &sched_lock; childtd->td_cpuset = cpuset_ref(td->td_cpuset); + childtd->td_domain.dr_policy = td->td_cpuset->cs_domain; childtd->td_priority = childtd->td_base_pri; ts = td_get_sched(childtd); bzero(ts, sizeof(*ts)); diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index e1d2b5f34b60..cf861366fbd9 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -2131,6 +2131,7 @@ sched_fork_thread(struct thread *td, struct thread *child) child->td_lastcpu = NOCPU; child->td_lock = TDQ_LOCKPTR(tdq); child->td_cpuset = cpuset_ref(td->td_cpuset); + child->td_domain.dr_policy = td->td_cpuset->cs_domain; ts2->ts_cpu = ts->ts_cpu; ts2->ts_flags = 0; /* diff --git a/sys/kern/subr_kdb.c b/sys/kern/subr_kdb.c index 122477e9cc66..a41360a06121 100644 --- a/sys/kern/subr_kdb.c +++ b/sys/kern/subr_kdb.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include <sys/kdb.h> #include <sys/kernel.h> #include <sys/malloc.h> +#include <sys/lock.h> #include <sys/pcpu.h> #include <sys/proc.h> #include <sys/sbuf.h> diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index b3785c6c7432..ade8b9cb58a0 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -997,12 +997,8 @@ 547 AUE_FUTIMESAT STD { int utimensat(int fd, \ char *path, \ struct timespec *times, int flag); } -548 AUE_NULL STD { int numa_getaffinity(cpuwhich_t which, \ - id_t id, \ - struct vm_domain_policy_entry *policy); } -549 AUE_NULL STD { int numa_setaffinity(cpuwhich_t which, \ - id_t id, const struct \ - vm_domain_policy_entry *policy); } +548 AUE_NULL UNIMPL numa_getaffinity +549 AUE_NULL UNIMPL numa_setaffinity 550 AUE_FSYNC STD { int fdatasync(int fd); } 551 AUE_FSTAT STD { int fstat(int fd, struct stat *sb); } 552 AUE_FSTATAT STD { int fstatat(int fd, char *path, \ @@ -1023,6 +1019,14 @@ struct kevent *changelist, int nchanges, \ struct kevent *eventlist, int nevents, \ const struct timespec *timeout); } +561 AUE_NULL STD { int cpuset_getdomain(cpulevel_t level, \ + cpuwhich_t which, id_t id, \ + size_t domainsetsize, domainset_t *mask, \ + int *policy); } +562 AUE_NULL STD { int cpuset_setdomain(cpulevel_t level, \ + cpuwhich_t which, id_t id, \ + size_t domainsetsize, domainset_t *mask, \ + int policy); } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel.c b/sys/netpfil/ipfw/dn_sched_fq_codel.c index cc5dc5a17203..44610aaf9740 100644 --- a/sys/netpfil/ipfw/dn_sched_fq_codel.c +++ b/sys/netpfil/ipfw/dn_sched_fq_codel.c @@ -44,6 +44,7 @@ #include <netinet/ip_fw.h> /* flow_id */ #include <netinet/ip_dummynet.h> +#include <sys/lock.h> #include <sys/proc.h> #include <sys/rwlock.h> diff --git a/sys/sys/_vm_domain.h b/sys/sys/_vm_domain.h deleted file mode 100644 index c34d737c9f6a..000000000000 --- a/sys/sys/_vm_domain.h +++ /dev/null @@ -1,63 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer, - * without modification. - * 2. Redistributions in binary form must reproduce at minimum a disclaimer - * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any - * redistribution must be conditioned upon including a substantially - * similar Disclaimer requirement for further binary redistribution. - * - * NO WARRANTY - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY - * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, - * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER - * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGES. - * - * $FreeBSD$ - */ -#ifndef __SYS_VM_DOMAIN_H__ -#define __SYS_VM_DOMAIN_H__ - -#include <sys/seq.h> - -typedef enum { - VM_POLICY_NONE, - VM_POLICY_ROUND_ROBIN, - VM_POLICY_FIXED_DOMAIN, - VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN, - VM_POLICY_FIRST_TOUCH, - VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, - VM_POLICY_MAX -} vm_domain_policy_type_t; - -struct vm_domain_policy_entry { - vm_domain_policy_type_t policy; - int domain; -}; - -struct vm_domain_policy { - seq_t seq; - struct vm_domain_policy_entry p; -}; - -#define VM_DOMAIN_POLICY_STATIC_INITIALISER(vt, vd) \ - { .seq = 0, \ - .p.policy = vt, \ - .p.domain = vd } - -#endif /* __SYS_VM_DOMAIN_H__ */ diff --git a/sys/sys/cpuset.h b/sys/sys/cpuset.h index 6ae989bbbe12..727209be76bf 100644 --- a/sys/sys/cpuset.h +++ b/sys/sys/cpuset.h @@ -112,6 +112,7 @@ LIST_HEAD(setlist, cpuset); */ struct cpuset { cpuset_t cs_mask; /* bitmask of valid cpus. */ + struct domainset *cs_domain; /* (c) NUMA policy. */ volatile u_int cs_ref; /* (a) Reference count. */ int cs_flags; /* (s) Flags from below. */ cpusetid_t cs_id; /* (s) Id or INVALID. */ diff --git a/sys/sys/proc.h b/sys/sys/proc.h index b5f2e7719f4e..9bc75db8591a 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -62,11 +62,18 @@ #include <sys/time.h> /* For structs itimerval, timeval. */ #else #include <sys/pcpu.h> +#include <sys/systm.h> #endif #include <sys/ucontext.h> #include <sys/ucred.h> -#include <sys/_vm_domain.h> +#include <sys/types.h> +#include <sys/domainset.h> + #include <machine/proc.h> /* Machine-dependent proc substruct. */ +#ifdef _KERNEL +#include <machine/cpu.h> +#endif + /* * One structure allocated per session. @@ -179,6 +186,7 @@ struct procdesc; struct racct; struct sbuf; struct sleepqueue; +struct socket; struct syscall_args; struct td_sched; struct thread; @@ -222,12 +230,12 @@ struct thread { TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ + struct domainset_ref td_domain; /* (a) NUMA policy */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. 
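[Editor's sketch] Together with the cs_domain and td_domain fields added above, the cpuset_getdomain(2)/cpuset_setdomain(2) calls declared in the syscalls.master hunks earlier in this diff give userspace the control the removed numa_*affinity calls used to provide. A minimal usage sketch, assuming the DOMAINSET_ZERO/DOMAINSET_SET macros and the DOMAINSET_POLICY_ROUNDROBIN constant come from <sys/domainset.h>, which is not shown in this diff:

#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	domainset_t mask;
	int policy;

	/* Restrict the current process (id -1) to NUMA domain 0. */
	DOMAINSET_ZERO(&mask);
	DOMAINSET_SET(0, &mask);
	if (cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask, DOMAINSET_POLICY_ROUNDROBIN) != 0)
		err(1, "cpuset_setdomain");

	/* Read the effective mask and policy back. */
	if (cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask, &policy) != 0)
		err(1, "cpuset_getdomain");
	printf("policy %d\n", policy);
	return (0);
}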
*/ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ - struct vm_domain_policy td_vm_dom_policy; /* (c) current numa domain policy */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals @@ -286,7 +294,6 @@ struct thread { pid_t td_dbg_forked; /* (c) Child pid for debugger. */ u_int td_vp_reserv; /* (k) Count of reserved vnodes. */ int td_no_sleeping; /* (k) Sleeping disabled count. */ - int td_dom_rr_idx; /* (k) RR Numa domain selection. */ void *td_su; /* (k) FFS SU private */ sbintime_t td_sleeptimo; /* (t) Sleep timeout. */ int td_rtcgen; /* (s) rtc_generation of abs. sleep */ @@ -655,7 +662,6 @@ struct proc { uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. */ int p_throttled; /* (c) Flag for racct pcpu throttling */ - struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */ /* * An orphan is the child that has beed re-parented to the * debugger as a result of attaching to it. Need to keep diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h index d95b2dd7d4bf..a0f8fc0be384 100644 --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -36,6 +36,7 @@ #include <sys/mac.h> #include <sys/mount.h> #include <sys/_cpuset.h> +#include <sys/_domainset.h> struct file; struct filecaps; @@ -96,6 +97,12 @@ int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, const cpuset_t *maskp); +int kern_cpuset_getdomain(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t domainsetsize, + domainset_t *maskp, int *policyp); +int kern_cpuset_setdomain(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t domainsetsize, + const domainset_t *maskp, int policy); int kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, cpusetid_t *setid); int kern_cpuset_setid(struct thread *td, cpuwhich_t which, diff --git a/sys/vm/vm_domain.c b/sys/vm/vm_domain.c deleted file mode 100644 index 9fe44168cfad..000000000000 --- a/sys/vm/vm_domain.c +++ /dev/null @@ -1,514 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer, - * without modification. - * 2. Redistributions in binary form must reproduce at minimum a disclaimer - * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any - * redistribution must be conditioned upon including a substantially - * similar Disclaimer requirement for further binary redistribution. - * - * NO WARRANTY - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY - * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, - * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER - * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGES. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include "opt_vm.h" -#include "opt_ddb.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/lock.h> -#include <sys/kernel.h> -#include <sys/malloc.h> -#include <sys/mutex.h> -#ifdef VM_NUMA_ALLOC -#include <sys/proc.h> -#endif -#include <sys/queue.h> -#include <sys/rwlock.h> -#include <sys/sbuf.h> -#include <sys/sysctl.h> -#include <sys/tree.h> -#include <sys/vmmeter.h> -#include <sys/seq.h> - -#include <ddb/ddb.h> - -#include <vm/vm.h> -#include <vm/vm_param.h> -#include <vm/vm_kern.h> -#include <vm/vm_object.h> -#include <vm/vm_page.h> -#include <vm/vm_phys.h> - -#include <vm/vm_domain.h> - -/* - * Default to first-touch + round-robin. - */ -static struct mtx vm_default_policy_mtx; -MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex", - MTX_DEF); -#ifdef VM_NUMA_ALLOC -static struct vm_domain_policy vm_default_policy = - VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); -#else -/* Use round-robin so the domain policy code will only try once per allocation */ -static struct vm_domain_policy vm_default_policy = - VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0); -#endif - -static int -sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS) -{ - char policy_name[32]; - int error; - - mtx_lock(&vm_default_policy_mtx); - - /* Map policy to output string */ - switch (vm_default_policy.p.policy) { - case VM_POLICY_FIRST_TOUCH: - strcpy(policy_name, "first-touch"); - break; - case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: - strcpy(policy_name, "first-touch-rr"); - break; - case VM_POLICY_ROUND_ROBIN: - default: - strcpy(policy_name, "rr"); - break; - } - mtx_unlock(&vm_default_policy_mtx); - - error = sysctl_handle_string(oidp, &policy_name[0], - sizeof(policy_name), req); - if (error != 0 || req->newptr == NULL) - return (error); - - mtx_lock(&vm_default_policy_mtx); - /* Set: match on the subset of policies that make sense as a default */ - if (strcmp("first-touch-rr", policy_name) == 0) { - vm_domain_policy_set(&vm_default_policy, - VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); - } else if (strcmp("first-touch", policy_name) == 0) { - vm_domain_policy_set(&vm_default_policy, - VM_POLICY_FIRST_TOUCH, 0); - } else if (strcmp("rr", policy_name) == 0) { - vm_domain_policy_set(&vm_default_policy, - VM_POLICY_ROUND_ROBIN, 0); - } else { - error = EINVAL; - goto finish; - } - - error = 0; -finish: - mtx_unlock(&vm_default_policy_mtx); - return (error); -} - -SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW, - 0, 0, sysctl_vm_default_policy, "A", - "Default policy (rr, first-touch, first-touch-rr"); - -/* - * Initialise a VM domain iterator. - * - * Check the thread policy, then the proc policy, - * then default to the system policy. 
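[Editor's sketch] The deleted handler above exposed the old default as the string sysctl vm.default_policy, accepting "rr", "first-touch" and "first-touch-rr". For reference, a hedged sketch of flipping it from userspace before this commit (set_default_policy is hypothetical):

#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <string.h>

/*
 * Sketch only: set the (now removed) system-wide default NUMA policy.
 * Any string other than the three accepted names failed with EINVAL.
 */
static void
set_default_policy(const char *pol)
{
	if (sysctlbyname("vm.default_policy", NULL, NULL, pol,
	    strlen(pol) + 1) != 0)
		err(1, "vm.default_policy");
}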
- */ -void -vm_policy_iterator_init(struct vm_domain_iterator *vi) -{ -#ifdef VM_NUMA_ALLOC - struct vm_domain_policy lcl; -#endif - - vm_domain_iterator_init(vi); - -#ifdef VM_NUMA_ALLOC - /* Copy out the thread policy */ - vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy); - if (lcl.p.policy != VM_POLICY_NONE) { - /* Thread policy is present; use it */ - vm_domain_iterator_set_policy(vi, &lcl); - return; - } - - vm_domain_policy_localcopy(&lcl, - &curthread->td_proc->p_vm_dom_policy); - if (lcl.p.policy != VM_POLICY_NONE) { - /* Process policy is present; use it */ - vm_domain_iterator_set_policy(vi, &lcl); - return; - } -#endif - /* Use system default policy */ - vm_domain_iterator_set_policy(vi, &vm_default_policy); -} - -void -vm_policy_iterator_finish(struct vm_domain_iterator *vi) -{ - - vm_domain_iterator_cleanup(vi); -} - -#ifdef VM_NUMA_ALLOC -static __inline int -vm_domain_rr_selectdomain(int skip_domain) -{ - struct thread *td; - - td = curthread; - - td->td_dom_rr_idx++; - td->td_dom_rr_idx %= vm_ndomains; - - /* - * If skip_domain is provided then skip over that - * domain. This is intended for round robin variants - * which first try a fixed domain. - */ - if ((skip_domain > -1) && (td->td_dom_rr_idx == skip_domain)) { - td->td_dom_rr_idx++; - td->td_dom_rr_idx %= vm_ndomains; - } - return (td->td_dom_rr_idx); -} -#endif - -/* - * This implements a very simple set of VM domain memory allocation - * policies and iterators. - */ - -/* - * A VM domain policy represents a desired VM domain policy. - * Iterators implement searching through VM domains in a specific - * order. - */ - -/* - * When setting a policy, the caller must establish their own - * exclusive write protection for the contents of the domain - * policy. - */ -int -vm_domain_policy_init(struct vm_domain_policy *vp) -{ - - bzero(vp, sizeof(*vp)); - vp->p.policy = VM_POLICY_NONE; - vp->p.domain = -1; - return (0); -} - -int -vm_domain_policy_set(struct vm_domain_policy *vp, - vm_domain_policy_type_t vt, int domain) -{ - - seq_write_begin(&vp->seq); - vp->p.policy = vt; - vp->p.domain = domain; - seq_write_end(&vp->seq); - return (0); -} - -/* - * Take a local copy of a policy. - * - * The destination policy isn't write-barriered; this is used - * for doing local copies into something that isn't shared. - */ -void -vm_domain_policy_localcopy(struct vm_domain_policy *dst, - const struct vm_domain_policy *src) -{ - seq_t seq; - - for (;;) { - seq = seq_read(&src->seq); - *dst = *src; - if (seq_consistent(&src->seq, seq)) - return; - } -} - -/* - * Take a write-barrier copy of a policy. - * - * The destination policy is write -barriered; this is used - * for doing copies into policies that may be read by other - * threads. 
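[Editor's sketch] vm_domain_policy_localcopy() above, and vm_domain_policy_copy() just below, both rely on the seq(9) read-retry idiom: snapshot the sequence number, copy the record, and retry if a writer raced the copy. Stripped of the policy specifics, the pattern looks like this (struct rec and its payload are hypothetical):

struct rec {
	seq_t	seq;		/* sequence counter guarding payload */
	int	payload;
};

static int
rec_read(const struct rec *r)
{
	seq_t seq;
	int v;

	do {
		seq = seq_read(&r->seq);	/* waits out in-progress writes */
		v = r->payload;
	} while (!seq_consistent(&r->seq, seq));
	return (v);
}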
- */ -void -vm_domain_policy_copy(struct vm_domain_policy *dst, - const struct vm_domain_policy *src) -{ - seq_t seq; - struct vm_domain_policy d; - - for (;;) { - seq = seq_read(&src->seq); - d = *src; - if (seq_consistent(&src->seq, seq)) { - seq_write_begin(&dst->seq); - dst->p.domain = d.p.domain; - dst->p.policy = d.p.policy; - seq_write_end(&dst->seq); - return; - } - } -} - -int -vm_domain_policy_validate(const struct vm_domain_policy *vp) -{ - - switch (vp->p.policy) { - case VM_POLICY_NONE: - case VM_POLICY_ROUND_ROBIN: - case VM_POLICY_FIRST_TOUCH: - case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: - if (vp->p.domain == -1) - return (0); - return (-1); - case VM_POLICY_FIXED_DOMAIN: - case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: -#ifdef VM_NUMA_ALLOC - if (vp->p.domain >= 0 && vp->p.domain < vm_ndomains) - return (0); -#else - if (vp->p.domain == 0) - return (0); -#endif - return (-1); - default: - return (-1); - } - return (-1); -} - -int -vm_domain_policy_cleanup(struct vm_domain_policy *vp) -{ - - /* For now, empty */ - return (0); -} - -int -vm_domain_iterator_init(struct vm_domain_iterator *vi) -{ - - /* Nothing to do for now */ - return (0); -} - -/* - * Manually setup an iterator with the given details. - */ -int -vm_domain_iterator_set(struct vm_domain_iterator *vi, - vm_domain_policy_type_t vt, int domain) -{ - -#ifdef VM_NUMA_ALLOC - switch (vt) { - case VM_POLICY_FIXED_DOMAIN: - vi->policy = VM_POLICY_FIXED_DOMAIN; - vi->domain = domain; - vi->n = 1; - break; - case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: - vi->policy = VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN; - vi->domain = domain; - vi->n = vm_ndomains; - break; - case VM_POLICY_FIRST_TOUCH: - vi->policy = VM_POLICY_FIRST_TOUCH; - vi->domain = PCPU_GET(domain); - vi->n = 1; - break; - case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: - vi->policy = VM_POLICY_FIRST_TOUCH_ROUND_ROBIN; - vi->domain = PCPU_GET(domain); - vi->n = vm_ndomains; - break; - case VM_POLICY_ROUND_ROBIN: - default: - vi->policy = VM_POLICY_ROUND_ROBIN; - vi->domain = -1; - vi->n = vm_ndomains; - break; - } -#else - vi->domain = 0; - vi->n = 1; -#endif - return (0); -} - -/* - * Setup an iterator based on the given policy. - */ -static inline void -_vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, - const struct vm_domain_policy *vt) -{ - -#ifdef VM_NUMA_ALLOC - /* - * Initialise the iterator. - * - * For first-touch, the initial domain is set - * via the current thread CPU domain. - * - * For fixed-domain, it's assumed that the - * caller has initialised the specific domain - * it is after. - */ - switch (vt->p.policy) { - case VM_POLICY_FIXED_DOMAIN: - vi->policy = vt->p.policy; - vi->domain = vt->p.domain; - vi->n = 1; - break; - case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: - vi->policy = vt->p.policy; - vi->domain = vt->p.domain; - vi->n = vm_ndomains; - break; - case VM_POLICY_FIRST_TOUCH: - vi->policy = vt->p.policy; - vi->domain = PCPU_GET(domain); - vi->n = 1; - break; - case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: - vi->policy = vt->p.policy; - vi->domain = PCPU_GET(domain); - vi->n = vm_ndomains; - break; - case VM_POLICY_ROUND_ROBIN: - default: - /* - * Default to round-robin policy. 
- */ - vi->policy = VM_POLICY_ROUND_ROBIN; - vi->domain = -1; - vi->n = vm_ndomains; - break; - } -#else - vi->domain = 0; - vi->n = 1; -#endif -} - -void -vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, - const struct vm_domain_policy *vt) -{ - seq_t seq; - struct vm_domain_policy vt_lcl; - - for (;;) { - seq = seq_read(&vt->seq); - vt_lcl = *vt; - if (seq_consistent(&vt->seq, seq)) { - _vm_domain_iterator_set_policy(vi, &vt_lcl); - return; - } - } -} - -/* - * Return the next VM domain to use. - * - * Returns 0 w/ domain set to the next domain to use, or - * -1 to indicate no more domains are available. - */ -int -vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain) -{ - - /* General catch-all */ - if (vi->n <= 0) - return (-1); - -#ifdef VM_NUMA_ALLOC - switch (vi->policy) { - case VM_POLICY_FIXED_DOMAIN: - case VM_POLICY_FIRST_TOUCH: - *domain = vi->domain; - vi->n--; - break; - case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: - case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: - /* - * XXX TODO: skip over the rr'ed domain - * if it equals the one we started with. - */ - if (vi->n == vm_ndomains) - *domain = vi->domain; - else - *domain = vm_domain_rr_selectdomain(vi->domain); - vi->n--; - break; - case VM_POLICY_ROUND_ROBIN: - default: - *domain = vm_domain_rr_selectdomain(-1); - vi->n--; - break; - } -#else - *domain = 0; - vi->n--; -#endif - - return (0); -} - -/* - * Returns 1 if the iteration is done, or 0 if it has not. - - * This can only be called after at least one loop through - * the iterator. Ie, it's designed to be used as a tail - * check of a loop, not the head check of a loop. - */ -int -vm_domain_iterator_isdone(struct vm_domain_iterator *vi) -{ - - return (vi->n <= 0); -} - -int -vm_domain_iterator_cleanup(struct vm_domain_iterator *vi) -{ - - return (0); -} diff --git a/sys/vm/vm_domain.h b/sys/vm/vm_domain.h deleted file mode 100644 index 3b99c43c9101..000000000000 --- a/sys/vm/vm_domain.h +++ /dev/null @@ -1,71 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer, - * without modification. - * 2. Redistributions in binary form must reproduce at minimum a disclaimer - * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any - * redistribution must be conditioned upon including a substantially - * similar Disclaimer requirement for further binary redistribution. - * - * NO WARRANTY - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY - * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, - * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER - * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGES. 
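[Editor's sketch] As the comment on vm_domain_iterator_isdone() notes, it was meant as a tail check. The callers deleted later in this diff (vm_page_alloc_after() and friends) drove the old iterator as below; try_alloc_domain() stands in for the real per-domain allocation attempt, and req is the caller's VM_ALLOC_* flags:

struct vm_domain_iterator vi;
vm_page_t m;
int domain, wait;

m = NULL;
vm_policy_iterator_init(&vi);
/* Defer the sleep flags so only the final domain attempt may block. */
wait = req & (VM_ALLOC_WAITFAIL | VM_ALLOC_WAITOK);
req &= ~wait;
while (vm_domain_iterator_run(&vi, &domain) == 0) {
	if (vm_domain_iterator_isdone(&vi))
		req |= wait;		/* last candidate: allow sleeping */
	m = try_alloc_domain(domain, req);	/* hypothetical */
	if (m != NULL)
		break;
}
vm_policy_iterator_finish(&vi);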
- * - * $FreeBSD$ - */ -#ifndef __VM_DOMAIN_H__ -#define __VM_DOMAIN_H__ - -#include <sys/_vm_domain.h> - -struct vm_domain_iterator { - vm_domain_policy_type_t policy; - int domain; - int n; -}; - -/* - * TODO: check to see if these should just become inline functions - * at some point. - */ -extern int vm_domain_policy_init(struct vm_domain_policy *vp); -extern int vm_domain_policy_set(struct vm_domain_policy *vp, - vm_domain_policy_type_t vt, int domain); -extern int vm_domain_policy_cleanup(struct vm_domain_policy *vp); -extern void vm_domain_policy_localcopy(struct vm_domain_policy *dst, - const struct vm_domain_policy *src); -extern void vm_domain_policy_copy(struct vm_domain_policy *dst, - const struct vm_domain_policy *src); -extern int vm_domain_policy_validate(const struct vm_domain_policy *vp); - -extern int vm_domain_iterator_init(struct vm_domain_iterator *vi); -extern int vm_domain_iterator_set(struct vm_domain_iterator *vi, - vm_domain_policy_type_t vt, int domain); -extern void vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, - const struct vm_domain_policy *vt); -extern int vm_domain_iterator_run(struct vm_domain_iterator *vi, - int *domain); -extern int vm_domain_iterator_isdone(struct vm_domain_iterator *vi); -extern int vm_domain_iterator_cleanup(struct vm_domain_iterator *vi); - -extern void vm_policy_iterator_init(struct vm_domain_iterator *vi); -extern void vm_policy_iterator_finish(struct vm_domain_iterator *vi); - -#endif /* __VM_DOMAIN_H__ */ diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index ece496407c2c..83e12a588ee7 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1589,6 +1589,7 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); if (src_object != dst_object) { + dst_object->domain = src_object->domain; dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; dst_object->charge = dst_entry->end - dst_entry->start; diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 01be7db38a99..a3495f9861b0 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -71,6 +71,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/cpuset.h> #include <sys/lock.h> #include <sys/mman.h> #include <sys/mount.h> @@ -1364,6 +1365,7 @@ vm_object_shadow( result->backing_object_offset = *offset; if (source != NULL) { VM_OBJECT_WLOCK(source); + result->domain = source->domain; LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; #if VM_NRESERVLEVEL > 0 @@ -1419,6 +1421,7 @@ vm_object_split(vm_map_entry_t entry) */ VM_OBJECT_WLOCK(new_object); VM_OBJECT_WLOCK(orig_object); + new_object->domain = orig_object->domain; source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_WLOCK(source); diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 1edf2d59868d..1e3744ffe24f 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -74,6 +74,7 @@ #include <sys/_mutex.h> #include <sys/_pctrie.h> #include <sys/_rwlock.h> +#include <sys/_domainset.h> #include <vm/_vm_radix.h> @@ -102,6 +103,7 @@ struct vm_object { struct pglist memq; /* list of resident pages */ struct vm_radix rtree; /* root of the resident page radix trie*/ vm_pindex_t size; /* Object size */ + struct domainset_ref domain; /* NUMA policy. */ int generation; /* generation ID */ int ref_count; /* How many refs?? 
*/ int shadow_count; /* how many objects that this is a shadow for */ diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 18a795ec0de8..6635d17a1ef9 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -91,6 +91,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/lock.h> +#include <sys/domainset.h> #include <sys/kernel.h> #include <sys/limits.h> #include <sys/linker.h> @@ -109,7 +110,7 @@ __FBSDID("$FreeBSD$"); #include <vm/vm.h> #include <vm/pmap.h> #include <vm/vm_param.h> -#include <vm/vm_domain.h> +#include <vm/vm_domainset.h> #include <vm/vm_kern.h> #include <vm/vm_object.h> #include <vm/vm_page.h> @@ -742,6 +743,12 @@ vm_page_startup(vm_offset_t vaddr) */ vm_reserv_init(); #endif + /* + * Set an initial domain policy for thread0 so that allocations + * can work. + */ + domainset_zero(); + return (vaddr); } @@ -1622,23 +1629,17 @@ vm_page_t vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req, vm_page_t mpred) { - struct vm_domain_iterator vi; + struct vm_domainset_iter di; vm_page_t m; - int domain, wait; + int domain; - m = NULL; - vm_policy_iterator_init(&vi); - wait = req & (VM_ALLOC_WAITFAIL | VM_ALLOC_WAITOK); - req &= ~wait; - while (vm_domain_iterator_run(&vi, &domain) == 0) { - if (vm_domain_iterator_isdone(&vi)) - req |= wait; + vm_domainset_iter_page_init(&di, object, &domain, &req); + do { m = vm_page_alloc_domain_after(object, pindex, domain, req, mpred); if (m != NULL) break; - } - vm_policy_iterator_finish(&vi); + } while (vm_domainset_iter_page(&di, &domain, &req) == 0); return (m); } @@ -1835,23 +1836,17 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { - struct vm_domain_iterator vi; + struct vm_domainset_iter di; vm_page_t m; - int domain, wait; + int domain; - m = NULL; - vm_policy_iterator_init(&vi); - wait = req & (VM_ALLOC_WAITFAIL | VM_ALLOC_WAITOK); - req &= ~wait; - while (vm_domain_iterator_run(&vi, &domain) == 0) { - if (vm_domain_iterator_isdone(&vi)) - req |= wait; + vm_domainset_iter_page_init(&di, object, &domain, &req); + do { m = vm_page_alloc_contig_domain(object, pindex, domain, req, npages, low, high, alignment, boundary, memattr); if (m != NULL) break; - } - vm_policy_iterator_finish(&vi); + } while (vm_domainset_iter_page(&di, &domain, &req) == 0); return (m); } @@ -2045,22 +2040,16 @@ vm_page_alloc_check(vm_page_t m) vm_page_t vm_page_alloc_freelist(int freelist, int req) { - struct vm_domain_iterator vi; + struct vm_domainset_iter di; vm_page_t m; - int domain, wait; + int domain; - m = NULL; - vm_policy_iterator_init(&vi); - wait = req & (VM_ALLOC_WAITFAIL | VM_ALLOC_WAITOK); - req &= ~wait; - while (vm_domain_iterator_run(&vi, &domain) == 0) { - if (vm_domain_iterator_isdone(&vi)) - req |= wait; + vm_domainset_iter_page_init(&di, kernel_object, &domain, &req); + do { m = vm_page_alloc_freelist_domain(domain, freelist, req); if (m != NULL) break; - } - vm_policy_iterator_finish(&vi); + } while (vm_domainset_iter_page(&di, &domain, &req) == 0); return (m); } @@ -2562,8 +2551,8 @@ CTASSERT(powerof2(NRUNS)); * must be a power of two. 
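[Editor's sketch] All three converted front ends above now share one shape: seed the domain from the relevant policy, attempt the allocation, and advance. The generic form of the new idiom follows; alloc_one_domain() is hypothetical, and req is passed by pointer, presumably so the iterator can manage the wait flags across retries the way the old loop did by hand:

struct vm_domainset_iter di;
vm_page_t m;
int domain;

vm_domainset_iter_page_init(&di, object, &domain, &req);
do {
	m = alloc_one_domain(object, domain, req);	/* hypothetical */
	if (m != NULL)
		break;
} while (vm_domainset_iter_page(&di, &domain, &req) == 0);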
*/ bool -vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, - u_long alignment, vm_paddr_t boundary) +vm_page_reclaim_contig_domain(int domain, int req, u_long npages, + vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; @@ -2603,8 +2592,8 @@ vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, curr_low = low; count = 0; for (;;) { - m_run = vm_phys_scan_contig(npages, curr_low, high, - alignment, boundary, options); + m_run = vm_phys_scan_contig(domain, npages, curr_low, + high, alignment, boundary, options); if (m_run == NULL) break; curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); @@ -2645,6 +2634,26 @@ vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, } } +bool +vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, + u_long alignment, vm_paddr_t boundary) +{ + struct vm_domainset_iter di; + int domain; + bool ret; + + vm_domainset_iter_page_init(&di, kernel_object, &domain, &req); + do { + ret = vm_page_reclaim_contig_domain(domain, req, npages, low, + high, alignment, boundary); + if (ret) + break; + } while (vm_domainset_iter_page(&di, &domain, &req) == 0); + + return (ret); +} + + /* * vm_wait: (also see VM_WAIT macro) * diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 70aa20c056d2..2b95b6209d22 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -229,6 +229,7 @@ struct vm_pagequeue { struct vm_domain { struct vm_pagequeue vmd_pagequeues[PQ_COUNT]; + struct vmem *vmd_kernel_arena; u_int vmd_page_count; u_int vmd_free_count; long vmd_segs; /* bitmask of the segments */ @@ -514,7 +515,7 @@ void vm_page_putfake(vm_page_t m); void vm_page_readahead_finish(vm_page_t m); bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); -bool vm_page_reclaim_contig_domain(int req, u_long npages, int domain, +bool vm_page_reclaim_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); void vm_page_reference(vm_page_t m); void vm_page_remove (vm_page_t); diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index 6c796900fc18..26cfd7a77ae2 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -68,8 +68,6 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_page.h> #include <vm/vm_phys.h> -#include <vm/vm_domain.h> - _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, "Too many physsegs."); @@ -973,7 +971,7 @@ vm_phys_free_contig(vm_page_t m, u_long npages) * be a power of two. 
*/ vm_page_t -vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, +vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options) { vm_paddr_t pa_end; @@ -988,6 +986,8 @@ vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, return (NULL); for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; + if (seg->domain != domain) + continue; if (seg->start >= high) break; if (low >= seg->end) diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h index f5524af39cac..e2ec2b2a814b 100644 --- a/sys/vm/vm_phys.h +++ b/sys/vm/vm_phys.h @@ -86,8 +86,8 @@ void vm_phys_free_contig(vm_page_t m, u_long npages); void vm_phys_free_pages(vm_page_t m, int order); void vm_phys_init(void); vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa); -vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, - u_long alignment, vm_paddr_t boundary, int options); +vm_page_t vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, + vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options); void vm_phys_set_pool(int pool, vm_page_t m, int order); boolean_t vm_phys_unfree_page(vm_page_t m); int vm_phys_mem_affinity(int f, int t); diff --git a/sys/x86/acpica/srat.c b/sys/x86/acpica/srat.c index 1726a76eb291..df568cafcb26 100644 --- a/sys/x86/acpica/srat.c +++ b/sys/x86/acpica/srat.c @@ -252,7 +252,8 @@ srat_parse_entry(ACPI_SUBTABLE_HEADER *entry, void *arg) "enabled" : "disabled"); if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) break; - if (!overlaps_phys_avail(mem->BaseAddress, + if (mem->BaseAddress >= cpu_getmaxphyaddr() || + !overlaps_phys_avail(mem->BaseAddress, mem->BaseAddress + mem->Length)) { printf("SRAT: Ignoring memory at addr 0x%jx\n", (uintmax_t)mem->BaseAddress); |