diff options
author | Konstantin Belousov <kib@FreeBSD.org> | 2020-09-09 22:12:51 +0000 |
---|---|---|
committer | Konstantin Belousov <kib@FreeBSD.org> | 2020-09-09 22:12:51 +0000 |
commit | d301b3580f60be417581db4b592f88ce9c916e4b (patch) | |
tree | dcc9c17a38f1081490426b2cf4ef0c0cd3d774d3 /sys/kern/uipc_shm.c | |
parent | aa8f9f90ff9ae6299edbf4073fd81900eed7a7fd (diff) | |
download | src-d301b3580f60be417581db4b592f88ce9c916e4b.tar.gz src-d301b3580f60be417581db4b592f88ce9c916e4b.zip |
Support for userspace non-transparent superpages (largepages).
Created with shm_open2(SHM_LARGEPAGE) and then configured with
FIOSSHMLPGCNF ioctl, largepages posix shared memory objects guarantee
that all userspace mappings of it are served by superpage non-managed
mappings.
Only amd64 for now, both 2M and 1G superpages can be requested, the
later requires CPU feature.
Reviewed by: markj
Tested by: pho
Sponsored by: The FreeBSD Foundation
MFC after: 1 week
Differential revision: https://reviews.freebsd.org/D24652
Notes
Notes:
svn path=/head/; revision=365522
Diffstat (limited to 'sys/kern/uipc_shm.c')
-rw-r--r-- | sys/kern/uipc_shm.c | 423 |
1 files changed, 405 insertions, 18 deletions
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 65178e894e64..6a1617b321d7 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -2,6 +2,7 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson + * Copyright 2020 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of @@ -9,6 +10,9 @@ * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -80,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/sx.h> #include <sys/time.h> +#include <sys/vmmeter.h> #include <sys/vnode.h> #include <sys/unistd.h> #include <sys/user.h> @@ -120,6 +125,8 @@ static void shm_init(void *arg); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); +static int shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, + void *rl_cookie); static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie); static int shm_copyin_path(struct thread *td, const char *userpath_in, @@ -159,11 +166,19 @@ struct fileops shm_ops = { .fo_get_seals = shm_get_seals, .fo_add_seals = shm_add_seals, .fo_fallocate = shm_fallocate, - .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE + .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE, }; FEATURE(posix_shm, "POSIX shared memory"); +static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + ""); + +static int largepage_reclaim_tries = 1; +SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries, + CTLFLAG_RWTUN, &largepage_reclaim_tries, 0, + "Number of contig reclaims before giving up for default alloc policy"); + static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { @@ -242,6 +257,89 @@ uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) return (error); } +static u_long count_largepages[MAXPAGESIZES]; + +static int +shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx, + int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) +{ + vm_page_t m; + int psind; + + psind = object->un_pager.phys.data_val; + if (psind == 0 || pidx >= object->size) + return (VM_PAGER_FAIL); + *first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE); + + /* + * We only busy the first page in the superpage run. It is + * useless to busy whole run since we only remove full + * superpage, and it takes too long to busy e.g. 512 * 512 == + * 262144 pages constituing 1G amd64 superage. + */ + m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT); + MPASS(m != NULL); + + *last = *first + atop(pagesizes[psind]) - 1; + return (VM_PAGER_OK); +} + +static boolean_t +shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex, + int *before, int *after) +{ + int psind; + + psind = object->un_pager.phys.data_val; + if (psind == 0 || pindex >= object->size) + return (FALSE); + if (before != NULL) { + *before = pindex - rounddown2(pindex, pagesizes[psind] / + PAGE_SIZE); + } + if (after != NULL) { + *after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) - + pindex; + } + return (TRUE); +} + +static void +shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *cred) +{ +} + +static void +shm_largepage_phys_dtor(vm_object_t object) +{ + int psind; + + psind = object->un_pager.phys.data_val; + if (psind != 0) { + atomic_subtract_long(&count_largepages[psind], + object->size / (pagesizes[psind] / PAGE_SIZE)); + vm_wire_sub(object->size); + } else { + KASSERT(object->size == 0, + ("largepage phys obj %p not initialized bit size %#jx > 0", + object, (uintmax_t)object->size)); + } +} + +static struct phys_pager_ops shm_largepage_phys_ops = { + .phys_pg_populate = shm_largepage_phys_populate, + .phys_pg_haspage = shm_largepage_phys_haspage, + .phys_pg_ctor = shm_largepage_phys_ctor, + .phys_pg_dtor = shm_largepage_phys_dtor, +}; + +bool +shm_largepage(struct shmfd *shmfd) +{ + return (shmfd->shm_object->type == OBJT_PHYS); +} + static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { @@ -321,6 +419,8 @@ shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, if (error) return (error); #endif + if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0) + return (EINVAL); foffset_lock_uio(fp, uio, flags); if (uio->uio_resid > OFF_MAX - uio->uio_offset) { /* @@ -385,7 +485,11 @@ int shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { + struct shmfd *shmfd; + struct shm_largepage_conf *conf; + void *rl_cookie; + shmfd = fp->f_data; switch (com) { case FIONBIO: case FIOASYNC: @@ -394,6 +498,38 @@ shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, * just like it would on an unlinked regular file */ return (0); + case FIOSSHMLPGCNF: + if (!shm_largepage(shmfd)) + return (ENOTTY); + conf = data; + if (shmfd->shm_lp_psind != 0 && + conf->psind != shmfd->shm_lp_psind) + return (EINVAL); + if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES || + pagesizes[conf->psind] == 0) + return (EINVAL); + if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT && + conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT && + conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD) + return (EINVAL); + + rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, + &shmfd->shm_mtx); + shmfd->shm_lp_psind = conf->psind; + shmfd->shm_lp_alloc_policy = conf->alloc_policy; + shmfd->shm_object->un_pager.phys.data_val = conf->psind; + rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); + return (0); + case FIOGSHMLPGCNF: + if (!shm_largepage(shmfd)) + return (ENOTTY); + conf = data; + rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, OFF_MAX, + &shmfd->shm_mtx); + conf->psind = shmfd->shm_lp_psind; + conf->alloc_policy = shmfd->shm_lp_alloc_policy; + rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); + return (0); default: return (ENOTTY); } @@ -436,6 +572,8 @@ shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; sb->st_nlink = shmfd->shm_object->ref_count; + sb->st_blocks = shmfd->shm_object->size / + (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT); return (0); } @@ -592,6 +730,108 @@ retry: return (0); } +static int +shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie) +{ + vm_object_t object; + vm_page_t m; + vm_pindex_t newobjsz, oldobjsz; + int aflags, error, i, psind, try; + + KASSERT(length >= 0, ("shm_dotruncate: length < 0")); + object = shmfd->shm_object; + VM_OBJECT_ASSERT_WLOCKED(object); + rangelock_cookie_assert(rl_cookie, RA_WLOCKED); + + oldobjsz = object->size; + newobjsz = OFF_TO_IDX(length); + if (length == shmfd->shm_size) + return (0); + psind = shmfd->shm_lp_psind; + if (psind == 0 && length != 0) + return (EINVAL); + if ((length & (pagesizes[psind] - 1)) != 0) + return (EINVAL); + + if (length < shmfd->shm_size) { + if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) + return (EPERM); + if (shmfd->shm_kmappings > 0) + return (EBUSY); + return (ENOTSUP); /* Pages are unmanaged. */ +#if 0 + vm_object_page_remove(object, newobjsz, oldobjsz, 0); + object->size = newobjsz; + shmfd->shm_size = length; + return (0); +#endif + } + + aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO; + if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT) + aflags |= VM_ALLOC_WAITFAIL; + try = 0; + + /* + * Extend shmfd and object, keeping all already fully + * allocated large pages intact even on error, because dropped + * object lock might allowed mapping of them. + */ + while (object->size < newobjsz) { + m = vm_page_alloc_contig(object, object->size, aflags, + pagesizes[psind] / PAGE_SIZE, 0, ~0, + pagesizes[psind], 0, + VM_MEMATTR_DEFAULT); + if (m == NULL) { + VM_OBJECT_WUNLOCK(object); + if (shmfd->shm_lp_alloc_policy == + SHM_LARGEPAGE_ALLOC_NOWAIT || + (shmfd->shm_lp_alloc_policy == + SHM_LARGEPAGE_ALLOC_DEFAULT && + try >= largepage_reclaim_tries)) { + VM_OBJECT_WLOCK(object); + return (ENOMEM); + } + error = vm_page_reclaim_contig(aflags, + pagesizes[psind] / PAGE_SIZE, 0, ~0, + pagesizes[psind], 0) ? 0 : + vm_wait_intr(object); + if (error != 0) { + VM_OBJECT_WLOCK(object); + return (error); + } + try++; + VM_OBJECT_WLOCK(object); + continue; + } + try = 0; + for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) { + if ((m[i].flags & PG_ZERO) == 0) + pmap_zero_page(&m[i]); + vm_page_valid(&m[i]); + vm_page_xunbusy(&m[i]); + } + object->size += OFF_TO_IDX(pagesizes[psind]); + shmfd->shm_size += pagesizes[psind]; + atomic_add_long(&count_largepages[psind], 1); + vm_wire_add(atop(pagesizes[psind])); + } + return (0); +} + +static int +shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie) +{ + int error; + + VM_OBJECT_WLOCK(shmfd->shm_object); + error = shm_largepage(shmfd) ? shm_dotruncate_largepage(shmfd, + length, rl_cookie) : shm_dotruncate_locked(shmfd, length, + rl_cookie); + VM_OBJECT_WUNLOCK(shmfd->shm_object); + return (error); +} + int shm_dotruncate(struct shmfd *shmfd, off_t length) { @@ -600,9 +840,7 @@ shm_dotruncate(struct shmfd *shmfd, off_t length) rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); - VM_OBJECT_WLOCK(shmfd->shm_object); - error = shm_dotruncate_locked(shmfd, length, rl_cookie); - VM_OBJECT_WUNLOCK(shmfd->shm_object); + error = shm_dotruncate_cookie(shmfd, length, rl_cookie); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (error); } @@ -612,7 +850,7 @@ shm_dotruncate(struct shmfd *shmfd, off_t length) * routines. */ struct shmfd * -shm_alloc(struct ucred *ucred, mode_t mode) +shm_alloc(struct ucred *ucred, mode_t mode, bool largepage) { struct shmfd *shmfd; @@ -621,8 +859,15 @@ shm_alloc(struct ucred *ucred, mode_t mode) shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; - shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL, - shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); + if (largepage) { + shmfd->shm_object = phys_pager_allocate(NULL, + &shm_largepage_phys_ops, NULL, shmfd->shm_size, + VM_PROT_DEFAULT, 0, ucred); + shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; + } else { + shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL, + shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); + } KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = @@ -687,6 +932,8 @@ shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) static void shm_init(void *arg) { + char name[32]; + int i; mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); @@ -694,6 +941,24 @@ shm_init(void *arg) new_unrhdr64(&shm_ino_unr, 1); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); + + for (i = 1; i < MAXPAGESIZES; i++) { + if (pagesizes[i] == 0) + break; +#define M (1024 * 1024) +#define G (1024 * M) + if (pagesizes[i] >= G) + snprintf(name, sizeof(name), "%luG", pagesizes[i] / G); + else if (pagesizes[i] >= M) + snprintf(name, sizeof(name), "%luM", pagesizes[i] / M); + else + snprintf(name, sizeof(name), "%lu", pagesizes[i]); +#undef G +#undef M + SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages), + OID_AUTO, name, CTLFLAG_RD, &count_largepages[i], + "number of non-transient largepages allocated"); + } } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); @@ -773,8 +1038,10 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, Fnv32_t fnv; mode_t cmode; int error, fd, initial_seals; + bool largepage; - if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE)) != 0) + if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE | + SHM_LARGEPAGE)) != 0) return (EINVAL); initial_seals = F_SEAL_SEAL; @@ -798,6 +1065,12 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); + largepage = (shmflags & SHM_LARGEPAGE) != 0; +#if !defined(__amd64__) + if (largepage) + return (ENOTTY); +#endif + /* * Currently only F_SEAL_SEAL may be set when creating or opening shmfd. * If the decision is made later to allow additional seals, care must be @@ -831,7 +1104,7 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, fdrop(fp, td); return (EINVAL); } - shmfd = shm_alloc(td->td_ucred, cmode); + shmfd = shm_alloc(td->td_ucred, cmode, largepage); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; } else { @@ -854,7 +1127,8 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, path); if (error == 0) { #endif - shmfd = shm_alloc(td->td_ucred, cmode); + shmfd = shm_alloc(td->td_ucred, cmode, + largepage); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; shm_insert(path, fnv, shmfd); @@ -1139,7 +1413,118 @@ out: return (error); } -int +static int +shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr, + vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags, + vm_ooffset_t foff, bool writecounted, struct thread *td) +{ + struct vmspace *vms; + vm_map_entry_t next_entry, prev_entry; + vm_offset_t align, mask, maxaddr; + int docow, error, rv, try; + bool curmap; + + if (shmfd->shm_lp_psind == 0) + return (EINVAL); + + /* MAP_PRIVATE is disabled */ + if ((flags & ~(MAP_SHARED | MAP_FIXED | MAP_EXCL | + MAP_NOCORE | +#ifdef MAP_32BIT + MAP_32BIT | +#endif + MAP_ALIGNMENT_MASK)) != 0) + return (EINVAL); + + vms = td->td_proc->p_vmspace; + curmap = map == &vms->vm_map; + if (curmap) { + error = kern_mmap_racct_check(td, map, size); + if (error != 0) + return (error); + } + + docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT; + docow |= MAP_INHERIT_SHARE; + if ((flags & MAP_NOCORE) != 0) + docow |= MAP_DISABLE_COREDUMP; + if (writecounted) + docow |= MAP_WRITECOUNT; + + mask = pagesizes[shmfd->shm_lp_psind] - 1; + if ((foff & mask) != 0) + return (EINVAL); + maxaddr = vm_map_max(map); +#ifdef MAP_32BIT + if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR) + maxaddr = MAP_32BIT_MAX_ADDR; +#endif + if (size == 0 || (size & mask) != 0 || + (*addr != 0 && ((*addr & mask) != 0 || + *addr + size < *addr || *addr + size > maxaddr))) + return (EINVAL); + + align = flags & MAP_ALIGNMENT_MASK; + if (align == 0) { + align = pagesizes[shmfd->shm_lp_psind]; + } else if (align == MAP_ALIGNED_SUPER) { + if (shmfd->shm_lp_psind != 1) + return (EINVAL); + align = pagesizes[1]; + } else { + align >>= MAP_ALIGNMENT_SHIFT; + align = 1ULL << align; + /* Also handles overflow. */ + if (align < pagesizes[shmfd->shm_lp_psind]) + return (EINVAL); + } + + vm_map_lock(map); + if ((flags & MAP_FIXED) == 0) { + try = 1; + if (curmap && (*addr == 0 || + (*addr >= round_page((vm_offset_t)vms->vm_taddr) && + *addr < round_page((vm_offset_t)vms->vm_daddr + + lim_max(td, RLIMIT_DATA))))) { + *addr = roundup2((vm_offset_t)vms->vm_daddr + + lim_max(td, RLIMIT_DATA), + pagesizes[shmfd->shm_lp_psind]); + } +again: + rv = vm_map_find_aligned(map, addr, size, maxaddr, align); + if (rv != KERN_SUCCESS) { + if (try == 1) { + try = 2; + *addr = vm_map_min(map); + if ((*addr & mask) != 0) + *addr = (*addr + mask) & mask; + goto again; + } + goto fail1; + } + } else if ((flags & MAP_EXCL) == 0) { + rv = vm_map_delete(map, *addr, *addr + size); + if (rv != KERN_SUCCESS) + goto fail1; + } else { + error = ENOSPC; + if (vm_map_lookup_entry(map, *addr, &prev_entry)) + goto fail; + next_entry = vm_map_entry_succ(prev_entry); + if (next_entry->start < *addr + size) + goto fail; + } + + rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size, + prot, max_prot, docow); +fail1: + error = vm_mmap_to_errno(rv); +fail: + vm_map_unlock(map); + return (error); +} + +static int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) @@ -1211,8 +1596,13 @@ shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, if (writecnt) vm_pager_update_writecount(shmfd->shm_object, 0, objsize); - error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, - shmfd->shm_object, foff, writecnt, td); + if (shm_largepage(shmfd)) { + error = shm_mmap_large(shmfd, map, addr, objsize, prot, + maxprot, flags, foff, writecnt, td); + } else { + error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, + shmfd->shm_object, foff, writecnt, td); + } if (error != 0) { if (writecnt) vm_pager_release_writecount(shmfd->shm_object, 0, @@ -1503,11 +1893,8 @@ shm_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) */ rl_cookie = rangelock_wlock(&shmfd->shm_rl, offset, size, &shmfd->shm_mtx); - if (size > shmfd->shm_size) { - VM_OBJECT_WLOCK(shmfd->shm_object); - error = shm_dotruncate_locked(shmfd, size, rl_cookie); - VM_OBJECT_WUNLOCK(shmfd->shm_object); - } + if (size > shmfd->shm_size) + error = shm_dotruncate_cookie(shmfd, size, rl_cookie); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); /* Translate to posix_fallocate(2) return value as needed. */ if (error == ENOMEM) |