Diffstat (limited to 'sys/kern/vfs_cache.c')
-rw-r--r-- | sys/kern/vfs_cache.c | 162 |
1 file changed, 109 insertions, 53 deletions
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 89c1d779f04c..13abb9171234 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -86,7 +86,7 @@
 *
 * This fundamental choice needs to be revisited. In the meantime, the current
 * state is described below. Significance of all notable routines is explained
- * in comments placed above their implementation. Scattered thoroughout the
+ * in comments placed above their implementation. Scattered throughout the
 * file are TODO comments indicating shortcomings which can be fixed without
 * reworking everything (most of the fixes will likely be reusable). Various
 * details are omitted from this explanation to not clutter the overview, they
@@ -109,18 +109,19 @@
 * The (directory vnode; name) tuple reliably determines the target entry if
 * it exists.
 *
- * Since there are no small locks at this time (all are 32 bytes in size on
- * LP64), the code works around the problem by introducing lock arrays to
- * protect hash buckets and vnode lists.
+ * Since there were no small locks at the time of writing this comment (all are
+ * 32 bytes in size on LP64), the code works around the problem by introducing
+ * lock arrays to protect hash buckets and vnode lists.
 *
 * II. Filesystem integration
 *
 * Filesystems participating in name caching do the following:
 * - set vop_lookup routine to vfs_cache_lookup
- * - set vop_cachedlookup to whatever can perform the lookup if the above fails
- * - if they support lockless lookup (see below), vop_fplookup_vexec and
- *   vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
- *   mount point
+ * - set vop_cachedlookup to a routine which can perform the lookup if the
+ *   above fails
+ * - if they support lockless lookup (see below), they set vop_fplookup_vexec
+ *   and vop_fplookup_symlink along with the MNTK_FPLOOKUP flag on the mount
+ *   point
 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
 *   applicable
 * - call cache_enter to add entries depending on the MAKEENTRY flag
@@ -134,11 +135,15 @@
 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
 *
+ * You may notice a degree of CPU waste in this callchain.
+ *
 * III. Performance considerations
 *
 * For lockless case forward lookup avoids any writes to shared areas apart
 * from the terminal path component. In other words non-modifying lookups of
- * different files don't suffer any scalability problems in the namecache.
+ * different files don't suffer any scalability problems in the namecache
+ * itself.
+ *
 * Looking up the same file is limited by VFS and goes beyond the scope of this
 * file.
 *
@@ -158,8 +163,10 @@
 *
 * IV. Observability
 *
- * Note not everything has an explicit dtrace probe nor it should have, thus
- * some of the one-liners below depend on implementation details.
+ * Several statistics are collected in the vfs.cache sysctl tree.
+ *
+ * Some of the state can be checked with explicit dtrace probes; most of it
+ * depends on implementation details.
 *
 * Examples:
 *
@@ -167,7 +174,7 @@
 * # line number, column 2 is status code (see cache_fpl_status)
 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
 *
- * # Lengths of names added by binary name
+ * # Histogram of lengths of names added, aggregated by which programs are doing it
 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
 *
 * # Same as above but only those which exceed 64 characters
@@ -195,6 +202,11 @@
 * - vnodes are subject to being recycled even if target inode is left in memory,
 *   which loses the name cache entries when it perhaps should not. in case of tmpfs
 *   names get duplicated -- kept by filesystem itself and namecache separately
+ * - vnode reclamation (see vnlru in kern/vfs_subr.c) defaults to skipping
+ *   directories for this very reason, which arguably further reduces the quality
+ *   of the vnode LRU. Per the above, this is done to avoid breaking vnode -> path
+ *   resolution (it becomes expensive for directories and impossible for the rest).
+ *   This would not be a factor if namecache entries could persist without vnodes.
 * - struct namecache has a fixed size and comes in 2 variants, often wasting
 *   space. now hard to replace with malloc due to dependence on SMR, which
 *   requires UMA zones to opt in
@@ -207,7 +219,8 @@
 * performance left on the table, most notably from single-threaded standpoint.
 * Below is a woefully incomplete list of changes which can help. Ideas are
 * mostly sketched out, no claim is made all kinks or prerequisites are laid
- * out.
+ * out. The name of the game is eliding branches altogether and hopefully some
+ * of the memory accesses.
 *
 * Note there is performance lost all over VFS.
 *
@@ -223,13 +236,6 @@
 * the vnode to hang around for the long haul, but would work for aforementioned
 * stat(2) but also access(2), readlink(2), realpathat(2) and probably more.
 *
- * === hotpatching for sdt probes
- *
- * They result in *tons* of branches all over with rather regrettable codegen
- * at times. Removing sdt probes altogether gives over 2% boost in lookup rate.
- * Reworking the code to patch itself at runtime with asm goto would solve it.
- * asm goto is fully supported by gcc and clang.
- *
 * === copyinstr
 *
 * On all architectures it operates one byte at a time, while it could be
@@ -251,10 +257,12 @@
 * things worked out locklessly. Instead the lockless lookup could be the
 * actual entry point which calls what is currently namei as a fallback.
 *
+ * It could be hotpatched if lockless lookup is disabled.
+ *
 * === avoidable branches in cache_can_fplookup
 *
 * The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if
- * this is off, none of fplookup code should execute).
+ * this is off, none of the fplookup code should execute, see above).
 *
 * Both audit and capsicum branches can be combined into one, but it requires
 * paying off a lot of tech debt first.
@@ -277,8 +285,18 @@
 *
 * === inactive on v_usecount reaching 0
 *
- * VOP_NEED_INACTIVE should not exist. Filesystems would indicate need for such
- * processing with a bit in usecount.
+ * VOP_NEED_INACTIVE should not exist. Filesystems can indicate the need for such
+ * processing with a bit in usecount and adding a hold count. Then the vput fast
+ * path would become as simple as (ACHTUNG: locking ignored):
+ *
+ *     ref = atomic_fetchadd_int(&vp->v_usecount, -1) - 1;
+ *     if ((ref & MAGIC_BIT) == 0)    // common case
+ *         return;
+ *     if ((ref & ~MAGIC_BIT) != 0)   // the bit is set but this was not the last user
+ *         return;
+ *     // do inactive here
+ *
+ * Also see below.
 *
 * === v_holdcnt
 *
@@ -287,7 +305,8 @@
 * vnlru et al would consider the vnode not-freeable if has either hold or
 * usecount on it.
 *
- * This would eliminate 2 atomics.
+ * This would eliminate 2 atomics in the common case of securing a vnode and
+ * undoing it.
 */

static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
@@ -4632,7 +4651,7 @@ cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
 }

 /*
- * The target vnode is not supported, prepare for the slow path to take over.
+ * Prepare fallback to the locked lookup while trying to retain the progress.
 */
static int __noinline
cache_fplookup_partial_setup(struct cache_fpl *fpl)
@@ -6289,53 +6308,90 @@ cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
- * outlined below.
- *
- * Traditional vnode lookup conceptually looks like this:
+ * outlined at the end.
 *
- * vn_lock(current);
- * for (;;) {
- *     next = find();
- *     vn_lock(next);
- *     vn_unlock(current);
- *     current = next;
- *     if (last)
- *         break;
- * }
- * return (current);
+ * Traversing from one vnode to another requires atomicity with regard to
+ * permissions, mount points and of course their relative placement (if you are
+ * looking up "bar" in "foo" and you found it, it better be in that directory
+ * at the time).
- *
- * Each jump to the next vnode is safe memory-wise and atomic with respect to
- * any modifications thanks to holding respective locks.
+ *
+ * Normally this is accomplished with locking, but it comes with a significant
+ * performance hit and is untenable as a fast path even in a moderate core
+ * count environment (at the time of writing this comment this would be a
+ * little south of 100).
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
- * we made the jump. This includes things like permissions, mount points etc.
- * Counter modification is provided by enclosing relevant places in
- * vn_seqc_write_begin()/end() calls.
+ * we made the jump.
- *
- * Thus this translates to:
+ *
+ * See places which issue vn_seqc_write_begin()/vn_seqc_write_end() for
+ * operations affected.
+ *
+ * Suppose the variable "cnp" contains lookup metadata (the path etc.), then
+ * locked lookup conceptually looks like this:
+ *
+ * // lock the current directory
+ * vn_lock(dvp);
+ * for (;;) {
+ *     // permission check
+ *     if (!canlookup(dvp, cnp))
+ *         abort();
+ *     // look for the target name inside dvp
+ *     tvp = findnext(dvp, cnp);
+ *     vn_lock(tvp);
+ *     // tvp is still guaranteed to be inside of dvp because of the lock on dvp
+ *     vn_unlock(dvp);
+ *     // dvp is unlocked. its state is now arbitrary, but that's fine as we
+ *     // made the jump while everything relevant was correct, continue with tvp
+ *     // as the directory to look up names in
+ *     dvp = tvp;
+ *     if (last)
+ *         break;
+ *     // if not last, loop back and continue until done
+ * }
+ * vget(tvp);
+ * return (tvp);
+ *
+ * Lockless lookup replaces locking with sequence counter checks:
 *
 * vfs_smr_enter();
 * dvp_seqc = seqc_read_any(dvp);
- * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
+ * // fail if someone is altering the directory vnode
+ * if (seqc_in_modify(dvp_seqc))
 *     abort();
 * for (;;) {
- *     tvp = find();
+ *     // permission check. note it can race, but we will validate the outcome
+ *     // with a seqc
+ *     if (!canlookup_smr(dvp, cnp)) {
+ *         // has dvp changed from under us? if so, the denial may be invalid
+ *         if (!seqc_consistent(dvp, dvp_seqc))
+ *             fallback_to_locked();
+ *         // nothing changed, lookup denial is valid
+ *         fail();
+ *     }
+ *     // look for the target name inside dvp
+ *     tvp = findnext(dvp, cnp);
 *     tvp_seqc = seqc_read_any(tvp);
- *     if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
- *         abort();
- *     if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
- *         abort();
- *     dvp = tvp; // we know nothing of importance has changed
- *     dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
+ *     // bail if someone is altering the target vnode
+ *     if (seqc_in_modify(tvp_seqc))
+ *         fallback_to_locked();
+ *     // bail if someone is altering the directory vnode
+ *     if (!seqc_consistent(dvp, dvp_seqc))
+ *         fallback_to_locked();
+ *     // we confirmed neither dvp nor tvp changed while we were making the
+ *     // jump to the next component, thus the result is the same as if we
+ *     // held the lock on dvp and tvp the entire time, continue with tvp
+ *     // as the directory to look up names in
+ *     dvp = tvp;
+ *     dvp_seqc = tvp_seqc;
 *     if (last)
 *         break;
 * }
 * vget(); // secure the vnode
 * if (!seqc_consistent(tvp, tvp_seqc) // final check
- *     abort();
+ *     fallback_to_locked();
 * // at this point we know nothing has changed for any parent<->child pair
 * // as they were crossed during the lookup, meaning we matched the guarantee
 * // of the locked variant
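
Below is a minimal userspace sketch of the "snapshot, work, revalidate" sequence counter pattern described in the final hunk, written with C11 atomics. It is an illustrative analogue only: the struct, field and function names (struct entry, entry_move, entry_read_lockless and so on) are invented for this example, it is not the kernel's seqc(9) or vfs_smr machinery, and it omits the safe memory reclamation part which the real lockless lookup additionally relies on to keep vnodes alive while unlocked readers inspect them.

/*
 * Writers bump the counter to an odd value before touching the protected
 * fields and back to an even value afterwards; readers snapshot the counter,
 * copy the fields and only trust the copy if the counter is still the same
 * even value. Build: cc -std=c11 -O2 seqc_sketch.c -o seqc_sketch
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct entry {
    atomic_uint seqc;   /* even: stable, odd: write in progress */
    atomic_int dir_id;  /* stand-in for "which directory holds me" */
    atomic_int ino;     /* stand-in for "which inode I resolve to" */
};

/* Writer side: rough equivalent of vn_seqc_write_begin()/vn_seqc_write_end(). */
static void
entry_move(struct entry *e, int new_dir, int new_ino)
{
    atomic_fetch_add(&e->seqc, 1);      /* counter becomes odd */
    atomic_store(&e->dir_id, new_dir);
    atomic_store(&e->ino, new_ino);
    atomic_fetch_add(&e->seqc, 1);      /* counter becomes even again */
}

/*
 * Reader side: returns true with a consistent {dir, ino} snapshot, or false,
 * which corresponds to "fallback_to_locked()" in the pseudocode above.
 */
static bool
entry_read_lockless(struct entry *e, int *dir, int *ino)
{
    unsigned int seq;

    seq = atomic_load(&e->seqc);
    if (seq & 1)                        /* like seqc_in_modify() */
        return (false);
    *dir = atomic_load(&e->dir_id);
    *ino = atomic_load(&e->ino);
    return (atomic_load(&e->seqc) == seq);  /* like seqc_consistent() */
}

int
main(void)
{
    struct entry e;
    int dir, ino;

    atomic_init(&e.seqc, 0);
    atomic_init(&e.dir_id, 1);
    atomic_init(&e.ino, 42);

    if (entry_read_lockless(&e, &dir, &ino))
        printf("consistent snapshot: dir %d, ino %d\n", dir, ino);

    entry_move(&e, 2, 43);
    if (entry_read_lockless(&e, &dir, &ino))
        printf("consistent snapshot: dir %d, ino %d\n", dir, ino);
    return (0);
}

The point mirrors the pseudocode in the diff: the reader does not prevent concurrent modification, it merely detects it after the fact and bails out, which is what falling back to the locked lookup stands for in the comment.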