path: root/sys/kern/vfs_cache.c
Diffstat (limited to 'sys/kern/vfs_cache.c')
-rw-r--r--	sys/kern/vfs_cache.c	162
1 file changed, 109 insertions, 53 deletions
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 89c1d779f04c..13abb9171234 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -86,7 +86,7 @@
*
* This fundamental choice needs to be revisited. In the meantime, the current
* state is described below. Significance of all notable routines is explained
- * in comments placed above their implementation. Scattered thoroughout the
+ * in comments placed above their implementation. Scattered throughout the
* file are TODO comments indicating shortcomings which can be fixed without
* reworking everything (most of the fixes will likely be reusable). Various
* details are omitted from this explanation to not clutter the overview, they
@@ -109,18 +109,19 @@
* The (directory vnode; name) tuple reliably determines the target entry if
* it exists.
*
- * Since there are no small locks at this time (all are 32 bytes in size on
- * LP64), the code works around the problem by introducing lock arrays to
- * protect hash buckets and vnode lists.
+ * Since there were no small locks at the time of writing this comment (all are
+ * 32 bytes in size on LP64), the code works around the problem by introducing
+ * lock arrays to protect hash buckets and vnode lists.
*
* II. Filesystem integration
*
* Filesystems participating in name caching do the following:
* - set vop_lookup routine to vfs_cache_lookup
- * - set vop_cachedlookup to whatever can perform the lookup if the above fails
- * - if they support lockless lookup (see below), vop_fplookup_vexec and
- * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
- * mount point
+ * - set vop_cachedlookup to a routine which can perform the lookup if the
+ * above fails
+ * - if they support lockless lookup (see below), they set vop_fplookup_vexec
+ * and vop_fplookup_symlink along with the MNTK_FPLOOKUP flag on the mount
+ * point
* - call cache_purge or cache_vop_* routines to eliminate stale entries as
* applicable
* - call cache_enter to add entries depending on the MAKEENTRY flag
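To make the integration steps listed above more concrete, below is a rough sketch of how a hypothetical filesystem could wire itself up. All myfs_* names are invented for illustration (ufs and tmpfs can be consulted for real in-tree examples) and only the namecache-related pieces are shown.

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/vnode.h>

static vop_cachedlookup_t	myfs_lookup;
static vop_fplookup_vexec_t	myfs_fplookup_vexec;
static vop_fplookup_symlink_t	myfs_fplookup_symlink;

static struct vop_vector myfs_vnodeops = {
	.vop_default		= &default_vnodeops,
	/* the namecache handles the lookup, the filesystem only sees misses */
	.vop_lookup		= vfs_cache_lookup,
	.vop_cachedlookup	= myfs_lookup,
	/* required for lockless lookup, along with MNTK_FPLOOKUP below */
	.vop_fplookup_vexec	= myfs_fplookup_vexec,
	.vop_fplookup_symlink	= myfs_fplookup_symlink,
	/* ... remaining ops elided ... */
};
VFS_VOP_VECTOR_REGISTER(myfs_vnodeops);

static int
myfs_mount(struct mount *mp)
{
	/* ... regular mount work elided ... */
	MNT_ILOCK(mp);
	mp->mnt_kern_flag |= MNTK_FPLOOKUP;	/* declare lockless lookup support */
	MNT_IUNLOCK(mp);
	return (0);
}

The filesystem remains responsible for calling cache_enter (honoring MAKEENTRY) from its lookup/create paths and cache_purge or the cache_vop_* helpers when entries go stale, as the list above describes.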
@@ -134,11 +135,15 @@
* ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
* vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
*
+ * You may notice a degree of CPU waste in this callchain.
+ *
* III. Performance considerations
*
* For lockless case forward lookup avoids any writes to shared areas apart
* from the terminal path component. In other words non-modifying lookups of
- * different files don't suffer any scalability problems in the namecache.
+ * different files don't suffer any scalability problems in the namecache
+ * itself.
+ *
* Looking up the same file is limited by VFS and goes beyond the scope of this
* file.
*
@@ -158,8 +163,10 @@
*
* IV. Observability
*
- * Note not everything has an explicit dtrace probe nor it should have, thus
- * some of the one-liners below depend on implementation details.
+ * Several statistics are collected in the vfs.cache sysctl tree.
+ *
+ * Some of the state can be checked for with explicit dtrace probes, but most
+ * of it depends on implementation details.
*
* Examples:
*
@@ -167,7 +174,7 @@
* # line number, column 2 is status code (see cache_fpl_status)
* dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
*
- * # Lengths of names added by binary name
+ * # Histogram of lengths of names added, aggregated by which programs are doing it
* dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
*
* # Same as above but only those which exceed 64 characters
@@ -195,6 +202,11 @@
* - vnodes are subject to being recycled even if target inode is left in memory,
* which loses the name cache entries when it perhaps should not. in case of tmpfs
* names get duplicated -- kept by filesystem itself and namecache separately
+ * - vnode reclamation (see vnlru in kern/vfs_subr.c) defaults to skipping
+ *   directories for this very reason, which arguably further reduces the
+ *   quality of the vnode LRU. Per the above this is done to avoid breaking
+ *   vnode -> path resolution (it becomes expensive for directories and
+ *   impossible for the rest). This would not be a factor if namecache entries
+ *   could persist without vnodes.
* - struct namecache has a fixed size and comes in 2 variants, often wasting
* space. now hard to replace with malloc due to dependence on SMR, which
* requires UMA zones to opt in
@@ -207,7 +219,8 @@
* performance left on the table, most notably from single-threaded standpoint.
* Below is a woefully incomplete list of changes which can help. Ideas are
* mostly sketched out, no claim is made all kinks or prerequisites are laid
- * out.
+ * out. The name of the game is eliding branches altogether and hopefully some
+ * of the memory accesses.
*
* Note there is performance lost all over VFS.
*
@@ -223,13 +236,6 @@
* the vnode to hang around for the long haul, but would work for aforementioned
* stat(2) but also access(2), readlink(2), realpathat(2) and probably more.
*
- * === hotpatching for sdt probes
- *
- * They result in *tons* of branches all over with rather regrettable codegen
- * at times. Removing sdt probes altogether gives over 2% boost in lookup rate.
- * Reworking the code to patch itself at runtime with asm goto would solve it.
- * asm goto is fully supported by gcc and clang.
- *
* === copyinstr
*
* On all architectures it operates one byte at a time, while it could be
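The copyinstr paragraph above (cut short here by the diff context) notes that the routine currently works one byte at a time. Purely as an illustration of what scanning a word at a time looks like, here is a small userspace sketch using the classic "does this word contain a zero byte" trick; it is not kernel code and ignores everything a real copyinstr has to worry about (user/kernel copying, faults, alignment and page boundaries).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	ONES	0x0101010101010101ULL
#define	HIGHS	0x8080808080808080ULL

/* true if any byte of 'v' is zero */
static int
haszero64(uint64_t v)
{
	return (((v - ONES) & ~v & HIGHS) != 0);
}

/* strlen-like scan processing 8 bytes per iteration */
static size_t
scanlen(const char *s)
{
	const char *p = s;
	uint64_t v;

	for (;;) {
		memcpy(&v, p, sizeof(v));	/* sidesteps alignment and aliasing issues */
		if (haszero64(v))
			break;
		p += sizeof(v);
	}
	while (*p != '\0')
		p++;
	return ((size_t)(p - s));
}

int
main(void)
{
	/* padded buffer so the word-sized reads stay in bounds */
	char buf[32] = "sys/kern/vfs_cache.c";

	printf("%zu %zu\n", scanlen(buf), strlen(buf));
	return (0);
}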
@@ -251,10 +257,12 @@
* things worked out locklessly. Instead the lockless lookup could be the
* actual entry point which calls what is currently namei as a fallback.
*
+ * It could be hotpatched if lockless lookup is disabled.
+ *
* === avoidable branches in cache_can_fplookup
*
* The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if
- * this is off, none of fplookup code should execute).
+ * this is off, none of fplookup code should execute, see above).
*
* Both audit and capsicum branches can be combined into one, but it requires
* paying off a lot of tech debt first.
@@ -277,8 +285,18 @@
*
* === inactive on v_usecount reaching 0
*
- * VOP_NEED_INACTIVE should not exist. Filesystems would indicate need for such
- * processing with a bit in usecount.
+ * VOP_NEED_INACTIVE should not exist. Filesystems can indicate the need for
+ * such processing with a bit in usecount and by adding a hold count. Then the
+ * vput fast path would become as simple as (ACHTUNG: locking ignored):
+ *
+ * ref = atomic_fetchadd_int(&vp->v_count, -1) - 1;
+ * if ((ref & MAGIC_BIT) == 0) // common case
+ * return;
+ *	if ((ref & ~MAGIC_BIT) != 0) // the bit is set but this was not the last user
+ * return;
+ * // do inactive here
+ *
+ * Also see below.
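For what it is worth, the proposed fast path can be modeled as a compilable userspace toy. None of the following is existing kernel code: struct toy_vnode, NEEDS_INACTIVE_BIT and the toy_* helpers are invented for illustration, and locking is ignored just like in the sketch above.

#include <stdatomic.h>
#include <stdio.h>

#define	NEEDS_INACTIVE_BIT	0x40000000u	/* stand-in for MAGIC_BIT */

struct toy_vnode {
	atomic_uint	v_count;	/* use count with the flag bit folded in */
};

static void
toy_inactive(struct toy_vnode *vp)
{
	/* whatever inactive processing the filesystem wants */
	printf("inactive processing\n");
	atomic_fetch_and(&vp->v_count, ~NEEDS_INACTIVE_BIT);
}

static void
toy_ref(struct toy_vnode *vp)
{
	atomic_fetch_add(&vp->v_count, 1);
}

static void
toy_rele(struct toy_vnode *vp)
{
	unsigned int ref;

	ref = atomic_fetch_sub(&vp->v_count, 1) - 1;
	if ((ref & NEEDS_INACTIVE_BIT) == 0)	/* common case: no processing wanted */
		return;
	if ((ref & ~NEEDS_INACTIVE_BIT) != 0)	/* the bit is set but users remain */
		return;
	toy_inactive(vp);			/* last user and the bit is set */
}

int
main(void)
{
	struct toy_vnode vn;

	atomic_init(&vn.v_count, 0);
	toy_ref(&vn);
	toy_rele(&vn);						/* no output: common case */
	toy_ref(&vn);
	atomic_fetch_or(&vn.v_count, NEEDS_INACTIVE_BIT);	/* fs requests processing */
	toy_rele(&vn);						/* last user, prints once */
	return (0);
}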
*
* === v_holdcnt
*
@@ -287,7 +305,8 @@
* vnlru et al would consider the vnode not-freeable if has either hold or
* usecount on it.
*
- * This would eliminate 2 atomics.
+ * This would eliminate 2 atomics in the common case of securing a vnode and
+ * undoing it.
*/
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
@@ -4632,7 +4651,7 @@ cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
}
/*
- * The target vnode is not supported, prepare for the slow path to take over.
+ * Prepare fallback to the locked lookup while trying to retain the progress.
*/
static int __noinline
cache_fplookup_partial_setup(struct cache_fpl *fpl)
@@ -6289,53 +6308,90 @@ cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
* Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
*
* Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
- * outlined below.
- *
- * Traditional vnode lookup conceptually looks like this:
+ * outlined at the end.
*
- * vn_lock(current);
- * for (;;) {
- * next = find();
- * vn_lock(next);
- * vn_unlock(current);
- * current = next;
- * if (last)
- * break;
- * }
- * return (current);
+ * Traversing from one vnode to another requires atomicity with regard to
+ * permissions, mount points and of course their relative placement (if you are
+ * looking up "bar" in "foo" and you found it, it better be in that directory
+ * at the time).
*
- * Each jump to the next vnode is safe memory-wise and atomic with respect to
- * any modifications thanks to holding respective locks.
+ * Normally this is accomplished with locking, but it comes with a significant
+ * performance hit and is untenable as a fast path even in a moderate core
+ * count environment (at the time of writing this comment this would be a
+ * little south of 100).
*
* The same guarantee can be provided with a combination of safe memory
* reclamation and sequence counters instead. If all operations which affect
* the relationship between the current vnode and the one we are looking for
* also modify the counter, we can verify whether all the conditions held as
- * we made the jump. This includes things like permissions, mount points etc.
- * Counter modification is provided by enclosing relevant places in
- * vn_seqc_write_begin()/end() calls.
+ * we made the jump.
*
- * Thus this translates to:
+ * See places which issue vn_seqc_write_begin()/vn_seqc_write_end() for
+ * operations affected.
+ *
+ * Suppose the variable "cnp" contains lookup metadata (the path etc.), then
+ * locked lookup conceptually looks like this:
+ *
+ * // lock the current directory
+ * vn_lock(dvp);
+ * for (;;) {
+ * // permission check
+ * if (!canlookup(dvp, cnp))
+ * abort();
+ * // look for the target name inside dvp
+ * tvp = findnext(dvp, cnp);
+ * vn_lock(tvp);
+ * // tvp is still guaranteed to be inside of dvp because of the lock on dvp
+ * vn_unlock(dvp);
+ * // dvp is unlocked. its state is now arbitrary, but that's fine as we
+ * // made the jump while everything relevant was correct, continue with tvp
+ * // as the directory to look up names in
+ *	dvp = tvp;
+ * if (last)
+ * break;
+ * // if not last loop back and continue until done
+ * }
+ * vget(tvp);
+ * return (tvp);
+ *
+ * Lockless lookup replaces locking with sequence counter checks:
*
* vfs_smr_enter();
* dvp_seqc = seqc_read_any(dvp);
- * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
+ * // fail if someone is altering the directory vnode
+ * if (seqc_in_modify(dvp_seqc))
* abort();
* for (;;) {
- * tvp = find();
+ * // permission check. note it can race, but we will validate the outcome
+ * // with a seqc
+ * if (!canlookup_smr(dvp, cnp)) {
+ * // has dvp changed from under us? if so, the denial may be invalid
+ *		if (!seqc_consistent(dvp, dvp_seqc))
+ * fallback_to_locked();
+ * // nothing changed, lookup denial is valid
+ * fail();
+ * }
+ * // look for the target name inside dvp
+ * tvp = findnext(dvp, cnp);
* tvp_seqc = seqc_read_any(tvp);
- * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
- * abort();
- * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
- * abort();
- * dvp = tvp; // we know nothing of importance has changed
- * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
+ * // bail if someone is altering the target vnode
+ * if (seqc_in_modify(tvp_seqc))
+ * fallback_to_locked();
+ * // bail if someone is altering the directory vnode
+ *	if (!seqc_consistent(dvp, dvp_seqc))
+ * fallback_to_locked();
+ * // we confirmed neither dvp nor tvp changed while we were making the
+ * // jump to the next component, thus the result is the same as if we
+ * // held the lock on dvp and tvp the entire time, continue with tvp
+ * // as the directory to look up names in
+ * dvp = tvp;
+ * dvp_seqc = tvp_seqc;
* if (last)
* break;
* }
* vget(); // secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc)) // final check
- * abort();
+ * fallback_to_locked();
* // at this point we know nothing has changed for any parent<->child pair
* // as they were crossed during the lookup, meaning we matched the guarantee
* // of the locked variant
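As a companion to the pseudocode above, here is a toy userspace model of the sequence counter primitives it references (seqc_read_any, seqc_in_modify, seqc_consistent). This is not the kernel's real seqc code (see sys/seqc.h): the toy_seqc_* names are made up and the memory ordering is deliberately heavy-handed (seq_cst) to keep the sketch short, whereas the real implementation uses carefully placed weaker barriers.

#include <stdatomic.h>
#include <stdbool.h>

typedef atomic_uint toy_seqc_t;

/* writer side: an odd counter value means "modification in progress" */
static inline void
toy_seqc_write_begin(toy_seqc_t *sp)
{
	atomic_fetch_add(sp, 1);
}

static inline void
toy_seqc_write_end(toy_seqc_t *sp)
{
	atomic_fetch_add(sp, 1);
}

/* reader side */
static inline unsigned int
toy_seqc_read_any(toy_seqc_t *sp)
{
	return (atomic_load(sp));
}

static inline bool
toy_seqc_in_modify(unsigned int snap)
{
	return ((snap & 1) != 0);
}

static inline bool
toy_seqc_consistent(toy_seqc_t *sp, unsigned int snap)
{
	atomic_thread_fence(memory_order_seq_cst);
	return (atomic_load(sp) == snap);
}

A reader snapshots the counter with toy_seqc_read_any(), bails if toy_seqc_in_modify() says a write is in flight, performs its racy reads and only trusts them if toy_seqc_consistent() still holds afterwards, which is exactly the dvp_seqc/tvp_seqc dance in the pseudocode above.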