aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMateusz Guzik <mjg@FreeBSD.org>2021-03-17 21:33:47 +0000
committerMateusz Guzik <mjg@FreeBSD.org>2021-03-18 14:59:03 +0000
commite9272225e6bed840b00eef1c817b188c172338ee (patch)
treeb1e3f314416e8626886eb32757ea3b0a400dcf99
parent21864048f3929192bd20f34145ba62cda6e1d4f9 (diff)
downloadsrc-e9272225e6bed840b00eef1c817b188c172338ee.tar.gz
src-e9272225e6bed840b00eef1c817b188c172338ee.zip
vfs: fix vnlru marker handling for filtered/unfiltered cases
The global list has a marker with an invariant that free vnodes are placed somewhere past that. A caller which performs filtering (like ZFS) can move said marker all the way to the end, across free vnodes which don't match. Then a caller which does not perform filtering will fail to find them. This makes vn_alloc_hard sleep for 1 second instead of reclaiming, resulting in significant stalls. Fix the problem by requiring an explicit marker by callers which do filtering. As a temporary measure extend vnlru_free to restart if it fails to reclaim anything. Big thanks go to the reporter for testing several iterations of the patch. Reported by: Yamagi <lists yamagi.org> Tested by: Yamagi <lists yamagi.org> Reviewed by: kib Differential Revision: https://reviews.freebsd.org/D29324
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c14
-rw-r--r--sys/kern/vfs_subr.c72
-rw-r--r--sys/sys/vnode.h3
3 files changed, 80 insertions, 9 deletions
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
index 4fc7468bfa47..0d5cffbe8d1e 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
@@ -51,6 +51,9 @@
#include <sys/vm.h>
#include <sys/vmmeter.h>
+static struct sx arc_vnlru_lock;
+static struct vnode *arc_vnlru_marker;
+
extern struct vfsops zfs_vfsops;
uint_t zfs_arc_free_target = 0;
@@ -157,7 +160,9 @@ arc_prune_task(void *arg)
arc_reduce_target_size(ptob(nr_scan));
free(arg, M_TEMP);
- vnlru_free(nr_scan, &zfs_vfsops);
+ sx_xlock(&arc_vnlru_lock);
+ vnlru_free_vfsops(nr_scan, &zfs_vfsops, arc_vnlru_marker);
+ sx_xunlock(&arc_vnlru_lock);
}
/*
@@ -234,7 +239,8 @@ arc_lowmem_init(void)
{
arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
EVENTHANDLER_PRI_FIRST);
-
+ arc_vnlru_marker = vnlru_alloc_marker();
+ sx_init(&arc_vnlru_lock, "arc vnlru lock");
}
void
@@ -242,6 +248,10 @@ arc_lowmem_fini(void)
{
if (arc_event_lowmem != NULL)
EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+ if (arc_vnlru_marker != NULL) {
+ vnlru_free_marker(arc_vnlru_marker);
+ sx_destroy(&arc_vnlru_lock);
+ }
}
void
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 37e713ee48ea..af12252e8a88 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1216,9 +1216,9 @@ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
* Attempt to reduce the free list by the requested amount.
*/
static int
-vnlru_free_locked(int count, struct vfsops *mnt_op)
+vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp)
{
- struct vnode *vp, *mvp;
+ struct vnode *vp;
struct mount *mp;
int ocount;
@@ -1226,7 +1226,6 @@ vnlru_free_locked(int count, struct vfsops *mnt_op)
if (count > max_vnlru_free)
count = max_vnlru_free;
ocount = count;
- mvp = vnode_list_free_marker;
vp = mvp;
for (;;) {
if (count == 0) {
@@ -1268,13 +1267,72 @@ vnlru_free_locked(int count, struct vfsops *mnt_op)
return (ocount - count);
}
+static int
+vnlru_free_locked(int count)
+{
+
+ mtx_assert(&vnode_list_mtx, MA_OWNED);
+ return (vnlru_free_impl(count, NULL, vnode_list_free_marker));
+}
+
+void
+vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
+{
+
+ MPASS(mnt_op != NULL);
+ MPASS(mvp != NULL);
+ VNPASS(mvp->v_type == VMARKER, mvp);
+ mtx_lock(&vnode_list_mtx);
+ vnlru_free_impl(count, mnt_op, mvp);
+ mtx_unlock(&vnode_list_mtx);
+}
+
+/*
+ * Temporary binary compat, don't use. Call vnlru_free_vfsops instead.
+ */
void
vnlru_free(int count, struct vfsops *mnt_op)
{
+ struct vnode *mvp;
+
+ if (count == 0)
+ return;
+ mtx_lock(&vnode_list_mtx);
+ mvp = vnode_list_free_marker;
+ if (vnlru_free_impl(count, mnt_op, mvp) == 0) {
+ /*
+ * It is possible the marker was moved over eligible vnodes by
+ * callers which filtered by different ops. If so, start from
+ * scratch.
+ */
+ if (vnlru_read_freevnodes() > 0) {
+ TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
+ TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
+ }
+ vnlru_free_impl(count, mnt_op, mvp);
+ }
+ mtx_unlock(&vnode_list_mtx);
+}
+
+struct vnode *
+vnlru_alloc_marker(void)
+{
+ struct vnode *mvp;
+ mvp = vn_alloc_marker(NULL);
+ mtx_lock(&vnode_list_mtx);
+ TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist);
+ mtx_unlock(&vnode_list_mtx);
+ return (mvp);
+}
+
+void
+vnlru_free_marker(struct vnode *mvp)
+{
mtx_lock(&vnode_list_mtx);
- vnlru_free_locked(count, mnt_op);
+ TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
mtx_unlock(&vnode_list_mtx);
+ vn_free_marker(mvp);
}
static void
@@ -1423,7 +1481,7 @@ vnlru_proc(void)
* try to reduce it by discarding from the free list.
*/
if (rnumvnodes > desiredvnodes) {
- vnlru_free_locked(rnumvnodes - desiredvnodes, NULL);
+ vnlru_free_locked(rnumvnodes - desiredvnodes);
rnumvnodes = atomic_load_long(&numvnodes);
}
/*
@@ -1615,7 +1673,7 @@ vn_alloc_hard(struct mount *mp)
* should be chosen so that we never wait or even reclaim from
* the free list to below its target minimum.
*/
- if (vnlru_free_locked(1, NULL) > 0)
+ if (vnlru_free_locked(1) > 0)
goto alloc;
if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
/*
@@ -1625,7 +1683,7 @@ vn_alloc_hard(struct mount *mp)
msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
vnlru_read_freevnodes() > 1)
- vnlru_free_locked(1, NULL);
+ vnlru_free_locked(1);
}
alloc:
rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 0e46bea14b64..f05b03c74c82 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -826,7 +826,10 @@ void vfs_timestamp(struct timespec *);
void vfs_write_resume(struct mount *mp, int flags);
int vfs_write_suspend(struct mount *mp, int flags);
int vfs_write_suspend_umnt(struct mount *mp);
+struct vnode *vnlru_alloc_marker(void);
+void vnlru_free_marker(struct vnode *);
void vnlru_free(int, struct vfsops *);
+void vnlru_free_vfsops(int, struct vfsops *, struct vnode *);
int vop_stdbmap(struct vop_bmap_args *);
int vop_stdfdatasync_buf(struct vop_fdatasync_args *);
int vop_stdfsync(struct vop_fsync_args *);