Diffstat (limited to 'sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c')
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c | 209
 1 file changed, 124 insertions(+), 85 deletions(-)
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
index ed9721dade76..6106726651a3 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -25,6 +25,7 @@
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
*/
/* Portions Copyright 2007 Jeremy Teo */
@@ -840,8 +841,8 @@ out:
*zpp = zp;
}
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
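The same error-propagation change is repeated at every ZFS_SYNC_ALWAYS call site in this file. A minimal sketch of the idiom, as a hypothetical helper that is not part of this diff (it relies only on the fact, visible above, that zil_commit() now returns an error):

    /* Hypothetical consolidation of the repeated pattern; name is illustrative only. */
    static int
    zfs_commit_if_sync_always(zfsvfs_t *zfsvfs, zilog_t *zilog, int error)
    {
            /* Only commit the ZIL when the operation itself succeeded. */
            if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                    error = zil_commit(zilog, 0);
            /* A failed commit becomes the operation's error. */
            return (error);
    }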
@@ -1202,8 +1203,8 @@ out:
zfs_zrele_async(xzp);
}
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1391,14 +1392,15 @@ out:
zfs_dirent_unlock(dl);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
if (error != 0) {
zrele(zp);
} else {
zfs_znode_update_vfs(dzp);
zfs_znode_update_vfs(zp);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
+
}
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1527,8 +1529,8 @@ out:
zfs_znode_update_vfs(zp);
zrele(zp);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -2482,10 +2484,10 @@ top:
new_mode = zp->z_mode;
}
err = zfs_acl_chown_setattr(zp);
- ASSERT(err == 0);
+ ASSERT0(err);
if (attrzp) {
err = zfs_acl_chown_setattr(attrzp);
- ASSERT(err == 0);
+ ASSERT0(err);
}
}
@@ -2599,7 +2601,7 @@ out:
if (err == 0 && xattr_count > 0) {
err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
xattr_count, tx);
- ASSERT(err2 == 0);
+ ASSERT0(err2);
}
if (aclp)
@@ -2629,8 +2631,8 @@ out:
}
out2:
- if (os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
+ err = zil_commit(zilog, 0);
out3:
kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
@@ -3156,7 +3158,7 @@ top:
* zfs_link_create() to add back the same entry, but with a new
* dnode (szp), should not fail.
*/
- ASSERT3P(tzp, ==, NULL);
+ ASSERT0P(tzp);
goto commit_link_tzp;
}
@@ -3234,8 +3236,8 @@ out:
zfs_dirent_unlock(sdl);
zfs_dirent_unlock(tdl);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -3435,7 +3437,7 @@ top:
*zpp = zp;
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ error = zil_commit(zilog, 0);
} else {
zrele(zp);
}
@@ -3653,8 +3655,8 @@ top:
* operation are sync safe.
*/
if (is_tmpfile) {
- VERIFY(zap_remove_int(zfsvfs->z_os,
- zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
+ VERIFY0(zap_remove_int(zfsvfs->z_os,
+ zfsvfs->z_unlinkedobj, szp->z_id, tx));
} else {
if (flags & FIGNORECASE)
txtype |= TX_CI;
@@ -3669,18 +3671,20 @@ top:
zfs_dirent_unlock(dl);
- if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
- txg_wait_flag_t wait_flags =
- spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) ==
- ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0;
- error = txg_wait_synced_flags(dmu_objset_pool(zfsvfs->z_os),
- txg, wait_flags);
- if (error != 0) {
- ASSERT3U(error, ==, ESHUTDOWN);
- error = SET_ERROR(EIO);
+ if (error == 0) {
+ if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
+
+ if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+ txg_wait_flag_t wait_flags =
+ spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) ==
+ ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0;
+ error = txg_wait_synced_flags(
+ dmu_objset_pool(zfsvfs->z_os), txg, wait_flags);
+ if (error != 0) {
+ ASSERT3U(error, ==, ESHUTDOWN);
+ error = SET_ERROR(EIO);
+ }
}
}
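For the O_TMPFILE case above, durability comes from waiting on the transaction group rather than the ZIL, and when the pool's failmode is "continue" the wait is allowed to return while the pool is suspended. A condensed sketch restating that decision (os stands in for zfsvfs->z_os; it assumes txg_wait_synced_flags() and TXG_WAIT_SUSPEND exactly as used in the hunk):

    /* failmode=continue: don't block indefinitely on a suspended pool. */
    txg_wait_flag_t wait_flags =
        spa_get_failmode(dmu_objset_spa(os)) == ZIO_FAILURE_MODE_CONTINUE ?
        TXG_WAIT_SUSPEND : 0;

    error = txg_wait_synced_flags(dmu_objset_pool(os), txg, wait_flags);
    if (error != 0) {
            /* The only expected failure is a shut-down/suspended pool... */
            ASSERT3U(error, ==, ESHUTDOWN);
            /* ...which the caller sees as a plain I/O error. */
            error = SET_ERROR(EIO);
    }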
@@ -3690,24 +3694,39 @@ top:
return (error);
}
-static void
-zfs_putpage_sync_commit_cb(void *arg)
+/* Finish page writeback. */
+static inline void
+zfs_page_writeback_done(struct page *pp, int err)
{
- struct page *pp = arg;
+ if (err != 0) {
+ /*
+ * Writeback failed. Re-dirty the page. It was undirtied before
+ * the IO was issued (in zfs_putpage() or write_cache_pages()).
+ * The kernel only considers writeback for dirty pages; if we
+ * don't do this, it is eligible for eviction without being
+ * written out, which we definitely don't want.
+ */
+#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
+ filemap_dirty_folio(page_mapping(pp), page_folio(pp));
+#else
+ __set_page_dirty_nobuffers(pp);
+#endif
+ }
ClearPageError(pp);
end_page_writeback(pp);
}
+/*
+ * ZIL callback for page writeback. Passed to zfs_log_write() in zfs_putpage()
+ * for syncing writes. Called when the ZIL itx has been written to the log or
+ * the whole txg syncs, or if the ZIL crashes or the pool suspends. Any failure
+ * is passed as `err`.
+ */
static void
-zfs_putpage_async_commit_cb(void *arg)
+zfs_putpage_commit_cb(void *arg, int err)
{
- struct page *pp = arg;
- znode_t *zp = ITOZ(pp->mapping->host);
-
- ClearPageError(pp);
- end_page_writeback(pp);
- atomic_dec_32(&zp->z_async_writes_cnt);
+ zfs_page_writeback_done(arg, err);
}
/*
@@ -3827,15 +3846,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
zfs_rangelock_exit(lr);
if (wbc->sync_mode != WB_SYNC_NONE) {
- /*
- * Speed up any non-sync page writebacks since
- * they may take several seconds to complete.
- * Refer to the comment in zpl_fsync() for details.
- */
- if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
- zil_commit(zfsvfs->z_log, zp->z_id);
- }
-
if (PageWriteback(pp))
#ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
folio_wait_bit(page_folio(pp), PG_writeback);
@@ -3861,8 +3871,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
* was in fact not skipped and should not be counted as if it were.
*/
wbc->pages_skipped--;
- if (!for_sync)
- atomic_inc_32(&zp->z_async_writes_cnt);
set_page_writeback(pp);
unlock_page(pp);
@@ -3874,18 +3882,15 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
err = dmu_tx_assign(tx, DMU_TX_WAIT);
if (err != 0) {
dmu_tx_abort(tx);
-#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
- filemap_dirty_folio(page_mapping(pp), page_folio(pp));
-#else
- __set_page_dirty_nobuffers(pp);
-#endif
- ClearPageError(pp);
- end_page_writeback(pp);
- if (!for_sync)
- atomic_dec_32(&zp->z_async_writes_cnt);
+ zfs_page_writeback_done(pp, err);
zfs_rangelock_exit(lr);
zfs_exit(zfsvfs, FTAG);
- return (err);
+
+ /*
+ * Don't return error for an async writeback; we've re-dirtied
+ * the page so it will be tried again some other time.
+ */
+ return (for_sync ? err : 0);
}
va = kmap(pp);
@@ -3908,36 +3913,70 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
- boolean_t commit = B_FALSE;
- if (wbc->sync_mode != WB_SYNC_NONE) {
- /*
- * Note that this is rarely called under writepages(), because
- * writepages() normally handles the entire commit for
- * performance reasons.
- */
- commit = B_TRUE;
- } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
+ /*
+ * A note about for_sync vs wbc->sync_mode.
+ *
+ * for_sync indicates that this is a syncing writeback, that is, the kernel
+ * caller expects the data to be durably stored before being notified.
+ * Often, but not always, the call was triggered by a userspace syncing
+ * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
+ * means that the page should remain "locked" (in the writeback state)
+ * until it is definitely on disk (ie zil_commit() or spa_sync()).
+ * Otherwise, we can unlock and return as soon as it is on the
+ * in-memory ZIL.
+ *
+ * wbc->sync_mode has similar meaning. wbc is passed from the kernel to
+ * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
+ * indicates this a regular async writeback (eg a cache eviction) and
+ * so does not need a durability guarantee, while WB_SYNC_ALL indicates
+ * a syncing op that must be waited on (by convention, we test for
+ * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over
+ * performance should there ever be a new mode that we have not yet
+ * added support for).
+ *
+ * So, why a separate for_sync field? This is because zpl_writepages()
+ * calls zfs_putpage() multiple times for a single "logical" operation.
+ * It wants all the individual pages to be for_sync==TRUE ie only
+ * unlocked once durably stored, but it only wants one call to
+ * zil_commit() at the very end, once all the pages are synced. So,
+ * it repurposes sync_mode slightly to indicate who issues and waits for
+ * the IO: for NONE, the caller to zfs_putpage() will do it, while for
+ * ALL, zfs_putpage should do it.
+ *
+ * Summary:
+ * for_sync: 0=unlock immediately; 1=unlock once on disk
+ * sync_mode: NONE=caller will commit; ALL=we will commit
+ */
+ boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
+
+ /*
+ * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
+ * because it is a policy flag that indicates "someone will call
+ * zil_commit() soon". for_sync=TRUE means exactly that; the only
+ * question is whether it will be us, or zpl_writepages().
+ */
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync,
+ B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
+
+ if (!for_sync) {
/*
- * If the caller does not intend to wait synchronously
- * for this page writeback to complete and there are active
- * synchronous calls on this file, do a commit so that
- * the latter don't accidentally end up waiting for
- * our writeback to complete. Refer to the comment in
- * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
+ * Async writeback is logged and written to the DMU, so the page
+ * can now be unlocked.
*/
- commit = B_TRUE;
+ zfs_page_writeback_done(pp, 0);
}
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
- B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
- zfs_putpage_async_commit_cb, pp);
-
dmu_tx_commit(tx);
zfs_rangelock_exit(lr);
- if (commit)
- zil_commit(zfsvfs->z_log, zp->z_id);
+ if (need_commit) {
+ err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);
+ if (err != 0) {
+ zfs_exit(zfsvfs, FTAG);
+ return (err);
+ }
+ }
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
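The long for_sync / sync_mode comment in the final hunk describes a division of labour with zfs_putpage()'s callers. A hypothetical, heavily simplified caller-side sketch of the three combinations it describes; the wbc variables, the page-loop helper, and the control flow are illustrative only and are not the actual zpl_writepage()/zpl_writepages() code (it assumes zfs_putpage()'s trailing boolean_t for_sync argument as used in the hunk bodies above):

    /*
     * 1. Plain async writeback (eg cache eviction):
     *    sync_mode=WB_SYNC_NONE, for_sync=B_FALSE. The page is unlocked as
     *    soon as the write is on the in-memory ZIL; no commit is issued.
     */
    error = zfs_putpage(ip, pp, &wbc_async, B_FALSE);

    /*
     * 2. Syncing writeback driven page-by-page by the caller (the
     *    zpl_writepages() pattern described in the comment):
     *    sync_mode=WB_SYNC_NONE, for_sync=B_TRUE for every page, so each
     *    page stays in writeback until durable, then one commit at the end.
     */
    for_each_dirty_page(mapping, pp)        /* illustrative helper */
            error = zfs_putpage(ip, pp, &wbc_async, B_TRUE);
    if (error == 0)
            error = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);

    /*
     * 3. Single-page syncing writeback:
     *    sync_mode=WB_SYNC_ALL, for_sync=B_TRUE. zfs_putpage() issues the
     *    commit itself before returning.
     */
    error = zfs_putpage(ip, pp, &wbc_sync, B_TRUE);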