Diffstat (limited to 'sys/contrib/openzfs/module/zfs/zil.c')
-rw-r--r--  sys/contrib/openzfs/module/zfs/zil.c | 556
1 file changed, 478 insertions(+), 78 deletions(-)
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
index 00059b2c6de0..31b59c55f17b 100644
--- a/sys/contrib/openzfs/module/zfs/zil.c
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -24,6 +24,7 @@
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2018 Datto Inc.
+ * Copyright (c) 2025, Klara, Inc.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -103,6 +104,7 @@ static zil_kstat_values_t zil_stats = {
{ "zil_commit_error_count", KSTAT_DATA_UINT64 },
{ "zil_commit_stall_count", KSTAT_DATA_UINT64 },
{ "zil_commit_suspend_count", KSTAT_DATA_UINT64 },
+ { "zil_commit_crash_count", KSTAT_DATA_UINT64 },
{ "zil_itx_count", KSTAT_DATA_UINT64 },
{ "zil_itx_indirect_count", KSTAT_DATA_UINT64 },
{ "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 },
@@ -145,7 +147,7 @@ static uint64_t zil_slog_bulk = 64 * 1024 * 1024;
static kmem_cache_t *zil_lwb_cache;
static kmem_cache_t *zil_zcw_cache;
-static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
+static int zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
static itx_t *zil_itx_clone(itx_t *oitx);
static uint64_t zil_max_waste_space(zilog_t *zilog);
@@ -367,6 +369,7 @@ zil_sums_init(zil_sums_t *zs)
wmsum_init(&zs->zil_commit_error_count, 0);
wmsum_init(&zs->zil_commit_stall_count, 0);
wmsum_init(&zs->zil_commit_suspend_count, 0);
+ wmsum_init(&zs->zil_commit_crash_count, 0);
wmsum_init(&zs->zil_itx_count, 0);
wmsum_init(&zs->zil_itx_indirect_count, 0);
wmsum_init(&zs->zil_itx_indirect_bytes, 0);
@@ -392,6 +395,7 @@ zil_sums_fini(zil_sums_t *zs)
wmsum_fini(&zs->zil_commit_error_count);
wmsum_fini(&zs->zil_commit_stall_count);
wmsum_fini(&zs->zil_commit_suspend_count);
+ wmsum_fini(&zs->zil_commit_crash_count);
wmsum_fini(&zs->zil_itx_count);
wmsum_fini(&zs->zil_itx_indirect_count);
wmsum_fini(&zs->zil_itx_indirect_bytes);
@@ -422,6 +426,8 @@ zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums)
wmsum_value(&zil_sums->zil_commit_stall_count);
zs->zil_commit_suspend_count.value.ui64 =
wmsum_value(&zil_sums->zil_commit_suspend_count);
+ zs->zil_commit_crash_count.value.ui64 =
+ wmsum_value(&zil_sums->zil_commit_crash_count);
zs->zil_itx_count.value.ui64 =
wmsum_value(&zil_sums->zil_itx_count);
zs->zil_itx_indirect_count.value.ui64 =
@@ -589,7 +595,7 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
* that we rewind to is invalid. Thus, we return -1 so
* zil_parse() doesn't attempt to read it.
*/
- if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg)
+ if (BP_GET_BIRTH(bp) >= first_txg)
return (-1);
if (zil_bp_tree_add(zilog, bp) != 0)
@@ -615,7 +621,7 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
* Claim log block if not already committed and not already claimed.
* If tx == NULL, just verify that the block is claimable.
*/
- if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg ||
+ if (BP_IS_HOLE(bp) || BP_GET_BIRTH(bp) < first_txg ||
zil_bp_tree_add(zilog, bp) != 0)
return (0);
@@ -640,7 +646,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
* waited for all writes to be stable first), so it is semantically
* correct to declare this the end of the log.
*/
- if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) {
+ if (BP_GET_BIRTH(&lr->lr_blkptr) >= first_txg) {
error = zil_read_log_data(zilog, lr, NULL);
if (error != 0)
return (error);
@@ -687,7 +693,7 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
* just in case lets be safe and just stop here now instead of
* corrupting the pool.
*/
- if (BP_GET_BIRTH(bp) >= first_txg)
+ if (BP_GET_PHYSICAL_BIRTH(bp) >= first_txg)
return (SET_ERROR(ENOENT));
/*
@@ -742,7 +748,7 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
/*
* If we previously claimed it, we need to free it.
*/
- if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg &&
+ if (BP_GET_BIRTH(bp) >= claim_txg &&
zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) {
zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
}
@@ -864,9 +870,9 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
ASSERT(MUTEX_HELD(&zilog->zl_lock));
ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
lwb->lwb_state == LWB_STATE_FLUSH_DONE);
- ASSERT3P(lwb->lwb_child_zio, ==, NULL);
- ASSERT3P(lwb->lwb_write_zio, ==, NULL);
- ASSERT3P(lwb->lwb_root_zio, ==, NULL);
+ ASSERT0P(lwb->lwb_child_zio);
+ ASSERT0P(lwb->lwb_write_zio);
+ ASSERT0P(lwb->lwb_root_zio);
ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa));
ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
VERIFY(list_is_empty(&lwb->lwb_itxs));
@@ -991,8 +997,8 @@ zil_create(zilog_t *zilog)
*/
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
- ASSERT(zh->zh_claim_txg == 0);
- ASSERT(zh->zh_replay_seq == 0);
+ ASSERT0(zh->zh_claim_txg);
+ ASSERT0(zh->zh_replay_seq);
blk = zh->zh_log;
@@ -1104,7 +1110,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
zilog->zl_keep_first = keep_first;
if (!list_is_empty(&zilog->zl_lwb_list)) {
- ASSERT(zh->zh_claim_txg == 0);
+ ASSERT0(zh->zh_claim_txg);
VERIFY(!keep_first);
while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) {
if (lwb->lwb_buf != NULL)
@@ -1250,7 +1256,7 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
blkptr_t *bp;
int error;
- ASSERT(tx == NULL);
+ ASSERT0P(tx);
error = dmu_objset_from_ds(ds, &os);
if (error != 0) {
@@ -1351,7 +1357,7 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
ASSERT(!list_link_active(&zcw->zcw_node));
list_insert_tail(&lwb->lwb_waiters, zcw);
- ASSERT3P(zcw->zcw_lwb, ==, NULL);
+ ASSERT0P(zcw->zcw_lwb);
zcw->zcw_lwb = lwb;
}
@@ -1365,7 +1371,7 @@ zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
{
ASSERT(!list_link_active(&zcw->zcw_node));
list_insert_tail(nolwb, zcw);
- ASSERT3P(zcw->zcw_lwb, ==, NULL);
+ ASSERT0P(zcw->zcw_lwb);
}
void
@@ -1482,7 +1488,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
}
while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
- zil_itx_destroy(itx);
+ zil_itx_destroy(itx, 0);
while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
mutex_enter(&zcw->zcw_lock);
@@ -1895,7 +1901,7 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
/*
* Finalize previously closed block and issue the write zio.
*/
-static void
+static int
zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
{
spa_t *spa = zilog->zl_spa;
@@ -1909,8 +1915,13 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
/* Actually fill the lwb with the data. */
for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
- itx = list_next(&lwb->lwb_itxs, itx))
- zil_lwb_commit(zilog, lwb, itx);
+ itx = list_next(&lwb->lwb_itxs, itx)) {
+ error = zil_lwb_commit(zilog, lwb, itx);
+ if (error != 0) {
+ ASSERT3U(error, ==, ESHUTDOWN);
+ return (error);
+ }
+ }
lwb->lwb_nused = lwb->lwb_nfilled;
ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
@@ -1928,7 +1939,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
lwb->lwb_state = LWB_STATE_READY;
if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) {
mutex_exit(&zilog->zl_lock);
- return;
+ return (0);
}
mutex_exit(&zilog->zl_lock);
@@ -1997,7 +2008,7 @@ next_lwb:
&slog);
}
if (error == 0) {
- ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg);
+ ASSERT3U(BP_GET_BIRTH(bp), ==, txg);
BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
ZIO_CHECKSUM_ZILOG);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@@ -2065,6 +2076,8 @@ next_lwb:
lwb = nlwb;
if (lwb)
goto next_lwb;
+
+ return (0);
}
/*
@@ -2095,6 +2108,19 @@ zil_max_waste_space(zilog_t *zilog)
*/
static uint_t zil_maxcopied = 7680;
+/*
+ * Largest write size to store the data directly into ZIL.
+ */
+uint_t zfs_immediate_write_sz = 32768;
+
+/*
+ * When enabled and a block's data would otherwise go to a normal vdev, treat
+ * special vdevs as SLOG and write the data to the ZIL (WR_COPIED/WR_NEED_COPY).
+ * Disabling this forces indirect writes (WR_INDIRECT), preserving special vdev
+ * throughput and endurance, likely at the cost of normal vdev latency.
+ */
+int zil_special_is_slog = 1;
+
uint64_t
zil_max_copied_data(zilog_t *zilog)
{
@@ -2102,6 +2128,46 @@ zil_max_copied_data(zilog_t *zilog)
return (MIN(max_data, zil_maxcopied));
}
+/*
+ * Determine the appropriate write state for ZIL transactions based on
+ * pool configuration, data placement, write size, and logbias settings.
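+ *
+ * For example, with the defaults above, no SLOG and no special vdev:
+ * a 16K synchronous write to a 128K-recordsize dataset is below
+ * zfs_immediate_write_sz and returns WR_COPIED; a full 128K record in
+ * the same setup returns WR_INDIRECT; and with logbias=throughput any
+ * write returns WR_INDIRECT.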
+ */
+itx_wr_state_t
+zil_write_state(zilog_t *zilog, uint64_t size, uint32_t blocksize,
+ boolean_t o_direct, boolean_t commit)
+{
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct)
+ return (WR_INDIRECT);
+
+ /*
+	 * Don't use indirect for small writes, to reduce overhead.
+	 * Don't use indirect when less than half of a block is written
+	 * and we are going to commit it immediately, since the next
+	 * write might rewrite the same block again, causing inflation.
+	 * If no commit is planned, later writes might coalesce, and so
+	 * indirect may be a good fit.
+ */
+ boolean_t indirect = (size >= zfs_immediate_write_sz &&
+ (size >= blocksize / 2 || !commit));
+
+ if (spa_has_slogs(zilog->zl_spa)) {
+ /* Dedicated slogs: never use indirect */
+ indirect = B_FALSE;
+ } else if (spa_has_special(zilog->zl_spa)) {
+ /* Special vdevs: only when beneficial */
+ boolean_t on_special = (blocksize <=
+ zilog->zl_os->os_zpl_special_smallblock);
+ indirect &= (on_special || !zil_special_is_slog);
+ }
+
+ if (indirect)
+ return (WR_INDIRECT);
+ else if (commit)
+ return (WR_COPIED);
+ else
+ return (WR_NEED_COPY);
+}
+
static uint64_t
zil_itx_record_size(itx_t *itx)
{
@@ -2255,11 +2321,13 @@ cont:
return (lwb);
}
+static void zil_crash(zilog_t *zilog);
+
/*
* Fill the actual transaction data into the lwb, following zil_lwb_assign().
* Does not require locking.
*/
-static void
+static int
zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
{
lr_t *lr, *lrb;
@@ -2271,7 +2339,7 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
lrw = (lr_write_t *)lr;
if (lr->lrc_txtype == TX_COMMIT)
- return;
+ return (0);
reclen = lr->lrc_reclen;
dlen = zil_itx_data_size(itx);
@@ -2357,16 +2425,35 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
". Falling back to txg_wait_synced().",
error);
zfs_fallthrough;
- case EIO:
- txg_wait_synced(zilog->zl_dmu_pool,
- lr->lrc_txg);
+ case EIO: {
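+				/*
+				 * With TXG_WAIT_SUSPEND, the wait returns
+				 * ESHUTDOWN rather than blocking forever if
+				 * the pool suspends before the txg syncs.
+				 */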
+ int error = txg_wait_synced_flags(
+ zilog->zl_dmu_pool,
+ lr->lrc_txg, TXG_WAIT_SUSPEND);
+ if (error != 0) {
+ ASSERT3U(error, ==, ESHUTDOWN);
+ /*
+ * zil_lwb_commit() is called from a
+ * loop over a list of itxs at the
+ * top of zil_lwb_write_issue(), which
+ * itself is called from a loop over a
+ * list of lwbs in various places.
+ * zil_crash() will free those itxs
+ * and sometimes the lwbs, so they
+ * are invalid when zil_crash() returns.
+ * Callers must pretty much abort
+ * immediately.
+ */
+ zil_crash(zilog);
+ return (error);
+ }
zfs_fallthrough;
+ }
case ENOENT:
zfs_fallthrough;
case EEXIST:
zfs_fallthrough;
case EALREADY:
- return;
+ return (0);
}
}
}
@@ -2374,6 +2461,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
lwb->lwb_nfilled += reclen + dlen;
ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused);
ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t)));
+
+ return (0);
}
itx_t *
@@ -2415,7 +2504,7 @@ zil_itx_clone(itx_t *oitx)
}
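+/*
+ * The err argument is passed through to the itx callback (if any), letting
+ * the creator of the itx know whether its log record was committed or
+ * abandoned.
+ */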
void
-zil_itx_destroy(itx_t *itx)
+zil_itx_destroy(itx_t *itx, int err)
{
ASSERT3U(itx->itx_size, >=, sizeof (itx_t));
ASSERT3U(itx->itx_lr.lrc_reclen, ==,
@@ -2424,7 +2513,7 @@ zil_itx_destroy(itx_t *itx)
IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
if (itx->itx_callback != NULL)
- itx->itx_callback(itx->itx_callback_data);
+ itx->itx_callback(itx->itx_callback_data, err);
zio_data_buf_free(itx, itx->itx_size);
}
@@ -2467,7 +2556,7 @@ zil_itxg_clean(void *arg)
if (itx->itx_lr.lrc_txtype == TX_COMMIT)
zil_commit_waiter_skip(itx->itx_private);
- zil_itx_destroy(itx);
+ zil_itx_destroy(itx, 0);
}
cookie = NULL;
@@ -2477,7 +2566,7 @@ zil_itxg_clean(void *arg)
while ((itx = list_remove_head(list)) != NULL) {
/* commit itxs should never be on the async lists. */
ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
- zil_itx_destroy(itx);
+ zil_itx_destroy(itx, 0);
}
list_destroy(list);
kmem_free(ian, sizeof (itx_async_node_t));
@@ -2539,7 +2628,7 @@ zil_remove_async(zilog_t *zilog, uint64_t oid)
while ((itx = list_remove_head(&clean_list)) != NULL) {
/* commit itxs should never be on the async lists. */
ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
- zil_itx_destroy(itx);
+ zil_itx_destroy(itx, 0);
}
list_destroy(&clean_list);
}
@@ -2624,6 +2713,67 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
}
/*
+ * Post-crash cleanup. This is called from zil_clean() because cleanup must
+ * run after every txg until the ZIL is restarted, and zilog_dirty() can
+ * arrange that easily, unlike zil_sync(), which is harder to get called
+ * without actual dirty data.
+ */
+static void
+zil_crash_clean(zilog_t *zilog, uint64_t synced_txg)
+{
+ ASSERT(MUTEX_HELD(&zilog->zl_lock));
+ ASSERT3U(zilog->zl_restart_txg, >, 0);
+
+ /* Clean up anything on the crash list from earlier txgs */
+ lwb_t *lwb;
+ while ((lwb = list_head(&zilog->zl_lwb_crash_list)) != NULL) {
+ if (lwb->lwb_alloc_txg >= synced_txg ||
+ lwb->lwb_max_txg >= synced_txg) {
+ /*
+ * This lwb was allocated or updated on this txg, or
+ * in the future. We stop processing here, to avoid
+			 * the strange situation of freeing a ZIL block on
+			 * the same or earlier txg than what it was
+ * allocated for.
+ *
+ * We'll take care of it on the next txg.
+ */
+ break;
+ }
+
+ /* This LWB is from the past, so we can clean it up now. */
+ list_remove(&zilog->zl_lwb_crash_list, lwb);
+ if (lwb->lwb_buf != NULL)
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ if (!BP_IS_HOLE(&lwb->lwb_blk))
+ /*
+ * Free on the next txg, since zil_clean() is called
+ * once synced_txg has already been completed.
+ */
+ zio_free(zilog->zl_spa, synced_txg+1, &lwb->lwb_blk);
+ zil_free_lwb(zilog, lwb);
+ }
+
+ if (zilog->zl_restart_txg > synced_txg) {
+ /*
+ * Not reached the restart txg yet, so mark the ZIL dirty for
+ * the next txg and we'll consider it all again then.
+ */
+ zilog_dirty(zilog, synced_txg+1);
+ return;
+ }
+
+ /*
+ * Reached the restart txg, so we can allow new calls to zil_commit().
+	 * All ZIL txgs have long since passed, so there should be no IO waiting.
+ */
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+ ASSERT(list_is_empty(&zilog->zl_lwb_crash_list));
+
+ zilog->zl_restart_txg = 0;
+}
+
+/*
* If there are any in-memory intent log transactions which have now been
* synced then start up a taskq to free them. We should only do this after we
* have written out the uberblocks (i.e. txg has been committed) so that
@@ -2638,6 +2788,15 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
ASSERT3U(synced_txg, <, ZILTEST_TXG);
+ /* Do cleanup and restart after crash. */
+ if (zilog->zl_restart_txg > 0) {
+ mutex_enter(&zilog->zl_lock);
+ /* Make sure we didn't lose a race. */
+ if (zilog->zl_restart_txg > 0)
+ zil_crash_clean(zilog, synced_txg);
+ mutex_exit(&zilog->zl_lock);
+ }
+
mutex_enter(&itxg->itxg_lock);
if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
mutex_exit(&itxg->itxg_lock);
@@ -2830,13 +2989,13 @@ zil_prune_commit_list(zilog_t *zilog)
mutex_exit(&zilog->zl_lock);
list_remove(&zilog->zl_itx_commit_list, itx);
- zil_itx_destroy(itx);
+ zil_itx_destroy(itx, 0);
}
IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
}
-static void
+static int
zil_commit_writer_stall(zilog_t *zilog)
{
/*
@@ -2861,8 +3020,22 @@ zil_commit_writer_stall(zilog_t *zilog)
*/
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ZIL_STAT_BUMP(zilog, zil_commit_stall_count);
- txg_wait_synced(zilog->zl_dmu_pool, 0);
+
+ int err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0,
+ TXG_WAIT_SUSPEND);
+ if (err != 0) {
+ ASSERT3U(err, ==, ESHUTDOWN);
+ zil_crash(zilog);
+ }
+
+ /*
+ * Either zil_sync() has been called to wait for and clean up any
+ * in-flight LWBs, or zil_crash() has emptied out the list and arranged
+ * for them to be cleaned up later.
+ */
ASSERT(list_is_empty(&zilog->zl_lwb_list));
+
+ return (err);
}
static void
@@ -2902,19 +3075,14 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
- /*
- * Return if there's nothing to commit before we dirty the fs by
- * calling zil_create().
- */
- if (list_is_empty(&zilog->zl_itx_commit_list))
- return;
-
- list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
- list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
- offsetof(zil_commit_waiter_t, zcw_node));
-
lwb = list_tail(&zilog->zl_lwb_list);
if (lwb == NULL) {
+ /*
+ * Return if there's nothing to commit before we dirty the fs.
+ */
+ if (list_is_empty(&zilog->zl_itx_commit_list))
+ return;
+
lwb = zil_create(zilog);
} else {
/*
@@ -2942,6 +3110,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
}
}
+ list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
+ list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
+ offsetof(zil_commit_waiter_t, zcw_node));
+
while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
lr_t *lrc = &itx->itx_lr;
uint64_t txg = lrc->lrc_txg;
@@ -3030,7 +3202,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
} else {
ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
zilog->zl_cur_left -= zil_itx_full_size(itx);
- zil_itx_destroy(itx);
+ zil_itx_destroy(itx, 0);
}
}
@@ -3041,9 +3213,14 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* the ZIL write pipeline; see the comment within
* zil_commit_writer_stall() for more details.
*/
- while ((lwb = list_remove_head(ilwbs)) != NULL)
- zil_lwb_write_issue(zilog, lwb);
- zil_commit_writer_stall(zilog);
+ int err = 0;
+ while ((lwb = list_remove_head(ilwbs)) != NULL) {
+ err = zil_lwb_write_issue(zilog, lwb);
+ if (err != 0)
+ break;
+ }
+ if (err == 0)
+ err = zil_commit_writer_stall(zilog);
/*
* Additionally, we have to signal and mark the "nolwb"
@@ -3061,7 +3238,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* the itx's callback if one exists for the itx.
*/
while ((itx = list_remove_head(&nolwb_itxs)) != NULL)
- zil_itx_destroy(itx);
+ zil_itx_destroy(itx, 0);
} else {
ASSERT(list_is_empty(&nolwb_waiters));
ASSERT3P(lwb, !=, NULL);
@@ -3111,14 +3288,21 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* possible, without significantly impacting the latency
* of each individual itx.
*/
- if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
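+	/*
+	 * A suspended ZIL behaves like the non-parallel case here: close and
+	 * issue the open lwb now instead of holding it open for more itxs.
+	 */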
+ if (lwb->lwb_state == LWB_STATE_OPENED &&
+ (!zilog->zl_parallel || zilog->zl_suspend > 0)) {
zil_burst_done(zilog);
list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
if (lwb == NULL) {
- while ((lwb = list_remove_head(ilwbs)) != NULL)
- zil_lwb_write_issue(zilog, lwb);
- zil_commit_writer_stall(zilog);
+ int err = 0;
+ while ((lwb =
+ list_remove_head(ilwbs)) != NULL) {
+ err = zil_lwb_write_issue(zilog, lwb);
+ if (err != 0)
+ break;
+ }
+ if (err == 0)
+ zil_commit_writer_stall(zilog);
}
}
}
@@ -3177,10 +3361,23 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
zil_prune_commit_list(zilog);
zil_process_commit_list(zilog, zcw, &ilwbs);
+ /*
+	 * If the ZIL failed somewhere inside zil_process_commit_list(), it
+	 * will be because a fallback to txg_wait_synced_flags() happened at
+	 * some point (eg zil_commit_writer_stall()). All cases should have
+	 * issued and emptied ilwbs, so there will be nothing to do in the
+	 * issue loop below. That's why we don't have to plumb the error
+	 * value back from zil_process_commit_list(), and don't have to skip it.
+ */
+ IMPLY(zilog->zl_restart_txg > 0, list_is_empty(&ilwbs));
+
out:
mutex_exit(&zilog->zl_issuer_lock);
- while ((lwb = list_remove_head(&ilwbs)) != NULL)
- zil_lwb_write_issue(zilog, lwb);
+ int err = 0;
+ while ((lwb = list_remove_head(&ilwbs)) != NULL) {
+ if (err == 0)
+ err = zil_lwb_write_issue(zilog, lwb);
+ }
list_destroy(&ilwbs);
return (wtxg);
}
@@ -3436,7 +3633,7 @@ static void
zil_free_commit_waiter(zil_commit_waiter_t *zcw)
{
ASSERT(!list_link_active(&zcw->zcw_node));
- ASSERT3P(zcw->zcw_lwb, ==, NULL);
+ ASSERT0P(zcw->zcw_lwb);
ASSERT3B(zcw->zcw_done, ==, B_TRUE);
mutex_destroy(&zcw->zcw_lock);
cv_destroy(&zcw->zcw_cv);
@@ -3473,6 +3670,96 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
}
/*
+ * Crash the ZIL. This is something like suspending, but abandons the ZIL
+ * without further IO until the wanted txg completes. No effort is made to
+ * close the on-disk chain or do any other on-disk work, as the pool may
+ * have suspended. zil_sync() will handle cleanup as normal and restart the
+ * ZIL once enough txgs have passed.
+ */
+static void
+zil_crash(zilog_t *zilog)
+{
+ mutex_enter(&zilog->zl_lock);
+
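+	/*
+	 * Restart only once every txg that could currently be open has
+	 * synced, ie TXG_CONCURRENT_STATES txgs past the currently-syncing
+	 * one.
+	 */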
+ uint64_t txg = spa_syncing_txg(zilog->zl_spa);
+ uint64_t restart_txg =
+ spa_syncing_txg(zilog->zl_spa) + TXG_CONCURRENT_STATES;
+
+ if (zilog->zl_restart_txg > 0) {
+ /*
+ * If the ZIL is already crashed, it's almost certainly because
+ * we lost a race involving multiple callers from
+ * zil_commit_impl().
+ */
+
+ /*
+ * This sanity check is to support my understanding that in the
+ * event of multiple callers to zil_crash(), only one of them
+ * can possibly be in the codepath to issue lwbs; the rest
+ * should be calling from zil_commit_impl() after their waiters
+ * have completed. As I understand it, a second thread trying
+ * to issue will eventually wait on zl_issuer_lock, and then
+ * have no work to do and leave.
+ *
+		 * If more lwbs had been created and issued between zil_crash()
+		 * calls, then we would probably just need to take those too,
+		 * add them to the crash list and clean them up, but it
+		 * complicates this function and I don't think it can happen.
+ */
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+
+ zilog->zl_restart_txg = restart_txg;
+
+ /*
+ * Capture any live LWBs. Depending on the state of the pool they may
+ * represent in-flight IO that won't return for some time, and we want
+ * to make sure they don't get in the way of normal ZIL operation.
+ */
+ ASSERT(list_is_empty(&zilog->zl_lwb_crash_list));
+ list_move_tail(&zilog->zl_lwb_crash_list, &zilog->zl_lwb_list);
+
+ /*
+	 * Run through the LWB list, erroring all itxs and signalling the
+	 * error to all waiters.
+ */
+ for (lwb_t *lwb = list_head(&zilog->zl_lwb_crash_list); lwb != NULL;
+ lwb = list_next(&zilog->zl_lwb_crash_list, lwb)) {
+ itx_t *itx;
+ while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
+ zil_itx_destroy(itx, EIO);
+
+ zil_commit_waiter_t *zcw;
+ while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
+ mutex_enter(&zcw->zcw_lock);
+ zcw->zcw_lwb = NULL;
+ zcw->zcw_zio_error = EIO;
+ zcw->zcw_done = B_TRUE;
+ cv_broadcast(&zcw->zcw_cv);
+ mutex_exit(&zcw->zcw_lock);
+ }
+ }
+
+ /*
+ * Zero the ZIL header bp after the ZIL restarts. We'll free it in
+ * zil_clean() when we clean up the lwbs.
+ */
+ zil_header_t *zh = zil_header_in_syncing_context(zilog);
+ BP_ZERO(&zh->zh_log);
+
+ /*
+ * Mark this ZIL dirty on the next txg, so that zil_clean() will be
+ * called for cleanup.
+ */
+ zilog_dirty(zilog, txg+1);
+
+ mutex_exit(&zilog->zl_lock);
+}
+
+/*
* Commit ZFS Intent Log transactions (itxs) to stable storage.
*
* When writing ZIL transactions to the on-disk representation of the
@@ -3587,9 +3874,17 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
* but the order in which they complete will be the same order in
* which they were created.
*/
-void
+static int zil_commit_impl(zilog_t *zilog, uint64_t foid);
+
+int
zil_commit(zilog_t *zilog, uint64_t foid)
{
+ return (zil_commit_flags(zilog, foid, ZIL_COMMIT_FAILMODE));
+}
+
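+/*
+ * Like zil_commit(), but the caller chooses how a failed or crashed ZIL is
+ * reported. With ZIL_COMMIT_FAILMODE set, the pool's failmode property is
+ * honoured: failmode=continue returns EIO, otherwise the call blocks until
+ * the pool returns. Without it, EIO is always returned on failure and the
+ * caller must handle it itself, eg as zil_close() does:
+ *
+ *	if (zil_commit_flags(zilog, 0, ZIL_COMMIT_NOW) != 0)
+ *		txg_wait_synced(zilog->zl_dmu_pool, 0);
+ */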
+int
+zil_commit_flags(zilog_t *zilog, uint64_t foid, zil_commit_flag_t flags)
+{
/*
* We should never attempt to call zil_commit on a snapshot for
* a couple of reasons:
@@ -3606,7 +3901,7 @@ zil_commit(zilog_t *zilog, uint64_t foid)
ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
if (zilog->zl_sync == ZFS_SYNC_DISABLED)
- return;
+ return (0);
if (!spa_writeable(zilog->zl_spa)) {
/*
@@ -3617,10 +3912,23 @@ zil_commit(zilog_t *zilog, uint64_t foid)
* verifying that truth before we return to the caller.
*/
ASSERT(list_is_empty(&zilog->zl_lwb_list));
- ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
+ ASSERT0P(zilog->zl_last_lwb_opened);
for (int i = 0; i < TXG_SIZE; i++)
- ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
- return;
+ ASSERT0P(zilog->zl_itxg[i].itxg_itxs);
+ return (0);
+ }
+
+ int err = 0;
+
+ /*
+ * If the ZIL crashed, bypass it entirely, and rely on txg_wait_sync()
+ * to get the data out to disk.
+ */
+ if (zilog->zl_restart_txg > 0) {
+ ZIL_STAT_BUMP(zilog, zil_commit_crash_count);
+ err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0,
+ TXG_WAIT_SUSPEND);
+ goto out;
}
/*
@@ -3632,14 +3940,43 @@ zil_commit(zilog_t *zilog, uint64_t foid)
*/
if (zilog->zl_suspend > 0) {
ZIL_STAT_BUMP(zilog, zil_commit_suspend_count);
- txg_wait_synced(zilog->zl_dmu_pool, 0);
- return;
+ err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0,
+ TXG_WAIT_SUSPEND);
+ if (err != 0) {
+ ASSERT3U(err, ==, ESHUTDOWN);
+ zil_crash(zilog);
+ }
+ goto out;
}
- zil_commit_impl(zilog, foid);
+ err = zil_commit_impl(zilog, foid);
+
+out:
+ if (err == 0)
+ return (0);
+
+ /*
+ * The ZIL write failed and the pool is suspended. There's nothing else
+ * we can do except return or block.
+ */
+ ASSERT3U(err, ==, ESHUTDOWN);
+
+ /*
+	 * Return the error if failmode=continue, or if the caller will
+	 * handle it directly.
+ */
+ if (!(flags & ZIL_COMMIT_FAILMODE) ||
+ spa_get_failmode(zilog->zl_spa) == ZIO_FAILURE_MODE_CONTINUE)
+ return (SET_ERROR(EIO));
+
+ /*
+ * Block until the pool returns. We assume that the data will make
+ * it out to disk in the end, and so return success.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ return (0);
}
-void
+static int
zil_commit_impl(zilog_t *zilog, uint64_t foid)
{
ZIL_STAT_BUMP(zilog, zil_commit_count);
@@ -3676,6 +4013,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid)
uint64_t wtxg = zil_commit_writer(zilog, zcw);
zil_commit_waiter(zilog, zcw);
+ int err = 0;
if (zcw->zcw_zio_error != 0) {
/*
* If there was an error writing out the ZIL blocks that
@@ -3688,13 +4026,29 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid)
ZIL_STAT_BUMP(zilog, zil_commit_error_count);
DTRACE_PROBE2(zil__commit__io__error,
zilog_t *, zilog, zil_commit_waiter_t *, zcw);
- txg_wait_synced(zilog->zl_dmu_pool, 0);
+ err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0,
+ TXG_WAIT_SUSPEND);
} else if (wtxg != 0) {
ZIL_STAT_BUMP(zilog, zil_commit_suspend_count);
- txg_wait_synced(zilog->zl_dmu_pool, wtxg);
+ err = txg_wait_synced_flags(zilog->zl_dmu_pool, wtxg,
+ TXG_WAIT_SUSPEND);
}
zil_free_commit_waiter(zcw);
+
+ if (err == 0)
+ return (0);
+
+ /*
+	 * The ZIL write failed, and then the pool failed in the fallback to
+	 * txg_wait_synced_flags(). We don't know whether the data made it to
+	 * disk, and the pool is probably suspended so we don't know when it's
+	 * coming back. All we can do is shut down and return the error to the
+	 * caller.
+ */
+ ASSERT3U(err, ==, ESHUTDOWN);
+ zil_crash(zilog);
+ return (err);
}
/*
@@ -3720,7 +4074,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
mutex_enter(&zilog->zl_lock);
- ASSERT(zilog->zl_stop_sync == 0);
+ ASSERT0(zilog->zl_stop_sync);
if (*replayed_seq != 0) {
ASSERT(zh->zh_replay_seq < *replayed_seq);
@@ -3890,6 +4244,8 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
offsetof(lwb_t, lwb_node));
+ list_create(&zilog->zl_lwb_crash_list, sizeof (lwb_t),
+ offsetof(lwb_t, lwb_node));
list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
offsetof(itx_t, itx_node));
@@ -3914,9 +4270,12 @@ zil_free(zilog_t *zilog)
ASSERT0(zilog->zl_suspend);
ASSERT0(zilog->zl_suspending);
+ ASSERT0(zilog->zl_restart_txg);
ASSERT(list_is_empty(&zilog->zl_lwb_list));
list_destroy(&zilog->zl_lwb_list);
+ ASSERT(list_is_empty(&zilog->zl_lwb_crash_list));
+ list_destroy(&zilog->zl_lwb_crash_list);
ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
list_destroy(&zilog->zl_itx_commit_list);
@@ -3952,8 +4311,8 @@ zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums)
{
zilog_t *zilog = dmu_objset_zil(os);
- ASSERT3P(zilog->zl_get_data, ==, NULL);
- ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
+ ASSERT0P(zilog->zl_get_data);
+ ASSERT0P(zilog->zl_last_lwb_opened);
ASSERT(list_is_empty(&zilog->zl_lwb_list));
zilog->zl_get_data = get_data;
@@ -3972,7 +4331,8 @@ zil_close(zilog_t *zilog)
uint64_t txg;
if (!dmu_objset_is_snapshot(zilog->zl_os)) {
- zil_commit(zilog, 0);
+ if (zil_commit_flags(zilog, 0, ZIL_COMMIT_NOW) != 0)
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
} else {
ASSERT(list_is_empty(&zilog->zl_lwb_list));
ASSERT0(zilog->zl_dirty_max_txg);
@@ -4073,6 +4433,17 @@ zil_suspend(const char *osname, void **cookiep)
return (SET_ERROR(EBUSY));
}
+ if (zilog->zl_restart_txg > 0) {
+ /*
+ * ZIL crashed. It effectively _is_ suspended, but callers
+ * are usually trying to make sure it's empty on-disk, which
+ * we can't guarantee right now.
+ */
+ mutex_exit(&zilog->zl_lock);
+ dmu_objset_rele(os, suspend_tag);
+ return (SET_ERROR(EBUSY));
+ }
+
/*
* Don't put a long hold in the cases where we can avoid it. This
* is when there is no cookie so we are doing a suspend & resume
@@ -4105,6 +4476,11 @@ zil_suspend(const char *osname, void **cookiep)
zil_resume(os);
else
*cookiep = os;
+
+ if (zilog->zl_restart_txg > 0)
+ /* ZIL crashed while we were waiting. */
+ return (SET_ERROR(EBUSY));
+
return (0);
}
@@ -4146,17 +4522,34 @@ zil_suspend(const char *osname, void **cookiep)
* would just call txg_wait_synced(), because zl_suspend is set.
* txg_wait_synced() doesn't wait for these lwb's to be
* LWB_STATE_FLUSH_DONE before returning.
+ *
+ * However, zil_commit_impl() itself can return an error if any of the
+	 * lwbs fail, or the pool suspends in the fallback to
+	 * txg_wait_synced_flags(), which affects what we do next, so we
+ * capture that error.
*/
- zil_commit_impl(zilog, 0);
+ error = zil_commit_impl(zilog, 0);
+ if (error == ESHUTDOWN)
+ /* zil_commit_impl() has called zil_crash() already */
+ error = SET_ERROR(EBUSY);
/*
* Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
* use txg_wait_synced() to ensure the data from the zilog has
* migrated to the main pool before calling zil_destroy().
*/
- txg_wait_synced(zilog->zl_dmu_pool, 0);
+ if (error == 0) {
+ error = txg_wait_synced_flags(zilog->zl_dmu_pool, 0,
+ TXG_WAIT_SUSPEND);
+ if (error != 0) {
+ ASSERT3U(error, ==, ESHUTDOWN);
+ zil_crash(zilog);
+ error = SET_ERROR(EBUSY);
+ }
+ }
- zil_destroy(zilog, B_FALSE);
+ if (error == 0)
+ zil_destroy(zilog, B_FALSE);
mutex_enter(&zilog->zl_lock);
zilog->zl_suspending = B_FALSE;
@@ -4170,7 +4563,8 @@ zil_suspend(const char *osname, void **cookiep)
zil_resume(os);
else
*cookiep = os;
- return (0);
+
+ return (error);
}
void
@@ -4333,7 +4727,7 @@ zil_replay(objset_t *os, void *arg,
zilog->zl_replay = B_TRUE;
zilog->zl_replay_time = ddi_get_lbolt();
- ASSERT(zilog->zl_replay_blks == 0);
+ ASSERT0(zilog->zl_replay_blks);
(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
zh->zh_claim_txg, B_TRUE);
vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
@@ -4418,3 +4812,9 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW,
"Limit in bytes WR_COPIED size");
+
+ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, UINT, ZMOD_RW,
+ "Largest write size to store data into ZIL");
+
+ZFS_MODULE_PARAM(zfs_zil, zil_, special_is_slog, INT, ZMOD_RW,
+ "Treat special vdevs as SLOG");