Diffstat (limited to 'sys/contrib/openzfs/module/zfs/zil.c')
-rw-r--r-- | sys/contrib/openzfs/module/zfs/zil.c | 780
1 file changed, 598 insertions(+), 182 deletions(-)
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index 6e4f84257407..0307df55aa21 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -24,6 +24,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2018 Datto Inc. + * Copyright (c) 2025, Klara, Inc. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -103,6 +104,7 @@ static zil_kstat_values_t zil_stats = { { "zil_commit_error_count", KSTAT_DATA_UINT64 }, { "zil_commit_stall_count", KSTAT_DATA_UINT64 }, { "zil_commit_suspend_count", KSTAT_DATA_UINT64 }, + { "zil_commit_crash_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, @@ -145,7 +147,7 @@ static uint64_t zil_slog_bulk = 64 * 1024 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; -static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); +static int zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); static uint64_t zil_max_waste_space(zilog_t *zilog); @@ -367,6 +369,7 @@ zil_sums_init(zil_sums_t *zs) wmsum_init(&zs->zil_commit_error_count, 0); wmsum_init(&zs->zil_commit_stall_count, 0); wmsum_init(&zs->zil_commit_suspend_count, 0); + wmsum_init(&zs->zil_commit_crash_count, 0); wmsum_init(&zs->zil_itx_count, 0); wmsum_init(&zs->zil_itx_indirect_count, 0); wmsum_init(&zs->zil_itx_indirect_bytes, 0); @@ -392,6 +395,7 @@ zil_sums_fini(zil_sums_t *zs) wmsum_fini(&zs->zil_commit_error_count); wmsum_fini(&zs->zil_commit_stall_count); wmsum_fini(&zs->zil_commit_suspend_count); + wmsum_fini(&zs->zil_commit_crash_count); wmsum_fini(&zs->zil_itx_count); wmsum_fini(&zs->zil_itx_indirect_count); wmsum_fini(&zs->zil_itx_indirect_bytes); @@ -422,6 +426,8 @@ zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) wmsum_value(&zil_sums->zil_commit_stall_count); zs->zil_commit_suspend_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_suspend_count); + zs->zil_commit_crash_count.value.ui64 = + wmsum_value(&zil_sums->zil_commit_crash_count); zs->zil_itx_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_count); zs->zil_itx_indirect_count.value.ui64 = @@ -813,34 +819,37 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) * we choose them here and later make the block allocation match. 
*/ static lwb_t * -zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, - uint64_t txg, lwb_state_t state) +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, int min_sz, int sz, + boolean_t slog, uint64_t txg) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + lwb->lwb_flags = 0; lwb->lwb_zilog = zilog; if (bp) { lwb->lwb_blk = *bp; - lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) + lwb->lwb_flags |= LWB_FLAG_SLIM; sz = BP_GET_LSIZE(bp); + lwb->lwb_min_sz = sz; } else { BP_ZERO(&lwb->lwb_blk); - lwb->lwb_slim = (spa_version(zilog->zl_spa) >= - SPA_VERSION_SLIM_ZIL); + if (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL) + lwb->lwb_flags |= LWB_FLAG_SLIM; + lwb->lwb_min_sz = min_sz; } - lwb->lwb_slog = slog; + if (slog) + lwb->lwb_flags |= LWB_FLAG_SLOG; lwb->lwb_error = 0; - if (lwb->lwb_slim) { - lwb->lwb_nmax = sz; - lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); - } else { - lwb->lwb_nmax = sz - sizeof (zil_chain_t); - lwb->lwb_nused = lwb->lwb_nfilled = 0; - } + /* + * Buffer allocation and capacity setup will be done in + * zil_lwb_write_open() when the LWB is opened for ITX assignment. + */ + lwb->lwb_nmax = lwb->lwb_nused = lwb->lwb_nfilled = 0; lwb->lwb_sz = sz; - lwb->lwb_state = state; - lwb->lwb_buf = zio_buf_alloc(sz); + lwb->lwb_buf = NULL; + lwb->lwb_state = LWB_STATE_NEW; lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; @@ -851,8 +860,6 @@ zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); - if (state != LWB_STATE_NEW) - zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); @@ -864,15 +871,15 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_FLUSH_DONE); - ASSERT3P(lwb->lwb_child_zio, ==, NULL); - ASSERT3P(lwb->lwb_write_zio, ==, NULL); - ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT0P(lwb->lwb_child_zio); + ASSERT0P(lwb->lwb_write_zio); + ASSERT0P(lwb->lwb_root_zio); ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); VERIFY(list_is_empty(&lwb->lwb_itxs)); VERIFY(list_is_empty(&lwb->lwb_waiters)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); + ASSERT(!MUTEX_HELD(&lwb->lwb_lock)); /* * Clear the zilog's field to indicate this lwb is no longer @@ -991,8 +998,8 @@ zil_create(zilog_t *zilog) */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); - ASSERT(zh->zh_claim_txg == 0); - ASSERT(zh->zh_replay_seq == 0); + ASSERT0(zh->zh_claim_txg); + ASSERT0(zh->zh_replay_seq); blk = zh->zh_log; @@ -1013,7 +1020,7 @@ zil_create(zilog_t *zilog) } error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, - ZIL_MIN_BLKSZ, &slog); + ZIL_MIN_BLKSZ, ZIL_MIN_BLKSZ, &slog, B_TRUE); if (error == 0) zil_init_log_chain(zilog, &blk); } @@ -1022,7 +1029,7 @@ zil_create(zilog_t *zilog) * Allocate a log write block (lwb) for the first log block. 
*/ if (error == 0) - lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); + lwb = zil_alloc_lwb(zilog, &blk, 0, 0, slog, txg); /* * If we just allocated the first log block, commit our transaction @@ -1104,7 +1111,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) zilog->zl_keep_first = keep_first; if (!list_is_empty(&zilog->zl_lwb_list)) { - ASSERT(zh->zh_claim_txg == 0); + ASSERT0(zh->zh_claim_txg); VERIFY(!keep_first); while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_buf != NULL) @@ -1250,7 +1257,7 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) blkptr_t *bp; int error; - ASSERT(tx == NULL); + ASSERT0P(tx); error = dmu_objset_from_ds(ds, &os); if (error != 0) { @@ -1318,10 +1325,12 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) * zil_commit() is racing with spa_sync(). */ static void -zil_commit_waiter_skip(zil_commit_waiter_t *zcw) +zil_commit_waiter_done(zil_commit_waiter_t *zcw, int err) { mutex_enter(&zcw->zcw_lock); ASSERT3B(zcw->zcw_done, ==, B_FALSE); + zcw->zcw_lwb = NULL; + zcw->zcw_error = err; zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); @@ -1351,7 +1360,7 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(&lwb->lwb_waiters, zcw); - ASSERT3P(zcw->zcw_lwb, ==, NULL); + ASSERT0P(zcw->zcw_lwb); zcw->zcw_lwb = lwb; } @@ -1365,7 +1374,7 @@ zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(nolwb, zcw); - ASSERT3P(zcw->zcw_lwb, ==, NULL); + ASSERT0P(zcw->zcw_lwb); } void @@ -1383,7 +1392,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) if (zil_nocacheflush) return; - mutex_enter(&lwb->lwb_vdev_lock); + mutex_enter(&lwb->lwb_lock); for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { @@ -1392,7 +1401,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) avl_insert(t, zv, where); } } - mutex_exit(&lwb->lwb_vdev_lock); + mutex_exit(&lwb->lwb_lock); } static void @@ -1409,12 +1418,12 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) /* * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does - * not need the protection of lwb_vdev_lock (it will only be modified + * not need the protection of lwb_lock (it will only be modified * while holding zilog->zl_lock) as its writes and those of its * children have all completed. The younger 'nlwb' may be waiting on * future writes to additional vdevs. */ - mutex_enter(&nlwb->lwb_vdev_lock); + mutex_enter(&nlwb->lwb_lock); /* * Tear down the 'lwb' vdev tree, ensuring that entries which do not * exist in 'nlwb' are moved to it, freeing any would-be duplicates. @@ -1428,7 +1437,7 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) kmem_free(zv, sizeof (*zv)); } } - mutex_exit(&nlwb->lwb_vdev_lock); + mutex_exit(&nlwb->lwb_lock); } void @@ -1482,13 +1491,9 @@ zil_lwb_flush_vdevs_done(zio_t *zio) } while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { - mutex_enter(&zcw->zcw_lock); - - ASSERT3P(zcw->zcw_lwb, ==, lwb); - zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been * propagated "up" to this specific LWB's root ZIO, in @@ -1503,14 +1508,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio) * errors not being handled correctly here. 
See the * comment above the call to "zio_flush" for details. */ - - zcw->zcw_zio_error = zio->io_error; - - ASSERT3B(zcw->zcw_done, ==, B_FALSE); - zcw->zcw_done = B_TRUE; - cv_broadcast(&zcw->zcw_cv); - - mutex_exit(&zcw->zcw_lock); + zil_commit_waiter_done(zcw, zio->io_error); } uint64_t txg = lwb->lwb_issued_txg; @@ -1582,7 +1580,7 @@ zil_lwb_write_done(zio_t *zio) avl_tree_t *t = &lwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; - lwb_t *nlwb; + lwb_t *nlwb = NULL; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); @@ -1602,9 +1600,11 @@ zil_lwb_write_done(zio_t *zio) * its write ZIO a parent this ZIO. In such case we can not defer * our flushes or below may be a race between the done callbacks. */ - nlwb = list_next(&zilog->zl_lwb_list, lwb); - if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) - nlwb = NULL; + if (!(lwb->lwb_flags & LWB_FLAG_CRASHED)) { + nlwb = list_next(&zilog->zl_lwb_list, lwb); + if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) + nlwb = NULL; + } mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) @@ -1618,12 +1618,17 @@ zil_lwb_write_done(zio_t *zio) * written out. * * Additionally, we don't perform any further error handling at - * this point (e.g. setting "zcw_zio_error" appropriately), as - * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus, - * we expect any error seen here, to have been propagated to - * that function). + * this point (e.g. setting "zcw_error" appropriately), as we + * expect that to occur in "zil_lwb_flush_vdevs_done" (thus, we + * expect any error seen here, to have been propagated to that + * function). + * + * Note that we treat a "crashed" LWB as though it was in error, + * even if it did appear to succeed, because we've already + * signaled error and cleaned up waiters and committers in + * zil_crash(); we just want to clean up and get out of here. */ - if (zio->io_error != 0) { + if (zio->io_error != 0 || (lwb->lwb_flags & LWB_FLAG_CRASHED)) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; @@ -1736,10 +1741,26 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) return; } + mutex_enter(&lwb->lwb_lock); mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); + mutex_exit(&lwb->lwb_lock); + + /* + * Allocate buffer and set up LWB capacities. + */ + ASSERT0P(lwb->lwb_buf); + ASSERT3U(lwb->lwb_sz, >, 0); + lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz); + if (lwb->lwb_flags & LWB_FLAG_SLIM) { + lwb->lwb_nmax = lwb->lwb_sz; + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + } else { + lwb->lwb_nmax = lwb->lwb_sz - sizeof (zil_chain_t); + lwb->lwb_nused = lwb->lwb_nfilled = 0; + } } /* @@ -1756,6 +1777,8 @@ static uint_t zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) { uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t); + uint_t waste = zil_max_waste_space(zilog); + waste = MAX(waste, zilog->zl_cur_max); if (size <= md) { /* @@ -1766,9 +1789,10 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) } else if (size > 8 * md) { /* * Big bursts use maximum blocks. The first block size - * is hard to predict, but it does not really matter. + * is hard to predict, but we need at least enough space + * to make reasonable progress. 
*/ - *minsize = 0; + *minsize = waste; return (md); } @@ -1781,57 +1805,52 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) uint_t s = size; uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t)); uint_t chunk = DIV_ROUND_UP(s, n); - uint_t waste = zil_max_waste_space(zilog); - waste = MAX(waste, zilog->zl_cur_max); if (chunk <= md - waste) { *minsize = MAX(s - (md - waste) * (n - 1), waste); return (chunk); } else { - *minsize = 0; + *minsize = waste; return (md); } } /* * Try to predict next block size based on previous history. Make prediction - * sufficient for 7 of 8 previous bursts. Don't try to save if the saving is - * less then 50%, extra writes may cost more, but we don't want single spike - * to badly affect our predictions. + * sufficient for 7 of 8 previous bursts, but don't try to save if the saving + * is less then 50%. Extra writes may cost more, but we don't want single + * spike to badly affect our predictions. */ -static uint_t -zil_lwb_predict(zilog_t *zilog) +static void +zil_lwb_predict(zilog_t *zilog, uint64_t *min_predict, uint64_t *max_predict) { - uint_t m, o; + uint_t m1 = 0, m2 = 0, o; - /* If we are in the middle of a burst, take it into account also. */ - if (zilog->zl_cur_size > 0) { - o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m); - } else { + /* If we are in the middle of a burst, take it as another data point. */ + if (zilog->zl_cur_size > 0) + o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m1); + else o = UINT_MAX; - m = 0; - } - /* Find minimum optimal size. We don't need to go below that. */ - for (int i = 0; i < ZIL_BURSTS; i++) - o = MIN(o, zilog->zl_prev_opt[i]); - - /* Find two biggest minimal first block sizes above the optimal. */ - uint_t m1 = MAX(m, o), m2 = o; + /* Find two largest minimal first block sizes. */ for (int i = 0; i < ZIL_BURSTS; i++) { - m = zilog->zl_prev_min[i]; - if (m >= m1) { + uint_t cur = zilog->zl_prev_min[i]; + if (cur >= m1) { m2 = m1; - m1 = m; - } else if (m > m2) { - m2 = m; + m1 = cur; + } else if (cur > m2) { + m2 = cur; } } - /* - * If second minimum size gives 50% saving -- use it. It may cost us - * one additional write later, but the space saving is just too big. - */ - return ((m1 < m2 * 2) ? m1 : m2); + /* Minimum should guarantee progress in most cases. */ + *min_predict = (m1 < m2 * 2) ? m1 : m2; + + /* Maximum doesn't need to go below the minimum optimal size. */ + for (int i = 0; i < ZIL_BURSTS; i++) + o = MIN(o, zilog->zl_prev_opt[i]); + m1 = MAX(m1, o); + m2 = MAX(m2, o); + *max_predict = (m1 < m2 * 2) ? m1 : m2; } /* @@ -1839,12 +1858,13 @@ zil_lwb_predict(zilog_t *zilog) * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * -zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb) { - uint64_t blksz, plan, plan2; + uint64_t minbs, maxbs; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + membar_producer(); lwb->lwb_state = LWB_STATE_CLOSED; /* @@ -1869,33 +1889,40 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) * Try to predict what can it be and plan for the worst case. 
*/ uint_t m; - plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + maxbs = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + minbs = m; if (zilog->zl_parallel) { - plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left + - zil_lwb_predict(zilog), &m); - if (plan < plan2) - plan = plan2; + uint64_t minp, maxp; + zil_lwb_predict(zilog, &minp, &maxp); + maxp = zil_lwb_plan(zilog, zilog->zl_cur_left + maxp, + &m); + if (maxbs < maxp) + maxbs = maxp; } } else { /* * The previous burst is done and we can only predict what * will come next. */ - plan = zil_lwb_predict(zilog); + zil_lwb_predict(zilog, &minbs, &maxbs); } - blksz = plan + sizeof (zil_chain_t); - blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t); - blksz = MIN(blksz, zilog->zl_max_block_size); - DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz, - uint64_t, plan); - return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state)); + minbs += sizeof (zil_chain_t); + maxbs += sizeof (zil_chain_t); + minbs = P2ROUNDUP_TYPED(minbs, ZIL_MIN_BLKSZ, uint64_t); + maxbs = P2ROUNDUP_TYPED(maxbs, ZIL_MIN_BLKSZ, uint64_t); + maxbs = MIN(maxbs, zilog->zl_max_block_size); + minbs = MIN(minbs, maxbs); + DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, minbs, + uint64_t, maxbs); + + return (zil_alloc_lwb(zilog, NULL, minbs, maxbs, 0, 0)); } /* * Finalize previously closed block and issue the write zio. */ -static void +static int zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { spa_t *spa = zilog->zl_spa; @@ -1909,8 +1936,13 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) /* Actually fill the lwb with the data. */ for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; - itx = list_next(&lwb->lwb_itxs, itx)) - zil_lwb_commit(zilog, lwb, itx); + itx = list_next(&lwb->lwb_itxs, itx)) { + error = zil_lwb_commit(zilog, lwb, itx); + if (error != 0) { + ASSERT3U(error, ==, ESHUTDOWN); + return (error); + } + } lwb->lwb_nused = lwb->lwb_nfilled; ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); @@ -1928,19 +1960,21 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) lwb->lwb_state = LWB_STATE_READY; if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { mutex_exit(&zilog->zl_lock); - return; + return (0); } mutex_exit(&zilog->zl_lock); next_lwb: - if (lwb->lwb_slim) + if (lwb->lwb_flags & LWB_FLAG_SLIM) zilc = (zil_chain_t *)lwb->lwb_buf; else zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); - int wsz = lwb->lwb_sz; + uint64_t alloc_size = BP_GET_LSIZE(&lwb->lwb_blk); + int wsz = alloc_size; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); - if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk) + if (!(lwb->lwb_flags & LWB_FLAG_SLOG) || + zilog->zl_cur_size <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; @@ -1948,16 +1982,17 @@ next_lwb: ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, - &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, + &lwb->lwb_blk, lwb_abd, alloc_size, zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); zil_lwb_add_block(lwb, &lwb->lwb_blk); - if (lwb->lwb_slim) { + if (lwb->lwb_flags & LWB_FLAG_SLIM) { /* For Slim ZIL only write what is used. 
*/ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); - ASSERT3S(wsz, <=, lwb->lwb_sz); - zio_shrink(lwb->lwb_write_zio, wsz); + ASSERT3S(wsz, <=, alloc_size); + if (wsz < alloc_size) + zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; } memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); @@ -1993,13 +2028,53 @@ next_lwb: BP_ZERO(bp); error = lwb->lwb_error; if (error == 0) { - error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, - &slog); + /* + * Allocation flexibility depends on LWB state: + * if NEW: allow range allocation and larger sizes; + * if OPENED: use fixed predetermined allocation size; + * if CLOSED + Slim: allocate precisely for actual usage. + */ + boolean_t flexible = (nlwb->lwb_state == LWB_STATE_NEW); + if (flexible) { + /* We need to prevent opening till we update lwb_sz. */ + mutex_enter(&nlwb->lwb_lock); + flexible = (nlwb->lwb_state == LWB_STATE_NEW); + if (!flexible) + mutex_exit(&nlwb->lwb_lock); /* We lost. */ + } + boolean_t closed_slim = (nlwb->lwb_state == LWB_STATE_CLOSED && + (lwb->lwb_flags & LWB_FLAG_SLIM)); + + uint64_t min_size, max_size; + if (closed_slim) { + /* This transition is racy, but only one way. */ + membar_consumer(); + min_size = max_size = P2ROUNDUP_TYPED(nlwb->lwb_nused, + ZIL_MIN_BLKSZ, uint64_t); + } else if (flexible) { + min_size = nlwb->lwb_min_sz; + max_size = nlwb->lwb_sz; + } else { + min_size = max_size = nlwb->lwb_sz; + } + + error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, + min_size, max_size, &slog, flexible); + if (error == 0) { + if (closed_slim) + ASSERT3U(BP_GET_LSIZE(bp), ==, max_size); + else if (flexible) + nlwb->lwb_sz = BP_GET_LSIZE(bp); + else + ASSERT3U(BP_GET_LSIZE(bp), ==, nlwb->lwb_sz); + } + if (flexible) + mutex_exit(&nlwb->lwb_lock); } if (error == 0) { ASSERT3U(BP_GET_BIRTH(bp), ==, txg); - BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : - ZIO_CHECKSUM_ZILOG); + BP_SET_CHECKSUM(bp, (nlwb->lwb_flags & LWB_FLAG_SLIM) ? + ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; } @@ -2028,14 +2103,15 @@ next_lwb: if (nlwb) { nlwb->lwb_blk = *bp; nlwb->lwb_error = error; - nlwb->lwb_slog = slog; + if (slog) + nlwb->lwb_flags |= LWB_FLAG_SLOG; nlwb->lwb_alloc_txg = txg; if (nlwb->lwb_state != LWB_STATE_READY) nlwb = NULL; } mutex_exit(&zilog->zl_lock); - if (lwb->lwb_slog) { + if (lwb->lwb_flags & LWB_FLAG_SLOG) { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, lwb->lwb_nused); @@ -2065,6 +2141,8 @@ next_lwb: lwb = nlwb; if (lwb) goto next_lwb; + + return (0); } /* @@ -2207,7 +2285,6 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); - ASSERT3P(lwb->lwb_buf, !=, NULL); zil_lwb_write_open(zilog, lwb); @@ -2249,9 +2326,10 @@ cont: (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); + lwb = zil_lwb_write_close(zilog, lwb); if (lwb == NULL) return (NULL); + zil_lwb_write_open(zilog, lwb); lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; } @@ -2308,11 +2386,13 @@ cont: return (lwb); } +static void zil_crash(zilog_t *zilog); + /* * Fill the actual transaction data into the lwb, following zil_lwb_assign(). * Does not require locking. 
*/ -static void +static int zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) { lr_t *lr, *lrb; @@ -2324,7 +2404,7 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) lrw = (lr_write_t *)lr; if (lr->lrc_txtype == TX_COMMIT) - return; + return (0); reclen = lr->lrc_reclen; dlen = zil_itx_data_size(itx); @@ -2410,16 +2490,35 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) ". Falling back to txg_wait_synced().", error); zfs_fallthrough; - case EIO: - txg_wait_synced(zilog->zl_dmu_pool, - lr->lrc_txg); + case EIO: { + int error = txg_wait_synced_flags( + zilog->zl_dmu_pool, + lr->lrc_txg, TXG_WAIT_SUSPEND); + if (error != 0) { + ASSERT3U(error, ==, ESHUTDOWN); + /* + * zil_lwb_commit() is called from a + * loop over a list of itxs at the + * top of zil_lwb_write_issue(), which + * itself is called from a loop over a + * list of lwbs in various places. + * zil_crash() will free those itxs + * and sometimes the lwbs, so they + * are invalid when zil_crash() returns. + * Callers must pretty much abort + * immediately. + */ + zil_crash(zilog); + return (error); + } zfs_fallthrough; + } case ENOENT: zfs_fallthrough; case EEXIST: zfs_fallthrough; case EALREADY: - return; + return (0); } } } @@ -2427,6 +2526,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) lwb->lwb_nfilled += reclen + dlen; ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); + + return (0); } itx_t * @@ -2468,7 +2569,7 @@ zil_itx_clone(itx_t *oitx) } void -zil_itx_destroy(itx_t *itx) +zil_itx_destroy(itx_t *itx, int err) { ASSERT3U(itx->itx_size, >=, sizeof (itx_t)); ASSERT3U(itx->itx_lr.lrc_reclen, ==, @@ -2477,7 +2578,7 @@ zil_itx_destroy(itx_t *itx) IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); if (itx->itx_callback != NULL) - itx->itx_callback(itx->itx_callback_data); + itx->itx_callback(itx->itx_callback_data, err); zio_data_buf_free(itx, itx->itx_size); } @@ -2518,9 +2619,9 @@ zil_itxg_clean(void *arg) * called) we will hit this case. */ if (itx->itx_lr.lrc_txtype == TX_COMMIT) - zil_commit_waiter_skip(itx->itx_private); + zil_commit_waiter_done(itx->itx_private, 0); - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); } cookie = NULL; @@ -2530,7 +2631,7 @@ zil_itxg_clean(void *arg) while ((itx = list_remove_head(list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); } list_destroy(list); kmem_free(ian, sizeof (itx_async_node_t)); @@ -2592,7 +2693,7 @@ zil_remove_async(zilog_t *zilog, uint64_t oid) while ((itx = list_remove_head(&clean_list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); } list_destroy(&clean_list); } @@ -2677,6 +2778,68 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) } /* + * Post-crash cleanup. This is called from zil_clean() because it needs to + * do cleanup after every txg until the ZIL is restarted, and zilog_dirty() + * can arrange that easily, unlike zil_sync() which is more complicated to + * get a call to without actual dirty data. 
+ */ +static void +zil_crash_clean(zilog_t *zilog, uint64_t synced_txg) +{ + ASSERT(MUTEX_HELD(&zilog->zl_lock)); + ASSERT3U(zilog->zl_restart_txg, >, 0); + + /* Clean up anything on the crash list from earlier txgs */ + lwb_t *lwb; + while ((lwb = list_head(&zilog->zl_lwb_crash_list)) != NULL) { + if (lwb->lwb_alloc_txg >= synced_txg || + lwb->lwb_max_txg >= synced_txg) { + /* + * This lwb was allocated or updated on this txg, or + * in the future. We stop processing here, to avoid + * the strange situation of freeing a ZIL block on + * on the same or earlier txg than what it was + * allocated for. + * + * We'll take care of it on the next txg. + */ + break; + } + + /* This LWB is from the past, so we can clean it up now. */ + ASSERT(lwb->lwb_flags & LWB_FLAG_CRASHED); + list_remove(&zilog->zl_lwb_crash_list, lwb); + if (lwb->lwb_buf != NULL) + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + /* + * Free on the next txg, since zil_clean() is called + * once synced_txg has already been completed. + */ + zio_free(zilog->zl_spa, synced_txg+1, &lwb->lwb_blk); + zil_free_lwb(zilog, lwb); + } + + if (zilog->zl_restart_txg > synced_txg) { + /* + * Not reached the restart txg yet, so mark the ZIL dirty for + * the next txg and we'll consider it all again then. + */ + zilog_dirty(zilog, synced_txg+1); + return; + } + + /* + * Reached the restart txg, so we can allow new calls to zil_commit(). + * All ZIL txgs have long past so there should be no IO waiting. + */ + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + ASSERT(list_is_empty(&zilog->zl_lwb_crash_list)); + + zilog->zl_restart_txg = 0; +} + +/* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we * have written out the uberblocks (i.e. txg has been committed) so that @@ -2691,6 +2854,15 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg) ASSERT3U(synced_txg, <, ZILTEST_TXG); + /* Do cleanup and restart after crash. */ + if (zilog->zl_restart_txg > 0) { + mutex_enter(&zilog->zl_lock); + /* Make sure we didn't lose a race. */ + if (zilog->zl_restart_txg > 0) + zil_crash_clean(zilog, synced_txg); + mutex_exit(&zilog->zl_lock); + } + mutex_enter(&itxg->itxg_lock); if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { mutex_exit(&itxg->itxg_lock); @@ -2875,7 +3047,7 @@ zil_prune_commit_list(zilog_t *zilog) * never any itx's for it to wait on), so it's * safe to skip this waiter and mark it done. */ - zil_commit_waiter_skip(itx->itx_private); + zil_commit_waiter_done(itx->itx_private, 0); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); } @@ -2883,13 +3055,13 @@ zil_prune_commit_list(zilog_t *zilog) mutex_exit(&zilog->zl_lock); list_remove(&zilog->zl_itx_commit_list, itx); - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); } IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); } -static void +static int zil_commit_writer_stall(zilog_t *zilog) { /* @@ -2914,8 +3086,22 @@ zil_commit_writer_stall(zilog_t *zilog) */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ZIL_STAT_BUMP(zilog, zil_commit_stall_count); - txg_wait_synced(zilog->zl_dmu_pool, 0); + + int err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0, + TXG_WAIT_SUSPEND); + if (err != 0) { + ASSERT3U(err, ==, ESHUTDOWN); + zil_crash(zilog); + } + + /* + * Either zil_sync() has been called to wait for and clean up any + * in-flight LWBs, or zil_crash() has emptied out the list and arranged + * for them to be cleaned up later. 
+ */ ASSERT(list_is_empty(&zilog->zl_lwb_list)); + + return (err); } static void @@ -3082,7 +3268,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); zilog->zl_cur_left -= zil_itx_full_size(itx); - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); } } @@ -3092,10 +3278,21 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * "next" lwb on-disk. When this happens, we must stall * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. + * + * ESHUTDOWN has to be handled carefully here. If we get it, + * then the pool suspended and zil_crash() was called, so we + * need to stop trying and just get an error back to the + * callers. */ - while ((lwb = list_remove_head(ilwbs)) != NULL) - zil_lwb_write_issue(zilog, lwb); - zil_commit_writer_stall(zilog); + int err = 0; + while ((lwb = list_remove_head(ilwbs)) != NULL) { + if (err == 0) + err = zil_lwb_write_issue(zilog, lwb); + } + if (err != ESHUTDOWN) + err = zil_commit_writer_stall(zilog); + if (err == ESHUTDOWN) + err = SET_ERROR(EIO); /* * Additionally, we have to signal and mark the "nolwb" @@ -3105,7 +3302,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) */ zil_commit_waiter_t *zcw; while ((zcw = list_remove_head(&nolwb_waiters)) != NULL) - zil_commit_waiter_skip(zcw); + zil_commit_waiter_done(zcw, err); /* * And finally, we have to destroy the itx's that @@ -3113,7 +3310,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * the itx's callback if one exists for the itx. */ while ((itx = list_remove_head(&nolwb_itxs)) != NULL) - zil_itx_destroy(itx); + zil_itx_destroy(itx, err); } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); @@ -3167,11 +3364,17 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) (!zilog->zl_parallel || zilog->zl_suspend > 0)) { zil_burst_done(zilog); list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + lwb = zil_lwb_write_close(zilog, lwb); if (lwb == NULL) { - while ((lwb = list_remove_head(ilwbs)) != NULL) - zil_lwb_write_issue(zilog, lwb); - zil_commit_writer_stall(zilog); + int err = 0; + while ((lwb = + list_remove_head(ilwbs)) != NULL) { + if (err == 0) + err = zil_lwb_write_issue( + zilog, lwb); + } + if (err != ESHUTDOWN) + (void) zil_commit_writer_stall(zilog); } } } @@ -3230,10 +3433,23 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) zil_prune_commit_list(zilog); zil_process_commit_list(zilog, zcw, &ilwbs); + /* + * If the ZIL failed somewhere inside zil_process_commit_list(), it's + * will be because a fallback to txg_wait_sync_flags() happened at some + * point (eg zil_commit_writer_stall()). All cases should issue and + * empty ilwbs, so there will be nothing to in the issue loop below. + * That's why we don't have to plumb the error value back from + * zil_process_commit_list(), and don't have to skip it. + */ + IMPLY(zilog->zl_restart_txg > 0, list_is_empty(&ilwbs)); + out: mutex_exit(&zilog->zl_issuer_lock); - while ((lwb = list_remove_head(&ilwbs)) != NULL) - zil_lwb_write_issue(zilog, lwb); + int err = 0; + while ((lwb = list_remove_head(&ilwbs)) != NULL) { + if (err == 0) + err = zil_lwb_write_issue(zilog, lwb); + } list_destroy(&ilwbs); return (wtxg); } @@ -3326,7 +3542,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * hasn't been issued. 
*/ zil_burst_done(zilog); - lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); @@ -3402,7 +3618,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) * commit itxs. When this occurs, the commit waiters linked * off of these commit itxs will not be committed to an * lwb. Additionally, these commit waiters will not be - * marked done until zil_commit_waiter_skip() is called via + * marked done until zil_commit_waiter_done() is called via * zil_itxg_clean(). * * Thus, it's possible for this commit waiter (i.e. the @@ -3480,7 +3696,7 @@ zil_alloc_commit_waiter(void) list_link_init(&zcw->zcw_node); zcw->zcw_lwb = NULL; zcw->zcw_done = B_FALSE; - zcw->zcw_zio_error = 0; + zcw->zcw_error = 0; return (zcw); } @@ -3489,7 +3705,7 @@ static void zil_free_commit_waiter(zil_commit_waiter_t *zcw) { ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); + ASSERT0P(zcw->zcw_lwb); ASSERT3B(zcw->zcw_done, ==, B_TRUE); mutex_destroy(&zcw->zcw_lock); cv_destroy(&zcw->zcw_cv); @@ -3526,6 +3742,99 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) } /* + * Crash the ZIL. This is something like suspending, but abandons the ZIL + * without further IO until the wanted txg completes. No effort is made to + * close the on-disk chain or do any other on-disk work, as the pool may + * have suspended. zil_sync() will handle cleanup as normal and restart the + * ZIL once enough txgs have passed. + */ +static void +zil_crash(zilog_t *zilog) +{ + mutex_enter(&zilog->zl_lock); + + uint64_t txg = spa_syncing_txg(zilog->zl_spa); + uint64_t restart_txg = + spa_syncing_txg(zilog->zl_spa) + TXG_CONCURRENT_STATES; + + if (zilog->zl_restart_txg > 0) { + /* + * If the ZIL is already crashed, it's almost certainly because + * we lost a race involving multiple callers from + * zil_commit_impl(). + */ + + /* + * This sanity check is to support my understanding that in the + * event of multiple callers to zil_crash(), only one of them + * can possibly be in the codepath to issue lwbs; the rest + * should be calling from zil_commit_impl() after their waiters + * have completed. As I understand it, a second thread trying + * to issue will eventually wait on zl_issuer_lock, and then + * have no work to do and leave. + * + * If more lwbs had been created an issued between zil_crash() + * calls, then we probably just need to take those too, add + * them to the crash list and clean them up, but it complicates + * this function and I don't think it can happend. + */ + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + + mutex_exit(&zilog->zl_lock); + return; + } + + zilog->zl_restart_txg = restart_txg; + + /* + * Capture any live LWBs. Depending on the state of the pool they may + * represent in-flight IO that won't return for some time, and we want + * to make sure they don't get in the way of normal ZIL operation. + */ + ASSERT(list_is_empty(&zilog->zl_lwb_crash_list)); + list_move_tail(&zilog->zl_lwb_crash_list, &zilog->zl_lwb_list); + + /* + * Run through the LWB list; erroring all itxes and signalling error + * to all waiters. 
+ */ + for (lwb_t *lwb = list_head(&zilog->zl_lwb_crash_list); lwb != NULL; + lwb = list_next(&zilog->zl_lwb_crash_list, lwb)) { + ASSERT(!(lwb->lwb_flags & LWB_FLAG_CRASHED)); + lwb->lwb_flags |= LWB_FLAG_CRASHED; + + itx_t *itx; + while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) + zil_itx_destroy(itx, EIO); + + zil_commit_waiter_t *zcw; + while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { + mutex_enter(&zcw->zcw_lock); + zcw->zcw_lwb = NULL; + zcw->zcw_error = EIO; + zcw->zcw_done = B_TRUE; + cv_broadcast(&zcw->zcw_cv); + mutex_exit(&zcw->zcw_lock); + } + } + + /* + * Zero the ZIL header bp after the ZIL restarts. We'll free it in + * zil_clean() when we clean up the lwbs. + */ + zil_header_t *zh = zil_header_in_syncing_context(zilog); + BP_ZERO(&zh->zh_log); + + /* + * Mark this ZIL dirty on the next txg, so that zil_clean() will be + * called for cleanup. + */ + zilog_dirty(zilog, txg+1); + + mutex_exit(&zilog->zl_lock); +} + +/* * Commit ZFS Intent Log transactions (itxs) to stable storage. * * When writing ZIL transactions to the on-disk representation of the @@ -3640,9 +3949,17 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) * but the order in which they complete will be the same order in * which they were created. */ -void +static int zil_commit_impl(zilog_t *zilog, uint64_t foid); + +int zil_commit(zilog_t *zilog, uint64_t foid) { + return (zil_commit_flags(zilog, foid, ZIL_COMMIT_FAILMODE)); +} + +int +zil_commit_flags(zilog_t *zilog, uint64_t foid, zil_commit_flag_t flags) +{ /* * We should never attempt to call zil_commit on a snapshot for * a couple of reasons: @@ -3659,7 +3976,7 @@ zil_commit(zilog_t *zilog, uint64_t foid) ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE); if (zilog->zl_sync == ZFS_SYNC_DISABLED) - return; + return (0); if (!spa_writeable(zilog->zl_spa)) { /* @@ -3670,10 +3987,23 @@ zil_commit(zilog_t *zilog, uint64_t foid) * verifying that truth before we return to the caller. */ ASSERT(list_is_empty(&zilog->zl_lwb_list)); - ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); + ASSERT0P(zilog->zl_last_lwb_opened); for (int i = 0; i < TXG_SIZE; i++) - ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL); - return; + ASSERT0P(zilog->zl_itxg[i].itxg_itxs); + return (0); + } + + int err = 0; + + /* + * If the ZIL crashed, bypass it entirely, and rely on txg_wait_sync() + * to get the data out to disk. + */ + if (zilog->zl_restart_txg > 0) { + ZIL_STAT_BUMP(zilog, zil_commit_crash_count); + err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0, + TXG_WAIT_SUSPEND); + goto out; } /* @@ -3685,14 +4015,43 @@ zil_commit(zilog_t *zilog, uint64_t foid) */ if (zilog->zl_suspend > 0) { ZIL_STAT_BUMP(zilog, zil_commit_suspend_count); - txg_wait_synced(zilog->zl_dmu_pool, 0); - return; + err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0, + TXG_WAIT_SUSPEND); + if (err != 0) { + ASSERT3U(err, ==, ESHUTDOWN); + zil_crash(zilog); + } + goto out; } - zil_commit_impl(zilog, foid); + err = zil_commit_impl(zilog, foid); + +out: + if (err == 0) + return (0); + + /* + * The ZIL write failed and the pool is suspended. There's nothing else + * we can do except return or block. + */ + ASSERT3U(err, ==, ESHUTDOWN); + + /* + * Return error if failmode=continue or caller will handle directly. + */ + if (!(flags & ZIL_COMMIT_FAILMODE) || + spa_get_failmode(zilog->zl_spa) == ZIO_FAILURE_MODE_CONTINUE) + return (SET_ERROR(EIO)); + + /* + * Block until the pool returns. We assume that the data will make + * it out to disk in the end, and so return success. 
+ */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + return (0); } -void +static int zil_commit_impl(zilog_t *zilog, uint64_t foid) { ZIL_STAT_BUMP(zilog, zil_commit_count); @@ -3729,7 +4088,8 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); - if (zcw->zcw_zio_error != 0) { + int err = 0; + if (zcw->zcw_error != 0) { /* * If there was an error writing out the ZIL blocks that * this thread is waiting on, then we fallback to @@ -3741,13 +4101,29 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) ZIL_STAT_BUMP(zilog, zil_commit_error_count); DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); - txg_wait_synced(zilog->zl_dmu_pool, 0); + err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0, + TXG_WAIT_SUSPEND); } else if (wtxg != 0) { ZIL_STAT_BUMP(zilog, zil_commit_suspend_count); - txg_wait_synced(zilog->zl_dmu_pool, wtxg); + err = txg_wait_synced_flags(zilog->zl_dmu_pool, wtxg, + TXG_WAIT_SUSPEND); } zil_free_commit_waiter(zcw); + + if (err == 0) + return (0); + + /* + * ZIL write failed and pool failed in the fallback to + * txg_wait_synced_flags(). Right now we have no idea if the data is on + * disk and the pool is probably suspended so we have no idea when it's + * coming back. All we can do is shut down and return error to the + * caller. + */ + ASSERT3U(err, ==, ESHUTDOWN); + zil_crash(zilog); + return (err); } /* @@ -3773,7 +4149,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) mutex_enter(&zilog->zl_lock); - ASSERT(zilog->zl_stop_sync == 0); + ASSERT0(zilog->zl_stop_sync); if (*replayed_seq != 0) { ASSERT(zh->zh_replay_seq < *replayed_seq); @@ -3848,7 +4224,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag) offsetof(zil_commit_waiter_t, zcw_node)); avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); - mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lwb->lwb_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } @@ -3857,7 +4233,7 @@ zil_lwb_dest(void *vbuf, void *unused) { (void) unused; lwb_t *lwb = vbuf; - mutex_destroy(&lwb->lwb_vdev_lock); + mutex_destroy(&lwb->lwb_lock); avl_destroy(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); list_destroy(&lwb->lwb_itxs); @@ -3943,6 +4319,8 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); + list_create(&zilog->zl_lwb_crash_list, sizeof (lwb_t), + offsetof(lwb_t, lwb_node)); list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), offsetof(itx_t, itx_node)); @@ -3967,9 +4345,12 @@ zil_free(zilog_t *zilog) ASSERT0(zilog->zl_suspend); ASSERT0(zilog->zl_suspending); + ASSERT0(zilog->zl_restart_txg); ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); + ASSERT(list_is_empty(&zilog->zl_lwb_crash_list)); + list_destroy(&zilog->zl_lwb_crash_list); ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); list_destroy(&zilog->zl_itx_commit_list); @@ -4005,8 +4386,8 @@ zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums) { zilog_t *zilog = dmu_objset_zil(os); - ASSERT3P(zilog->zl_get_data, ==, NULL); - ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); + ASSERT0P(zilog->zl_get_data); + ASSERT0P(zilog->zl_last_lwb_opened); ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; @@ -4025,7 +4406,8 @@ zil_close(zilog_t *zilog) uint64_t txg; if (!dmu_objset_is_snapshot(zilog->zl_os)) { - zil_commit(zilog, 0); + if 
(zil_commit_flags(zilog, 0, ZIL_COMMIT_NOW) != 0) + txg_wait_synced(zilog->zl_dmu_pool, 0); } else { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT0(zilog->zl_dirty_max_txg); @@ -4074,7 +4456,7 @@ zil_close(zilog_t *zilog) if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + ASSERT0P(lwb->lwb_buf); zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); @@ -4126,6 +4508,17 @@ zil_suspend(const char *osname, void **cookiep) return (SET_ERROR(EBUSY)); } + if (zilog->zl_restart_txg > 0) { + /* + * ZIL crashed. It effectively _is_ suspended, but callers + * are usually trying to make sure it's empty on-disk, which + * we can't guarantee right now. + */ + mutex_exit(&zilog->zl_lock); + dmu_objset_rele(os, suspend_tag); + return (SET_ERROR(EBUSY)); + } + /* * Don't put a long hold in the cases where we can avoid it. This * is when there is no cookie so we are doing a suspend & resume @@ -4154,11 +4547,16 @@ zil_suspend(const char *osname, void **cookiep) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); - if (cookiep == NULL) + if (zilog->zl_restart_txg > 0) { + /* ZIL crashed while we were waiting. */ + zil_resume(os); + error = SET_ERROR(EBUSY); + } else if (cookiep == NULL) zil_resume(os); else *cookiep = os; - return (0); + + return (error); } /* @@ -4199,17 +4597,34 @@ zil_suspend(const char *osname, void **cookiep) * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be * LWB_STATE_FLUSH_DONE before returning. + * + * However, zil_commit_impl() itself can return an error if any of the + * lwbs fail, or the pool suspends in the fallback + * txg_wait_sync_flushed(), which affects what we do next, so we + * capture that error. */ - zil_commit_impl(zilog, 0); + error = zil_commit_impl(zilog, 0); + if (error == ESHUTDOWN) + /* zil_commit_impl() has called zil_crash() already */ + error = SET_ERROR(EBUSY); /* * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we * use txg_wait_synced() to ensure the data from the zilog has * migrated to the main pool before calling zil_destroy(). */ - txg_wait_synced(zilog->zl_dmu_pool, 0); + if (error == 0) { + error = txg_wait_synced_flags(zilog->zl_dmu_pool, 0, + TXG_WAIT_SUSPEND); + if (error != 0) { + ASSERT3U(error, ==, ESHUTDOWN); + zil_crash(zilog); + error = SET_ERROR(EBUSY); + } + } - zil_destroy(zilog, B_FALSE); + if (error == 0) + zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); zilog->zl_suspending = B_FALSE; @@ -4223,7 +4638,8 @@ zil_suspend(const char *osname, void **cookiep) zil_resume(os); else *cookiep = os; - return (0); + + return (error); } void @@ -4386,7 +4802,7 @@ zil_replay(objset_t *os, void *arg, zilog->zl_replay = B_TRUE; zilog->zl_replay_time = ddi_get_lbolt(); - ASSERT(zilog->zl_replay_blks == 0); + ASSERT0(zilog->zl_replay_blks); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg, B_TRUE); vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); |
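
The reworked block-size prediction in this change returns separate minimum and maximum estimates, keeping the smaller of the two largest remembered first-block minima only when that saves at least 50% of the space. The standalone C sketch below is illustrative only: it restates just the minimum-prediction rule outside the kernel, with an invented history array expressed in KiB; example_min_predict() is not a function from this patch, and ZIL_BURSTS is redefined locally to the depth implied by the "7 of 8 previous bursts" comment.

#include <stdio.h>

#define ZIL_BURSTS	8	/* history depth implied by "7 of 8 bursts" */

/*
 * Minimum-prediction rule from zil_lwb_predict(), restated standalone.
 * Values are in KiB for readability; the in-kernel code works in bytes
 * and also folds in the burst currently in progress, omitted here.
 */
static unsigned
example_min_predict(const unsigned prev_min[ZIL_BURSTS])
{
	unsigned m1 = 0, m2 = 0;

	/* Find the two largest remembered first-block minima. */
	for (int i = 0; i < ZIL_BURSTS; i++) {
		if (prev_min[i] >= m1) {
			m2 = m1;
			m1 = prev_min[i];
		} else if (prev_min[i] > m2) {
			m2 = prev_min[i];
		}
	}

	/*
	 * Covering 7 of 8 bursts means using m2, but only if the largest
	 * burst is at least twice as big -- otherwise the saving is under
	 * 50% and m1 is kept.
	 */
	return ((m1 < m2 * 2) ? m1 : m2);
}

int
main(void)
{
	unsigned hist[ZIL_BURSTS] = { 8, 12, 16, 12, 8, 96, 12, 16 };

	/* 96 >= 2 * 16, so the lone spike is ignored and 16 is printed. */
	printf("%u KiB\n", example_min_predict(hist));
	return (0);
}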
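Because zil_itx_destroy() now takes an error and forwards it to itx_callback, consumers that register a callback on their itxs learn why a record was dropped. The sketch below is a kernel-context illustration only: example_state_t and example_itx_done() are invented names, and the callback typedef itself lives in headers outside this diff; only the two-argument call shape is taken from the patch.

typedef struct example_state {
	void		*es_buf;
	size_t		es_len;
	uint64_t	*es_drop_counter;
} example_state_t;

static void
example_itx_done(void *arg, int err)
{
	example_state_t *es = arg;

	/*
	 * err is 0 for normal cleanup and non-zero (typically EIO) when
	 * the itx was discarded by zil_crash() or attached to an lwb that
	 * failed; in that case the data is only durable once its txg has
	 * synced.
	 */
	if (err != 0)
		atomic_inc_64(es->es_drop_counter);

	kmem_free(es->es_buf, es->es_len);
	kmem_free(es, sizeof (*es));
}

A consumer would attach this by setting itx_callback and itx_callback_data on the itx before assigning it, as existing users of those fields do.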
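Finally, zil_commit() now returns an error, and zil_commit_flags() lets a caller opt out of the failmode=wait blocking. The sketch below is illustrative only, mirroring the pattern zil_close() uses in this patch: example_sync_done() is an invented caller, while zil_commit_flags(), ZIL_COMMIT_NOW and txg_wait_synced() are the interfaces shown above.

static void
example_sync_done(zilog_t *zilog, uint64_t foid)
{
	/*
	 * ZIL_COMMIT_NOW skips the failmode-dependent blocking inside
	 * zil_commit_flags(), so a crashed ZIL on a suspended pool
	 * surfaces as EIO here instead of blocking until the pool
	 * resumes.
	 */
	if (zil_commit_flags(zilog, foid, ZIL_COMMIT_NOW) != 0) {
		/*
		 * The log write failed; the data is durable only once the
		 * open txg syncs, so fall back to waiting for it, exactly
		 * as zil_close() does above.
		 */
		txg_wait_synced(zilog->zl_dmu_pool, 0);
	}
}

Callers content with the property-driven behaviour keep calling zil_commit(), which blocks until the txg syncs under failmode=wait and returns EIO only under failmode=continue.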