Diffstat (limited to 'module/zfs')
 module/zfs/Makefile.in       |   9
 module/zfs/abd.c             |  17
 module/zfs/arc.c             |  73
 module/zfs/dmu_recv.c        |  48
 module/zfs/dmu_send.c        |   8
 module/zfs/dmu_tx.c          | 105
 module/zfs/dnode.c           |  43
 module/zfs/dsl_deadlist.c    |  26
 module/zfs/dsl_scan.c        |  56
 module/zfs/mmp.c             |   2
 module/zfs/spa.c             | 307
 module/zfs/spa_misc.c        |  30
 module/zfs/vdev.c            |  36
 module/zfs/vdev_indirect.c   |   2
 module/zfs/vdev_initialize.c |  66
 module/zfs/vdev_label.c      |  13
 module/zfs/vdev_rebuild.c    |  27
 module/zfs/vdev_trim.c       |  28
 module/zfs/zfs_ioctl.c       |   3
 module/zfs/zfs_vnops.c       |  10
 module/zfs/zil.c             |  60
 module/zfs/zio.c             |   2
22 files changed, 825 insertions, 146 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 653ea0da9bcc..d9b86890b5f5 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -154,4 +154,13 @@ ifeq ($(CONFIG_ALTIVEC),y) $(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec endif +ifeq ($(CONFIG_ARM64),y) +CFLAGS_REMOVE_vdev_raidz_math_aarch64_neon.o += -mgeneral-regs-only +CFLAGS_REMOVE_vdev_raidz_math_aarch64_neonx2.o += -mgeneral-regs-only +endif + +UBSAN_SANITIZE_zap_leaf.o := n +UBSAN_SANITIZE_zap_micro.o := n +UBSAN_SANITIZE_sa.o := n + include $(mfdir)/../os/linux/zfs/Makefile diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 8ee8e7e57420..754974a559b6 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -109,7 +109,6 @@ void abd_verify(abd_t *abd) { #ifdef ZFS_DEBUG - ASSERT3U(abd->abd_size, >, 0); ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | @@ -118,6 +117,7 @@ abd_verify(abd_t *abd) IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { + ASSERT3U(abd->abd_size, >, 0); ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL); } else if (abd_is_gang(abd)) { uint_t child_sizes = 0; @@ -130,6 +130,7 @@ abd_verify(abd_t *abd) } ASSERT3U(abd->abd_size, ==, child_sizes); } else { + ASSERT3U(abd->abd_size, >, 0); abd_verify_scatter(abd); } #endif @@ -369,7 +370,20 @@ abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) * will retain all the free_on_free settings after being * added to the parents list. */ +#ifdef ZFS_DEBUG + /* + * If cabd had abd_parent, we have to drop it here. We can't + * transfer it to pabd, nor we can clear abd_size leaving it. + */ + if (cabd->abd_parent != NULL) { + (void) zfs_refcount_remove_many( + &cabd->abd_parent->abd_children, + cabd->abd_size, cabd); + cabd->abd_parent = NULL; + } +#endif pabd->abd_size += cabd->abd_size; + cabd->abd_size = 0; list_move_tail(&ABD_GANG(pabd).abd_gang_chain, &ABD_GANG(cabd).abd_gang_chain); ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); @@ -407,7 +421,6 @@ abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) */ if (abd_is_gang(cabd)) { ASSERT(!list_link_active(&cabd->abd_gang_link)); - ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); return (abd_gang_add_gang(pabd, cabd, free_on_free)); } ASSERT(!abd_is_gang(cabd)); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 6900b6b134d9..1180853da038 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -946,7 +946,7 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ -static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, +static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ @@ -8415,7 +8415,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) static uint64_t l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size, dev_size, tsize; + uint64_t size; /* * Make sure our globals have meaningful values in case the user @@ -8432,18 +8432,23 @@ l2arc_write_size(l2arc_dev_t *dev) if (arc_warm == B_FALSE) size += l2arc_write_boost; + /* We need to add in the worst case scenario of log block overhead. 
*/ + size += l2arc_log_blk_overhead(size, dev); + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { + /* + * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) + * times the writesize, whichever is greater. + */ + size += MAX(64 * 1024 * 1024, + (size * l2arc_trim_ahead) / 100); + } + /* * Make sure the write size does not exceed the size of the cache * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ - dev_size = dev->l2ad_end - dev->l2ad_start; - tsize = size + l2arc_log_blk_overhead(size, dev); - if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) - tsize += MAX(64 * 1024 * 1024, - (tsize * l2arc_trim_ahead) / 100); - - if (tsize >= dev_size) { + if (size > dev->l2ad_end - dev->l2ad_start) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " @@ -8452,8 +8457,19 @@ l2arc_write_size(l2arc_dev_t *dev) dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; + if (l2arc_trim_ahead > 1) { + cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1"); + l2arc_trim_ahead = 1; + } + if (arc_warm == B_FALSE) size += l2arc_write_boost; + + size += l2arc_log_blk_overhead(size, dev); + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { + size += MAX(64 * 1024 * 1024, + (size * l2arc_trim_ahead) / 100); + } } return (size); @@ -9074,22 +9090,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) buflist = &dev->l2ad_buflist; - /* - * We need to add in the worst case scenario of log block overhead. - */ - distance += l2arc_log_blk_overhead(distance, dev); - if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { - /* - * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) - * times the write size, whichever is greater. - */ - distance += MAX(64 * 1024 * 1024, - (distance * l2arc_trim_ahead) / 100); - } - top: rerun = B_FALSE; - if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { + if (dev->l2ad_hand + distance > dev->l2ad_end) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands @@ -9283,7 +9286,7 @@ out: */ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); if (!dev->l2ad_first) - ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } } @@ -9549,7 +9552,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); - if ((write_asize + asize) > target_sz) { + /* + * If the allocated size of this buffer plus the max + * size for the pending log block exceeds the evicted + * target size, terminate writing buffers for this run. + */ + if (write_asize + asize + + sizeof (l2arc_log_blk_phys_t) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -9669,8 +9678,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) * arcstat_l2_{size,asize} kstats are updated * internally. */ - if (l2arc_log_blk_insert(dev, hdr)) - l2arc_log_blk_commit(dev, pio, cb); + if (l2arc_log_blk_insert(dev, hdr)) { + /* + * l2ad_hand will be adjusted in + * l2arc_log_blk_commit(). + */ + write_asize += + l2arc_log_blk_commit(dev, pio, cb); + } zio_nowait(wzio); } @@ -10820,7 +10835,7 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. 
*/ -static void +static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; @@ -10933,6 +10948,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; + + return (asize); } /* diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 98ca2b3bcec1..6eb1009a788b 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -71,6 +71,12 @@ int zfs_recv_write_batch_size = 1024 * 1024; static char *dmu_recv_tag = "dmu_recv_tag"; const char *recv_clone_name = "%recv"; +typedef enum { + ORNS_NO, + ORNS_YES, + ORNS_MAYBE +} or_need_sync_t; + static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len, void *buf); @@ -121,6 +127,9 @@ struct receive_writer_arg { uint8_t or_iv[ZIO_DATA_IV_LEN]; uint8_t or_mac[ZIO_DATA_MAC_LEN]; boolean_t or_byteorder; + + /* Keep track of DRR_FREEOBJECTS right after DRR_OBJECT_RANGE */ + or_need_sync_t or_need_sync; }; typedef struct dmu_recv_begin_arg { @@ -1524,17 +1533,19 @@ receive_handle_existing_object(const struct receive_writer_arg *rwa, } /* - * The dmu does not currently support decreasing nlevels - * or changing the number of dnode slots on an object. For - * non-raw sends, this does not matter and the new object - * can just use the previous one's nlevels. For raw sends, - * however, the structure of the received dnode (including - * nlevels and dnode slots) must match that of the send - * side. Therefore, instead of using dmu_object_reclaim(), - * we must free the object completely and call - * dmu_object_claim_dnsize() instead. + * The dmu does not currently support decreasing nlevels or changing + * indirect block size if there is already one, same as changing the + * number of of dnode slots on an object. For non-raw sends this + * does not matter and the new object can just use the previous one's + * parameters. For raw sends, however, the structure of the received + * dnode (including indirects and dnode slots) must match that of the + * send side. Therefore, instead of using dmu_object_reclaim(), we + * must free the object completely and call dmu_object_claim_dnsize() + * instead. */ - if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) || + if ((rwa->raw && ((doi->doi_indirection > 1 && + indblksz != doi->doi_metadata_block_size) || + drro->drr_nlevels < doi->doi_indirection)) || dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { err = dmu_free_long_object(rwa->os, drro->drr_object); if (err != 0) @@ -1658,10 +1669,22 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, /* object was freed and we are about to allocate a new one */ object_to_hold = DMU_NEW_OBJECT; } else { + /* + * If the only record in this range so far was DRR_FREEOBJECTS + * with at least one actually freed object, it's possible that + * the block will now be converted to a hole. We need to wait + * for the txg to sync to prevent races. 
+ */ + if (rwa->or_need_sync == ORNS_YES) + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + /* object is free and we are about to allocate a new one */ object_to_hold = DMU_NEW_OBJECT; } + /* Only relevant for the first object in the range */ + rwa->or_need_sync = ORNS_NO; + /* * If this is a multi-slot dnode there is a chance that this * object will expand into a slot that is already used by @@ -1856,6 +1879,9 @@ receive_freeobjects(struct receive_writer_arg *rwa, if (err != 0) return (err); + + if (rwa->or_need_sync == ORNS_MAYBE) + rwa->or_need_sync = ORNS_YES; } if (next_err != ESRCH) return (next_err); @@ -2298,6 +2324,8 @@ receive_object_range(struct receive_writer_arg *rwa, bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN); rwa->or_byteorder = byteorder; + rwa->or_need_sync = ORNS_MAYBE; + return (0); } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index cd9ecc07fd5c..0dd1ec210a1d 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -2797,6 +2797,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, } if (err == 0) { + owned = B_TRUE; err = zap_lookup(dspp.dp->dp_meta_objset, dspp.to_ds->ds_object, DS_FIELD_RESUME_TOGUID, 8, 1, @@ -2810,21 +2811,24 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, sizeof (dspp.saved_toname), dspp.saved_toname); } - if (err != 0) + /* Only disown if there was an error in the lookups */ + if (owned && (err != 0)) dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); kmem_strfree(name); } else { err = dsl_dataset_own(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); + if (err == 0) + owned = B_TRUE; } - owned = B_TRUE; } else { err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); } if (err != 0) { + /* Note: dsl dataset is not owned at this point */ dsl_pool_rele(dspp.dp, FTAG); return (err); } diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 1eed0526b51d..063934f39493 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -291,6 +291,53 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } static void +dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + dnode_t *dn = txh->txh_dnode; + int err = 0; + + if (len == 0) + return; + + (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); + + if (dn == NULL) + return; + + /* + * For i/o error checking, read the blocks that will be needed + * to perform the append; first level-0 block (if not aligned, i.e. + * if they are partial-block writes), no additional blocks are read. + */ + if (dn->dn_maxblkid == 0) { + if (off < dn->dn_datablksz && + (off > 0 || len < dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + uint64_t start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + + err = zio_wait(zio); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } +} + +static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { (void) zfs_refcount_add_many(&txh->txh_space_towrite, @@ -331,6 +378,42 @@ dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) } /* + * Should be used when appending to an object and the exact offset is unknown. + * The write must occur at or beyond the specified offset. 
Only the L0 block + * at provided offset will be prefetched. + */ +void +dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_APPEND, off, DMU_OBJECT_END); + if (txh != NULL) { + dmu_tx_count_append(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +void +dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END); + if (txh != NULL) { + dmu_tx_count_append(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +/* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and * this transaction will be able to use half of the pool space overhead @@ -638,6 +721,26 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) if (blkid == 0) match_offset = TRUE; break; + case THT_APPEND: + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) + match_offset = TRUE; + + /* + * THT_WRITE used for bonus and spill blocks. + */ + ASSERT(blkid != DMU_BONUS_BLKID && + blkid != DMU_SPILL_BLKID); + + /* + * They might have to increase nlevels, + * thus dirtying the new TLIBs. Or the + * might have to change the block size, + * thus dirying the new lvl=0 blk=0. + */ + if (blkid == 0) + match_offset = TRUE; + break; case THT_FREE: /* * We will dirty all the level 1 blocks in @@ -1421,6 +1524,8 @@ dmu_tx_fini(void) EXPORT_SYMBOL(dmu_tx_create); EXPORT_SYMBOL(dmu_tx_hold_write); EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode); +EXPORT_SYMBOL(dmu_tx_hold_append); +EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_free); EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_zap); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index ed75c3bdf698..efebc443a210 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1773,7 +1773,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots) } /* - * Checks if the dnode contains any uncommitted dirty records. + * Checks if the dnode itself is dirty, or is carrying any uncommitted records. + * It is important to check both conditions, as some operations (eg appending + * to a file) can dirty both as a single logical unit, but they are not synced + * out atomically, so checking one and not the other can result in an object + * appearing to be clean mid-way through a commit. + * + * Do not change this lightly! If you get it wrong, dmu_offset_next() can + * detect a hole where there is really data, leading to silent corruption. 
*/ boolean_t dnode_is_dirty(dnode_t *dn) @@ -1781,7 +1788,8 @@ dnode_is_dirty(dnode_t *dn) mutex_enter(&dn->dn_mtx); for (int i = 0; i < TXG_SIZE; i++) { - if (multilist_link_active(&dn->dn_dirty_link[i])) { + if (multilist_link_active(&dn->dn_dirty_link[i]) || + !list_is_empty(&dn->dn_dirty_records[i])) { mutex_exit(&dn->dn_mtx); return (B_TRUE); } @@ -1891,7 +1899,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs == dn->dn_indblkshift) ibs = 0; - if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) + if (size == dn->dn_datablksz && ibs == 0) return (0); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); @@ -1914,24 +1922,25 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs && dn->dn_nlevels != 1) goto fail; - /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); - if (err == 0) { - dbuf_new_size(db, size, tx); - } else if (err != ENOENT) { - goto fail; - } - - dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; + if (size != dn->dn_datablksz) { + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); + if (err == 0) { + dbuf_new_size(db, size, tx); + } else if (err != ENOENT) { + goto fail; + } + + dnode_setdblksz(dn, size); + dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size; + if (db) + dbuf_rele(db, FTAG); + } if (ibs) { dn->dn_indblkshift = ibs; - dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; + dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; } - /* release after we have fixed the blocksize in the dnode */ - if (db) - dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); return (0); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index d5fe2ee56804..9827eb14728d 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -859,7 +859,7 @@ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) { zap_cursor_t zc, pzc; - zap_attribute_t za, pza; + zap_attribute_t *za, *pza; dmu_buf_t *bonus; dsl_deadlist_phys_t *dlp; dmu_object_info_t doi; @@ -874,28 +874,31 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) return; } + za = kmem_alloc(sizeof (*za), KM_SLEEP); + pza = kmem_alloc(sizeof (*pza), KM_SLEEP); + mutex_enter(&dl->dl_lock); /* * Prefetch up to 128 deadlists first and then more as we progress. * The limit is a balance between ARC use and diminishing returns. 
*/ for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0; - (perror = zap_cursor_retrieve(&pzc, &pza)) == 0 && i < 128; + (perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128; zap_cursor_advance(&pzc), i++) { - dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer, - zfs_strtonum(pza.za_name, NULL)); + dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, + zfs_strtonum(pza->za_name, NULL)); } for (zap_cursor_init(&zc, dl->dl_os, obj); - (error = zap_cursor_retrieve(&zc, &za)) == 0; + (error = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { - uint64_t mintxg = zfs_strtonum(za.za_name, NULL); - dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); + uint64_t mintxg = zfs_strtonum(za->za_name, NULL); + dsl_deadlist_insert_bpobj(dl, za->za_first_integer, mintxg, tx); VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); if (perror == 0) { - dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer, - zfs_strtonum(pza.za_name, NULL)); + dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, + zfs_strtonum(pza->za_name, NULL)); zap_cursor_advance(&pzc); - perror = zap_cursor_retrieve(&pzc, &pza); + perror = zap_cursor_retrieve(&pzc, pza); } } VERIFY3U(error, ==, ENOENT); @@ -908,6 +911,9 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) bzero(dlp, sizeof (*dlp)); dmu_buf_rele(bonus, FTAG); mutex_exit(&dl->dl_lock); + + kmem_free(za, sizeof (*za)); + kmem_free(pza, sizeof (*pza)); } /* diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f3c639b0d04e..f0a851ff53a9 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -37,6 +37,7 @@ #include <sys/dmu_tx.h> #include <sys/dmu_objset.h> #include <sys/arc.h> +#include <sys/arc_impl.h> #include <sys/zap.h> #include <sys/zio.h> #include <sys/zfs_context.h> @@ -126,12 +127,21 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); -static uint64_t dsl_scan_count_data_disks(vdev_t *vd); +static uint64_t dsl_scan_count_data_disks(spa_t *spa); extern int zfs_vdev_async_write_active_min_dirty_percent; static int zfs_scan_blkstats = 0; /* + * 'zpool status' uses bytes processed per pass to report throughput and + * estimate time remaining. We define a pass to start when the scanning + * phase completes for a sequential resilver. Optionally, this value + * may be used to reset the pass statistics every N txgs to provide an + * estimated completion time based on currently observed performance. + */ +static uint_t zfs_scan_report_txgs = 0; + +/* * By default zfs will check to ensure it is not over the hard memory * limit before each txg. If finer-grained control of this is needed * this value can be set to 1 to enable checking before scanning each @@ -147,7 +157,7 @@ int zfs_scan_strict_mem_lim = B_FALSE; * overload the drives with I/O, since that is protected by * zfs_vdev_scrub_max_active. */ -unsigned long zfs_scan_vdev_limit = 4 << 20; +unsigned long zfs_scan_vdev_limit = 16 << 20; int zfs_scan_issue_strategy = 0; int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ @@ -450,11 +460,12 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) /* * Calculate the max number of in-flight bytes for pool-wide - * scanning operations (minimum 1MB). Limits for the issuing - * phase are done per top-level vdev and are handled separately. 
+ * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). + * Limits for the issuing phase are done per top-level vdev and + * are handled separately. */ - scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * - dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20); + scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, + zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); @@ -584,6 +595,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) } spa_scan_stat_init(spa); + vdev_scan_stat_init(spa->spa_root_vdev); + return (0); } @@ -742,6 +755,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) scn->scn_last_checkpoint = 0; scn->scn_checkpointing = B_FALSE; spa_scan_stat_init(spa); + vdev_scan_stat_init(spa->spa_root_vdev); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; @@ -2797,8 +2811,9 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) } static uint64_t -dsl_scan_count_data_disks(vdev_t *rvd) +dsl_scan_count_data_disks(spa_t *spa) { + vdev_t *rvd = spa->spa_root_vdev; uint64_t i, leaves = 0; for (i = 0; i < rvd->vdev_children; i++) { @@ -3638,6 +3653,16 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) } /* + * Disabled by default, set zfs_scan_report_txgs to report + * average performance over the last zfs_scan_report_txgs TXGs. + */ + if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 && + tx->tx_txg % zfs_scan_report_txgs == 0) { + scn->scn_issued_before_pass += spa->spa_scan_pass_issued; + spa_scan_stat_init(spa); + } + + /* * It is possible to switch from unsorted to sorted at any time, * but afterwards the scan will remain sorted unless reloaded from * a checkpoint after a reboot. @@ -3693,12 +3718,13 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) taskqid_t prefetch_tqid; /* - * Recalculate the max number of in-flight bytes for pool-wide - * scanning operations (minimum 1MB). Limits for the issuing - * phase are done per top-level vdev and are handled separately. + * Calculate the max number of in-flight bytes for pool-wide + * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). + * Limits for the issuing phase are done per top-level vdev and + * are handled separately. 
*/ - scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * - dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20); + scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, + zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); if (scnp->scn_ddt_bookmark.ddb_class <= scnp->scn_ddt_class_max) { @@ -3759,6 +3785,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (scn->scn_is_sorted) { scn->scn_checkpointing = B_TRUE; scn->scn_clearing = B_TRUE; + scn->scn_issued_before_pass += + spa->spa_scan_pass_issued; + spa_scan_stat_init(spa); } zfs_dbgmsg("scan complete txg %llu", (longlong_t)tx->tx_txg); @@ -4485,6 +4514,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, INT, ZMOD_RW, "Tunable to adjust bias towards more filled segments during scans"); +ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW, + "Tunable to report resilver performance over the last N txgs"); + ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, "Process all resilvers immediately"); /* END CSTYLED */ diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index f67a4eb22a2d..139bb0acd277 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -444,7 +444,7 @@ mmp_write_uberblock(spa_t *spa) uint64_t offset; hrtime_t lock_acquire_time = gethrtime(); - spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER); + spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER); lock_acquire_time = gethrtime() - lock_acquire_time; if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1ed79eed3e8b..81a6547896ac 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -33,6 +33,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. */ /* @@ -150,7 +151,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ -const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { +static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ @@ -1110,6 +1111,275 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) tqs->stqs_taskq = NULL; } +#ifdef _KERNEL +/* + * The READ and WRITE rows of zio_taskqs are configurable at module load time + * by setting zio_taskq_read or zio_taskq_write. + * + * Example (the defaults for READ and WRITE) + * zio_taskq_read='fixed,1,8 null scale null' + * zio_taskq_write='batch fixed,1,5 scale fixed,1,5' + * + * Each sets the entire row at a time. + * + * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number + * of threads per taskq. + * + * 'null' can only be set on the high-priority queues (queue selection for + * high-priority queues will fall back to the regular queue if the high-pri + * is NULL. + */ +static const char *const modes[ZTI_NMODES] = { + "fixed", "batch", "scale", "null" +}; + +/* Parse the incoming config string. 
Modifies cfg */ +static int +spa_taskq_param_set(zio_type_t t, char *cfg) +{ + int err = 0; + + zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; + + char *next = cfg, *tok, *c; + + /* + * Parse out each element from the string and fill `row`. The entire + * row has to be set at once, so any errors are flagged by just + * breaking out of this loop early. + */ + uint_t q; + for (q = 0; q < ZIO_TASKQ_TYPES; q++) { + /* `next` is the start of the config */ + if (next == NULL) + break; + + /* Eat up leading space */ + while (isspace(*next)) + next++; + if (*next == '\0') + break; + + /* Mode ends at space or end of string */ + tok = next; + next = strchr(tok, ' '); + if (next != NULL) *next++ = '\0'; + + /* Parameters start after a comma */ + c = strchr(tok, ','); + if (c != NULL) *c++ = '\0'; + + /* Match mode string */ + uint_t mode; + for (mode = 0; mode < ZTI_NMODES; mode++) + if (strcmp(tok, modes[mode]) == 0) + break; + if (mode == ZTI_NMODES) + break; + + /* Invalid canary */ + row[q].zti_mode = ZTI_NMODES; + + /* Per-mode setup */ + switch (mode) { + + /* + * FIXED is parameterised: number of queues, and number of + * threads per queue. + */ + case ZTI_MODE_FIXED: { + /* No parameters? */ + if (c == NULL || *c == '\0') + break; + + /* Find next parameter */ + tok = c; + c = strchr(tok, ','); + if (c == NULL) + break; + + /* Take digits and convert */ + unsigned long long nq; + if (!(isdigit(*tok))) + break; + err = ddi_strtoull(tok, &tok, 10, &nq); + /* Must succeed and also end at the next param sep */ + if (err != 0 || tok != c) + break; + + /* Move past the comma */ + tok++; + /* Need another number */ + if (!(isdigit(*tok))) + break; + /* Remember start to make sure we moved */ + c = tok; + + /* Take digits */ + unsigned long long ntpq; + err = ddi_strtoull(tok, &tok, 10, &ntpq); + /* Must succeed, and moved forward */ + if (err != 0 || tok == c || *tok != '\0') + break; + + /* + * sanity; zero queues/threads make no sense, and + * 16K is almost certainly more than anyone will ever + * need and avoids silly numbers like UINT32_MAX + */ + if (nq == 0 || nq >= 16384 || + ntpq == 0 || ntpq >= 16384) + break; + + const zio_taskq_info_t zti = ZTI_P(ntpq, nq); + row[q] = zti; + break; + } + + case ZTI_MODE_BATCH: { + const zio_taskq_info_t zti = ZTI_BATCH; + row[q] = zti; + break; + } + + case ZTI_MODE_SCALE: { + const zio_taskq_info_t zti = ZTI_SCALE; + row[q] = zti; + break; + } + + case ZTI_MODE_NULL: { + /* + * Can only null the high-priority queues; the general- + * purpose ones have to exist. + */ + if (q != ZIO_TASKQ_ISSUE_HIGH && + q != ZIO_TASKQ_INTERRUPT_HIGH) + break; + + const zio_taskq_info_t zti = ZTI_NULL; + row[q] = zti; + break; + } + + default: + break; + } + + /* Ensure we set a mode */ + if (row[q].zti_mode == ZTI_NMODES) + break; + } + + /* Didn't get a full row, fail */ + if (q < ZIO_TASKQ_TYPES) + return (SET_ERROR(EINVAL)); + + /* Eat trailing space */ + if (next != NULL) + while (isspace(*next)) + next++; + + /* If there's anything left over then fail */ + if (next != NULL && *next != '\0') + return (SET_ERROR(EINVAL)); + + /* Success! 
Copy it into the real config */ + for (q = 0; q < ZIO_TASKQ_TYPES; q++) + zio_taskqs[t][q] = row[q]; + + return (0); +} + +static int +spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) +{ + int pos = 0; + + /* Build paramater string from live config */ + const char *sep = ""; + for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { + const zio_taskq_info_t *zti = &zio_taskqs[t][q]; + if (zti->zti_mode == ZTI_MODE_FIXED) + pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, + modes[zti->zti_mode], zti->zti_count, + zti->zti_value); + else + pos += sprintf(&buf[pos], "%s%s", sep, + modes[zti->zti_mode]); + sep = " "; + } + + if (add_newline) + buf[pos++] = '\n'; + buf[pos] = '\0'; + + return (pos); +} + +#ifdef __linux__ +static int +spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); +} + +static int +spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); +} +#else +/* + * On FreeBSD load-time parameters can be set up before malloc() is available, + * so we have to do all the parsing work on the stack. + */ +#define SPA_TASKQ_PARAM_MAX (128) + +static int +spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err; + + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err || req->newptr == NULL) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); +} + +static int +spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err; + + (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err || req->newptr == NULL) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); +} +#endif +#endif /* _KERNEL */ + /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention @@ -6261,6 +6531,16 @@ spa_tryimport(nvlist_t *tryconfig) spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } + /* + * spa_import() relies on a pool config fetched by spa_try_import() + * for spare/cache devices. Import flags are not passed to + * spa_tryimport(), which makes it return early due to a missing log + * device and missing retrieving the cache device and spare eventually. + * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch + * the correct configuration regardless of the missing log device. 
+ */ + spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* @@ -6747,9 +7027,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - if (dsl_scan_resilvering(spa_get_dsl(spa))) + if (dsl_scan_resilvering(spa_get_dsl(spa)) || + dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); + } } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, @@ -6987,7 +7269,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent - * is a replacing vdev. + * is a replacing or a spare vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) @@ -7294,6 +7576,10 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); + } else if (cmd_type == POOL_INITIALIZE_UNINIT && + vd->vdev_initialize_thread != NULL) { + mutex_exit(&vd->vdev_initialize_lock); + return (SET_ERROR(EBUSY)); } switch (cmd_type) { @@ -7306,6 +7592,9 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, case POOL_INITIALIZE_SUSPEND: vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); break; + case POOL_INITIALIZE_UNINIT: + vdev_uninitialize(vd); + break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } @@ -8210,7 +8499,8 @@ spa_async_thread(void *arg) * If any devices are done replacing, detach them. 
*/ if (tasks & SPA_ASYNC_RESILVER_DONE || - tasks & SPA_ASYNC_REBUILD_DONE) { + tasks & SPA_ASYNC_REBUILD_DONE || + tasks & SPA_ASYNC_DETACH_SPARE) { spa_vdev_resilver_done(spa); } @@ -9986,4 +10276,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); + +#ifdef _KERNEL +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, + spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD, + "Configure IO queues for read IO"); +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, + spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD, + "Configure IO queues for write IO"); +#endif /* END CSTYLED */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index a57f0727db31..113943026d59 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -494,8 +494,9 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) return (1); } -void -spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +static void +spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, + int mmp_flag) { (void) tag; int wlocks_held = 0; @@ -510,7 +511,8 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { - while (scl->scl_writer || scl->scl_write_wanted) { + while (scl->scl_writer || + (!mmp_flag && scl->scl_write_wanted)) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { @@ -529,6 +531,27 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) } void +spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +{ + spa_config_enter_impl(spa, locks, tag, rw, 0); +} + +/* + * The spa_config_enter_mmp() allows the mmp thread to cut in front of + * outstanding write lock requests. This is needed since the mmp updates are + * time sensitive and failure to service them promptly will result in a + * suspended pool. This pool suspension has been seen in practice when there is + * a single disk in a pool that is responding slowly and presumably about to + * fail. + */ + +void +spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) +{ + spa_config_enter_impl(spa, locks, tag, rw, 1); +} + +void spa_config_exit(spa_t *spa, int locks, const void *tag) { (void) tag; @@ -2564,7 +2587,6 @@ spa_scan_stat_init(spa_t *spa) spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; - vdev_scan_stat_init(spa->spa_root_vdev); } /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 4b9d7e7c0506..57259b8ce88e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -28,7 +28,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. - * Copyright [2021] Hewlett Packard Enterprise Development LP + * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include <sys/zfs_context.h> @@ -2646,6 +2646,17 @@ vdev_reopen(vdev_t *vd) } /* + * Recheck if resilver is still needed and cancel any + * scheduled resilver if resilver is unneeded. + */ + if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && + spa->spa_async_tasks & SPA_ASYNC_RESILVER) { + mutex_enter(&spa->spa_async_lock); + spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; + mutex_exit(&spa->spa_async_lock); + } + + /* * Reassess parent vdev's health. 
*/ vdev_propagate_state(vd); @@ -3983,11 +3994,18 @@ vdev_remove_wanted(spa_t *spa, uint64_t guid) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); /* - * If the vdev is already removed, then don't do anything. + * If the vdev is already removed, or expanding which can trigger + * repartition add/remove events, then don't do anything. */ - if (vd->vdev_removed) + if (vd->vdev_removed || vd->vdev_expanding) return (spa_vdev_state_exit(spa, NULL, 0)); + /* + * Confirm the vdev has been removed, otherwise don't do anything. + */ + if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); + vd->vdev_remove_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_REMOVE); @@ -4085,9 +4103,19 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && - vd->vdev_state >= VDEV_STATE_DEGRADED)) + vd->vdev_state >= VDEV_STATE_DEGRADED)) { spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); + /* + * Asynchronously detach spare vdev if resilver or + * rebuild is not required + */ + if (vd->vdev_unspare && + !dsl_scan_resilvering(spa->spa_dsl_pool) && + !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && + !vdev_rebuild_active(tvd)) + spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); + } return (spa_vdev_state_exit(spa, vd, 0)); } diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 8762855d46aa..9e4c115f212c 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -270,7 +270,7 @@ typedef struct indirect_split { */ indirect_child_t *is_good_child; - indirect_child_t is_child[1]; /* variable-length */ + indirect_child_t is_child[]; } indirect_split_t; /* diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 6ffd0d618fdd..5d90fd67cc2f 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -101,6 +101,39 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) } static void +vdev_initialize_zap_remove_sync(void *arg, dmu_tx_t *tx) +{ + uint64_t guid = *(uint64_t *)arg; + + kmem_free(arg, sizeof (uint64_t)); + + vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + return; + + ASSERT3S(vd->vdev_initialize_state, ==, VDEV_INITIALIZE_NONE); + ASSERT3U(vd->vdev_leaf_zap, !=, 0); + + vd->vdev_initialize_last_offset = 0; + vd->vdev_initialize_action_time = 0; + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + int error; + + error = zap_remove(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, tx); + VERIFY(error == 0 || error == ENOENT); + + error = zap_remove(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_STATE, tx); + VERIFY(error == 0 || error == ENOENT); + + error = zap_remove(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, tx); + VERIFY(error == 0 || error == ENOENT); +} + +static void vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) { ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); @@ -127,8 +160,14 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, - guid, tx); + + if (new_state != VDEV_INITIALIZE_NONE) { + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_update_sync, guid, tx); + } else { + 
dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_remove_sync, guid, tx); + } switch (new_state) { case VDEV_INITIALIZE_ACTIVE: @@ -149,6 +188,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) spa_history_log_internal(spa, "initialize", tx, "vdev=%s complete", vd->vdev_path); break; + case VDEV_INITIALIZE_NONE: + spa_history_log_internal(spa, "uninitialize", tx, + "vdev=%s", vd->vdev_path); + break; default: panic("invalid state %llu", (unsigned long long)new_state); } @@ -605,6 +648,24 @@ vdev_initialize(vdev_t *vd) } /* + * Uninitializes a device. Caller must hold vdev_initialize_lock. + * Device must be a leaf and not already be initializing. + */ +void +vdev_uninitialize(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_initialize_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + + vdev_initialize_change_state(vd, VDEV_INITIALIZE_NONE); +} + +/* * Wait for the initialize thread to be terminated (cancelled or stopped). */ static void @@ -760,6 +821,7 @@ vdev_initialize_restart(vdev_t *vd) } EXPORT_SYMBOL(vdev_initialize); +EXPORT_SYMBOL(vdev_uninitialize); EXPORT_SYMBOL(vdev_initialize_stop); EXPORT_SYMBOL(vdev_initialize_stop_all); EXPORT_SYMBOL(vdev_initialize_stop_wait); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index ec6bbc6fc610..277c14ec1ad7 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -468,6 +468,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_isspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + if (flags & VDEV_CONFIG_L2CACHE) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && vd == vd->vdev_top) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, @@ -1100,6 +1103,16 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) POOL_STATE_L2CACHE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); + + /* + * This is merely to facilitate reporting the ashift of the + * cache device through zdb. The actual retrieval of the + * ashift (in vdev_alloc()) uses the nvlist + * spa->spa_l2cache->sav_config (populated in + * spa_ld_open_aux_vdevs()). + */ + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, + vd->vdev_ashift) == 0); } else { uint64_t txg = 0ULL; diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 9dfbe0cf6f30..b180fa14682e 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -34,6 +34,7 @@ #include <sys/zio.h> #include <sys/dmu_tx.h> #include <sys/arc.h> +#include <sys/arc_impl.h> #include <sys/zap.h> /* @@ -116,13 +117,12 @@ unsigned long zfs_rebuild_max_segment = 1024 * 1024; * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep * the queue depth short. * - * 32MB was selected as the default value to achieve good performance with - * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential - * rebuild was unable to saturate all of the drives using smaller values. - * With a value of 32MB the sequential resilver write rate was measured at - * 800MB/s sustained while rebuilding to a distributed spare. + * 64MB was observed to deliver the best performance and set as the default. 
+ * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c) + * and a rebuild rate of 1.2GB/s was measured to the distribute spare. + * Smaller values were unable to fully saturate the available pool I/O. */ -unsigned long zfs_rebuild_vdev_limit = 32 << 20; +unsigned long zfs_rebuild_vdev_limit = 64 << 20; /* * Automatically start a pool scrub when the last active sequential resilver @@ -754,6 +754,7 @@ vdev_rebuild_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; int error = 0; /* @@ -786,9 +787,6 @@ vdev_rebuild_thread(void *arg) vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; - vr->vr_bytes_inflight_max = MAX(1ULL << 20, - zfs_rebuild_vdev_limit * vd->vdev_children); - uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); @@ -805,6 +803,17 @@ vdev_rebuild_thread(void *arg) vr->vr_scan_msp = msp; /* + * Calculate the max number of in-flight bytes for top-level + * vdev scanning operations (minimum 1MB, maximum 1/4 of + * arc_c_max shared by all top-level vdevs). Limits for the + * issuing phase are done per top-level vdev and are handled + * separately. + */ + uint64_t limit = (arc_c_max / 4) / MAX(rvd->vdev_children, 1); + vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20, + zfs_rebuild_vdev_limit * vd->vdev_children)); + + /* * Removal of vdevs from the vdev tree may eliminate the need * for the rebuild, in which case it should be canceled. The * vdev_rebuild_cancel_wanted flag is set until the sync task diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 92daed48f3d5..c0ce2ac28dc5 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -23,6 +23,7 @@ * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + * Copyright 2023 RackTop Systems, Inc. */ #include <sys/spa.h> @@ -572,6 +573,7 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t extent_bytes_max = ta->trim_extent_bytes_max; uint64_t extent_bytes_min = ta->trim_extent_bytes_min; spa_t *spa = vd->vdev_spa; + int error = 0; ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; @@ -591,19 +593,32 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1; for (uint64_t w = 0; w < writes_required; w++) { - int error; - error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + rs_get_start(rs, ta->trim_tree) + (w *extent_bytes_max), MIN(size - (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { - return (error); + goto done; } } } - return (0); +done: + /* + * Make sure all TRIMs for this metaslab have completed before + * returning. TRIM zios have lower priority over regular or syncing + * zios, so all TRIM zios for this metaslab must complete before the + * metaslab is re-enabled. Otherwise it's possible write zios to + * this metaslab could cut ahead of still queued TRIM zios for this + * metaslab causing corruption if the ranges overlap. 
+ */ + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[0] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + return (error); } static void @@ -922,11 +937,6 @@ vdev_trim_thread(void *arg) } spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_enter(&vd->vdev_trim_io_lock); - while (vd->vdev_trim_inflight[0] > 0) { - cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); - } - mutex_exit(&vd->vdev_trim_io_lock); range_tree_destroy(ta.trim_tree); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index a4b391cbea12..f441328f3018 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3985,7 +3985,8 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) if (!(cmd_type == POOL_INITIALIZE_CANCEL || cmd_type == POOL_INITIALIZE_START || - cmd_type == POOL_INITIALIZE_SUSPEND)) { + cmd_type == POOL_INITIALIZE_SUSPEND || + cmd_type == POOL_INITIALIZE_UNINIT)) { return (SET_ERROR(EINVAL)); } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index b9498d17ee2f..0987fd0f7bb7 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -68,7 +68,9 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); + atomic_inc_32(&zp->z_sync_writes_cnt); zil_commit(zfsvfs->z_log, zp->z_id); + atomic_dec_32(&zp->z_sync_writes_cnt); ZFS_EXIT(zfsvfs); } tsd_set(zfs_fsyncer_key, NULL); @@ -102,7 +104,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) hole = B_FALSE; /* Flush any mmap()'d data to disk */ - if (zn_has_cached_data(zp)) + if (zn_has_cached_data(zp, 0, file_sz - 1)) zn_flush_cached_data(zp, B_FALSE); lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); @@ -275,7 +277,8 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) error = mappedread_sf(zp, nbytes, uio); else #endif - if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { + if (zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) { error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), @@ -686,7 +689,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } - if (tx_bytes && zn_has_cached_data(zp) && + if (tx_bytes && + zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && !(ioflag & O_DIRECT)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } diff --git a/module/zfs/zil.c b/module/zfs/zil.c index aaf509a2fc73..a4f7c008935d 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -226,11 +226,10 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) */ static int zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, - blkptr_t *nbp, void *dst, char **end) + blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; @@ -247,7 +246,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, - &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; @@ -262,23 +261,23 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const 
blkptr_t *bp, */ cksum.zc_word[ZIL_ZC_SEQ]++; + uint64_t size = BP_GET_LSIZE(bp); if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = abuf->b_data; + zil_chain_t *zilc = (*abuf)->b_data; char *lr = (char *)(zilc + 1); - uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { + sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + zilc->zc_nused < sizeof (*zilc) || + zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); } else { - ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); - bcopy(lr, dst, len); - *end = (char *)dst + len; + *begin = lr; + *end = lr + zilc->zc_nused - sizeof (*zilc); *nbp = zilc->zc_next_blk; } } else { - char *lr = abuf->b_data; - uint64_t size = BP_GET_LSIZE(bp); + char *lr = (*abuf)->b_data; zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, @@ -286,15 +285,11 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { - ASSERT3U(zilc->zc_nused, <=, - SPA_OLD_MAXBLOCKSIZE); - bcopy(lr, dst, zilc->zc_nused); - *end = (char *)dst + zilc->zc_nused; + *begin = lr; + *end = lr + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } - - arc_buf_destroy(abuf, &abuf); } return (error); @@ -362,7 +357,6 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk; - char *lrbuf, *lrp; int error = 0; bzero(&next_blk, sizeof (blkptr_t)); @@ -382,13 +376,13 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. 
*/ - lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; - char *end = NULL; + char *lrp, *end; + arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) break; @@ -404,8 +398,10 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, break; error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, - lrbuf, &end); + &lrp, &end, &abuf); if (error != 0) { + if (abuf) + arc_buf_destroy(abuf, &abuf); if (claimed) { char name[ZFS_MAX_DATASET_NAME_LEN]; @@ -418,20 +414,25 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, break; } - for (lrp = lrbuf; lrp < end; lrp += reclen) { + for (; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); - if (lr->lrc_seq > claim_lr_seq) + if (lr->lrc_seq > claim_lr_seq) { + arc_buf_destroy(abuf, &abuf); goto done; + } error = parse_lr_func(zilog, lr, arg, txg); - if (error != 0) + if (error != 0) { + arc_buf_destroy(abuf, &abuf); goto done; + } ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } + arc_buf_destroy(abuf, &abuf); } done: zilog->zl_parse_error = error; @@ -441,7 +442,6 @@ done: zilog->zl_parse_lr_count = lr_count; zil_bp_tree_fini(zilog); - zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); return (error); } @@ -1593,6 +1593,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); ASSERT3U(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); + wsz = lwb->lwb_write_zio->io_size; } else { wsz = lwb->lwb_sz; @@ -2848,7 +2849,14 @@ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + /* + * Since we are not going to create any new dirty data, and we + * can even help with clearing the existing dirty data, we + * should not be subject to the dirty data based delays. We + * use TXG_NOTHROTTLE to bypass the delay mechanism. + */ + VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 700f8791045f..c367ef7211aa 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2287,7 +2287,7 @@ zio_nowait(zio_t *zio) ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && - zio_unique_parent(zio) == NULL) { + list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* |
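
Editor's note: dmu_tx.c gains dmu_tx_hold_append() and dmu_tx_hold_append_by_dnode() for transactions that will write at or beyond a given offset without knowing the exact final offset at hold time. A minimal, hypothetical in-kernel caller might look like the sketch below; append_record() and its error handling are illustrative only and not part of this change.

#include <sys/dmu.h>
#include <sys/dmu_tx.h>

/*
 * Illustrative sketch only: reserve space for an append when the exact
 * final offset is not known at hold time.  The hold covers the range
 * [off, DMU_OBJECT_END), so the eventual write may land at or beyond
 * 'off'; only the level-0 block at 'off' is prefetched for i/o error
 * checking.
 */
static int
append_record(objset_t *os, uint64_t object, uint64_t off,
    const void *buf, int len)
{
	dmu_tx_t *tx = dmu_tx_create(os);

	dmu_tx_hold_append(tx, object, off, len);

	int err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_write(os, object, off, len, buf, tx);
	dmu_tx_commit(tx);
	return (0);
}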
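
Editor's note: zfs_ioctl.c, spa.c, and vdev_initialize.c together add an "uninitialize" command (POOL_INITIALIZE_UNINIT, vdev_uninitialize(), vdev_initialize_zap_remove_sync()) that clears a leaf vdev's initialize state from its ZAP. A hypothetical user-space caller via libzfs_core is sketched below; it assumes POOL_INITIALIZE_UNINIT is exported through sys/fs/zfs.h, and the zpool(8) command plumbing is outside this diff.

#include <libzfs_core.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>

/*
 * Illustrative sketch only: ask the kernel to drop the initialize state
 * for one leaf vdev.  lzc_initialize() takes a map of vdev name -> boolean
 * and returns per-vdev errors in 'errlist'.
 */
static int
uninit_vdev(const char *pool, const char *vdev_path)
{
	nvlist_t *vdevs = fnvlist_alloc();
	nvlist_t *errlist = NULL;
	int err;

	fnvlist_add_boolean(vdevs, vdev_path);

	/* POOL_INITIALIZE_UNINIT is assumed to be visible to user space. */
	err = lzc_initialize(pool, POOL_INITIALIZE_UNINIT, vdevs, &errlist);

	if (errlist != NULL)
		fnvlist_free(errlist);
	fnvlist_free(vdevs);
	return (err);
}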