Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/Makefile.in       |    9
-rw-r--r--  module/zfs/abd.c             |   17
-rw-r--r--  module/zfs/arc.c             |   73
-rw-r--r--  module/zfs/dmu_recv.c        |   48
-rw-r--r--  module/zfs/dmu_send.c        |    8
-rw-r--r--  module/zfs/dmu_tx.c          |  105
-rw-r--r--  module/zfs/dnode.c           |   43
-rw-r--r--  module/zfs/dsl_deadlist.c    |   26
-rw-r--r--  module/zfs/dsl_scan.c        |   56
-rw-r--r--  module/zfs/mmp.c             |    2
-rw-r--r--  module/zfs/spa.c             |  307
-rw-r--r--  module/zfs/spa_misc.c        |   30
-rw-r--r--  module/zfs/vdev.c            |   36
-rw-r--r--  module/zfs/vdev_indirect.c   |    2
-rw-r--r--  module/zfs/vdev_initialize.c |   66
-rw-r--r--  module/zfs/vdev_label.c      |   13
-rw-r--r--  module/zfs/vdev_rebuild.c    |   27
-rw-r--r--  module/zfs/vdev_trim.c       |   28
-rw-r--r--  module/zfs/zfs_ioctl.c       |    3
-rw-r--r--  module/zfs/zfs_vnops.c       |   10
-rw-r--r--  module/zfs/zil.c             |   60
-rw-r--r--  module/zfs/zio.c             |    2
22 files changed, 825 insertions, 146 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index 653ea0da9bcc..d9b86890b5f5 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -154,4 +154,13 @@ ifeq ($(CONFIG_ALTIVEC),y)
$(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec
endif
+ifeq ($(CONFIG_ARM64),y)
+CFLAGS_REMOVE_vdev_raidz_math_aarch64_neon.o += -mgeneral-regs-only
+CFLAGS_REMOVE_vdev_raidz_math_aarch64_neonx2.o += -mgeneral-regs-only
+endif
+
+UBSAN_SANITIZE_zap_leaf.o := n
+UBSAN_SANITIZE_zap_micro.o := n
+UBSAN_SANITIZE_sa.o := n
+
include $(mfdir)/../os/linux/zfs/Makefile
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index 8ee8e7e57420..754974a559b6 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -109,7 +109,6 @@ void
abd_verify(abd_t *abd)
{
#ifdef ZFS_DEBUG
- ASSERT3U(abd->abd_size, >, 0);
ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
@@ -118,6 +117,7 @@ abd_verify(abd_t *abd)
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) {
+ ASSERT3U(abd->abd_size, >, 0);
ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL);
} else if (abd_is_gang(abd)) {
uint_t child_sizes = 0;
@@ -130,6 +130,7 @@ abd_verify(abd_t *abd)
}
ASSERT3U(abd->abd_size, ==, child_sizes);
} else {
+ ASSERT3U(abd->abd_size, >, 0);
abd_verify_scatter(abd);
}
#endif
@@ -369,7 +370,20 @@ abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
* will retain all the free_on_free settings after being
* added to the parent's list.
*/
+#ifdef ZFS_DEBUG
+ /*
+ * If cabd had abd_parent, we have to drop it here. We can't
+ * transfer it to pabd, nor can we clear abd_size while leaving it.
+ */
+ if (cabd->abd_parent != NULL) {
+ (void) zfs_refcount_remove_many(
+ &cabd->abd_parent->abd_children,
+ cabd->abd_size, cabd);
+ cabd->abd_parent = NULL;
+ }
+#endif
pabd->abd_size += cabd->abd_size;
+ cabd->abd_size = 0;
list_move_tail(&ABD_GANG(pabd).abd_gang_chain,
&ABD_GANG(cabd).abd_gang_chain);
ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
@@ -407,7 +421,6 @@ abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
*/
if (abd_is_gang(cabd)) {
ASSERT(!list_link_active(&cabd->abd_gang_link));
- ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
return (abd_gang_add_gang(pabd, cabd, free_on_free));
}
ASSERT(!abd_is_gang(cabd));
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 6900b6b134d9..1180853da038 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -946,7 +946,7 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
l2arc_dev_t *dev);
/* L2ARC persistence write I/O routines. */
-static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
l2arc_write_callback_t *cb);
/* L2ARC persistence auxiliary routines. */
@@ -8415,7 +8415,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
static uint64_t
l2arc_write_size(l2arc_dev_t *dev)
{
- uint64_t size, dev_size, tsize;
+ uint64_t size;
/*
* Make sure our globals have meaningful values in case the user
@@ -8432,18 +8432,23 @@ l2arc_write_size(l2arc_dev_t *dev)
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
+ /* We need to add in the worst case scenario of log block overhead. */
+ size += l2arc_log_blk_overhead(size, dev);
+ if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
+ /*
+ * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
+ * times the write size, whichever is greater.
+ */
+ size += MAX(64 * 1024 * 1024,
+ (size * l2arc_trim_ahead) / 100);
+ }
+
/*
* Make sure the write size does not exceed the size of the cache
* device. This is important in l2arc_evict(), otherwise infinite
* iteration can occur.
*/
- dev_size = dev->l2ad_end - dev->l2ad_start;
- tsize = size + l2arc_log_blk_overhead(size, dev);
- if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0)
- tsize += MAX(64 * 1024 * 1024,
- (tsize * l2arc_trim_ahead) / 100);
-
- if (tsize >= dev_size) {
+ if (size > dev->l2ad_end - dev->l2ad_start) {
cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
"plus the overhead of log blocks (persistent L2ARC, "
"%llu bytes) exceeds the size of the cache device "
@@ -8452,8 +8457,19 @@ l2arc_write_size(l2arc_dev_t *dev)
dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
+ if (l2arc_trim_ahead > 1) {
+ cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1");
+ l2arc_trim_ahead = 1;
+ }
+
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
+
+ size += l2arc_log_blk_overhead(size, dev);
+ if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
+ size += MAX(64 * 1024 * 1024,
+ (size * l2arc_trim_ahead) / 100);
+ }
}
return (size);
@@ -9074,22 +9090,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
buflist = &dev->l2ad_buflist;
- /*
- * We need to add in the worst case scenario of log block overhead.
- */
- distance += l2arc_log_blk_overhead(distance, dev);
- if (vd->vdev_has_trim && l2arc_trim_ahead > 0) {
- /*
- * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
- * times the write size, whichever is greater.
- */
- distance += MAX(64 * 1024 * 1024,
- (distance * l2arc_trim_ahead) / 100);
- }
-
top:
rerun = B_FALSE;
- if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
+ if (dev->l2ad_hand + distance > dev->l2ad_end) {
/*
* When there is no space to accommodate upcoming writes,
* evict to the end. Then bump the write and evict hands
@@ -9283,7 +9286,7 @@ out:
*/
ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
if (!dev->l2ad_first)
- ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
+ ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
}
}
@@ -9549,7 +9552,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
psize);
- if ((write_asize + asize) > target_sz) {
+ /*
+ * If the allocated size of this buffer plus the max
+ * size for the pending log block exceeds the evicted
+ * target size, terminate writing buffers for this run.
+ */
+ if (write_asize + asize +
+ sizeof (l2arc_log_blk_phys_t) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
break;
@@ -9669,8 +9678,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
* arcstat_l2_{size,asize} kstats are updated
* internally.
*/
- if (l2arc_log_blk_insert(dev, hdr))
- l2arc_log_blk_commit(dev, pio, cb);
+ if (l2arc_log_blk_insert(dev, hdr)) {
+ /*
+ * l2ad_hand will be adjusted in
+ * l2arc_log_blk_commit().
+ */
+ write_asize +=
+ l2arc_log_blk_commit(dev, pio, cb);
+ }
zio_nowait(wzio);
}
@@ -10820,7 +10835,7 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev)
* This function allocates some memory to temporarily hold the serialized
* buffer to be written. This is then released in l2arc_write_done.
*/
-static void
+static uint64_t
l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
{
l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
@@ -10933,6 +10948,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
dev->l2ad_log_ent_idx = 0;
dev->l2ad_log_blk_payload_asize = 0;
dev->l2ad_log_blk_payload_start = 0;
+
+ return (asize);
}
/*
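
As an aside, a minimal userspace model of the reworked l2arc_write_size() budget above; l2arc_log_blk_overhead() is replaced by a made-up placeholder and the constants are illustrative, so this is a sketch of the arithmetic rather than the actual ZFS code:

#include <stdint.h>
#include <stdio.h>

/* Placeholder for l2arc_log_blk_overhead(); the real value depends on how
 * many log entries a write of this size can generate. */
static uint64_t log_blk_overhead(uint64_t write_sz) { return (write_sz / 32); }

static uint64_t
l2arc_write_size_model(uint64_t write_max, uint64_t write_boost,
    int arc_warm, uint64_t trim_ahead_pct, int has_trim)
{
	uint64_t size = write_max;
	if (!arc_warm)
		size += write_boost;
	/* Fold the worst-case log block overhead into the budget... */
	size += log_blk_overhead(size);
	/* ...and the trim-ahead: 64MB or trim_ahead% of size, whichever is greater. */
	if (has_trim && trim_ahead_pct > 0) {
		uint64_t ta = (size * trim_ahead_pct) / 100;
		size += (ta > (64ULL << 20)) ? ta : (64ULL << 20);
	}
	return (size);
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)
	    l2arc_write_size_model(8ULL << 20, 8ULL << 20, 0, 200, 1));
	return (0);
}

Only after all of that is the total compared against the cache device capacity, which is why the check in the patch becomes a single comparison against l2ad_end - l2ad_start.
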
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 98ca2b3bcec1..6eb1009a788b 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -71,6 +71,12 @@ int zfs_recv_write_batch_size = 1024 * 1024;
static char *dmu_recv_tag = "dmu_recv_tag";
const char *recv_clone_name = "%recv";
+typedef enum {
+ ORNS_NO,
+ ORNS_YES,
+ ORNS_MAYBE
+} or_need_sync_t;
+
static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len,
void *buf);
@@ -121,6 +127,9 @@ struct receive_writer_arg {
uint8_t or_iv[ZIO_DATA_IV_LEN];
uint8_t or_mac[ZIO_DATA_MAC_LEN];
boolean_t or_byteorder;
+
+ /* Keep track of DRR_FREEOBJECTS right after DRR_OBJECT_RANGE */
+ or_need_sync_t or_need_sync;
};
typedef struct dmu_recv_begin_arg {
@@ -1524,17 +1533,19 @@ receive_handle_existing_object(const struct receive_writer_arg *rwa,
}
/*
- * The dmu does not currently support decreasing nlevels
- * or changing the number of dnode slots on an object. For
- * non-raw sends, this does not matter and the new object
- * can just use the previous one's nlevels. For raw sends,
- * however, the structure of the received dnode (including
- * nlevels and dnode slots) must match that of the send
- * side. Therefore, instead of using dmu_object_reclaim(),
- * we must free the object completely and call
- * dmu_object_claim_dnsize() instead.
+ * The dmu does not currently support decreasing nlevels or changing
+ * the indirect block size if there is already one, or changing the
+ * number of dnode slots on an object. For non-raw sends this
+ * does not matter and the new object can just use the previous one's
+ * parameters. For raw sends, however, the structure of the received
+ * dnode (including indirects and dnode slots) must match that of the
+ * send side. Therefore, instead of using dmu_object_reclaim(), we
+ * must free the object completely and call dmu_object_claim_dnsize()
+ * instead.
*/
- if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) ||
+ if ((rwa->raw && ((doi->doi_indirection > 1 &&
+ indblksz != doi->doi_metadata_block_size) ||
+ drro->drr_nlevels < doi->doi_indirection)) ||
dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) {
err = dmu_free_long_object(rwa->os, drro->drr_object);
if (err != 0)
@@ -1658,10 +1669,22 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
/* object was freed and we are about to allocate a new one */
object_to_hold = DMU_NEW_OBJECT;
} else {
+ /*
+ * If the only record in this range so far was DRR_FREEOBJECTS
+ * with at least one actually freed object, it's possible that
+ * the block will now be converted to a hole. We need to wait
+ * for the txg to sync to prevent races.
+ */
+ if (rwa->or_need_sync == ORNS_YES)
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+
/* object is free and we are about to allocate a new one */
object_to_hold = DMU_NEW_OBJECT;
}
+ /* Only relevant for the first object in the range */
+ rwa->or_need_sync = ORNS_NO;
+
/*
* If this is a multi-slot dnode there is a chance that this
* object will expand into a slot that is already used by
@@ -1856,6 +1879,9 @@ receive_freeobjects(struct receive_writer_arg *rwa,
if (err != 0)
return (err);
+
+ if (rwa->or_need_sync == ORNS_MAYBE)
+ rwa->or_need_sync = ORNS_YES;
}
if (next_err != ESRCH)
return (next_err);
@@ -2298,6 +2324,8 @@ receive_object_range(struct receive_writer_arg *rwa,
bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN);
rwa->or_byteorder = byteorder;
+ rwa->or_need_sync = ORNS_MAYBE;
+
return (0);
}
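
To make the new bookkeeping easier to follow, here is a condensed, hypothetical model of the or_need_sync transitions introduced above (the function names are illustrative, not part of the patch): DRR_OBJECT_RANGE arms the flag, a DRR_FREEOBJECTS that actually frees something confirms it, and the next object receive waits for the syncing txg only when it was confirmed.

#include <stdbool.h>

typedef enum { ORNS_NO, ORNS_YES, ORNS_MAYBE } or_need_sync_t;

/* A DRR_OBJECT_RANGE record starts a new range: a later free *might* matter. */
static or_need_sync_t
on_object_range(void)
{
	return (ORNS_MAYBE);
}

/* A DRR_FREEOBJECTS record that really freed an object confirms the hazard. */
static or_need_sync_t
on_freeobjects(or_need_sync_t s, bool freed_any)
{
	return ((s == ORNS_MAYBE && freed_any) ? ORNS_YES : s);
}

/* Before allocating the first object of the range, wait only when required;
 * the flag then drops back to ORNS_NO for the rest of the range. */
static bool
must_wait_for_sync(or_need_sync_t s)
{
	return (s == ORNS_YES);
}
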
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index cd9ecc07fd5c..0dd1ec210a1d 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -2797,6 +2797,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
}
if (err == 0) {
+ owned = B_TRUE;
err = zap_lookup(dspp.dp->dp_meta_objset,
dspp.to_ds->ds_object,
DS_FIELD_RESUME_TOGUID, 8, 1,
@@ -2810,21 +2811,24 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
sizeof (dspp.saved_toname),
dspp.saved_toname);
}
- if (err != 0)
+ /* Only disown if there was an error in the lookups */
+ if (owned && (err != 0))
dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
kmem_strfree(name);
} else {
err = dsl_dataset_own(dspp.dp, tosnap, dsflags,
FTAG, &dspp.to_ds);
+ if (err == 0)
+ owned = B_TRUE;
}
- owned = B_TRUE;
} else {
err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG,
&dspp.to_ds);
}
if (err != 0) {
+ /* Note: dsl dataset is not owned at this point */
dsl_pool_rele(dspp.dp, FTAG);
return (err);
}
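
The ownership fix above boils down to a common acquire/release pattern; the sketch below uses hypothetical stand-ins (ds_own/ds_disown/do_lookups are not real ZFS functions) to show the invariant being restored: mark the dataset owned only after a successful own, and disown on the error path only if it is actually owned.

#include <stdbool.h>

/* Hypothetical stand-ins for dsl_dataset_own()/dsl_dataset_disown(). */
extern int ds_own(void *ds, int flags, const void *tag);
extern void ds_disown(void *ds, int flags, const void *tag);
extern int do_lookups(void *ds);

static int
send_sketch(void *ds, int dsflags, const void *tag)
{
	bool owned = false;

	int err = ds_own(ds, dsflags, tag);
	if (err == 0)
		owned = true;		/* only now do we hold ownership */

	if (err == 0)
		err = do_lookups(ds);

	/* Disown only what we actually own, and only on the error path. */
	if (owned && err != 0)
		ds_disown(ds, dsflags, tag);

	return (err);
}
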
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 1eed0526b51d..063934f39493 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -291,6 +291,53 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
}
static void
+dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+ dnode_t *dn = txh->txh_dnode;
+ int err = 0;
+
+ if (len == 0)
+ return;
+
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);
+
+ if (dn == NULL)
+ return;
+
+ /*
+ * For i/o error checking, read the first level-0 block that the
+ * append will touch (only when the write is not block-aligned, i.e.
+ * it is a partial-block write); no additional blocks are read.
+ */
+ if (dn->dn_maxblkid == 0) {
+ if (off < dn->dn_datablksz &&
+ (off > 0 || len < dn->dn_datablksz)) {
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+ } else {
+ zio_t *zio = zio_root(dn->dn_objset->os_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ /* first level-0 block */
+ uint64_t start = off >> dn->dn_datablkshift;
+ if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, start);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+
+ err = zio_wait(zio);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+}
+
+static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
(void) zfs_refcount_add_many(&txh->txh_space_towrite,
@@ -331,6 +378,42 @@ dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
}
/*
+ * Should be used when appending to an object and the exact offset is unknown.
+ * The write must occur at or beyond the specified offset. Only the L0 block
+ * at provided offset will be prefetched.
+ */
+void
+dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT3U(len, <=, DMU_MAX_ACCESS);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_APPEND, off, DMU_OBJECT_END);
+ if (txh != NULL) {
+ dmu_tx_count_append(txh, off, len);
+ dmu_tx_count_dnode(txh);
+ }
+}
+
+void
+dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT3U(len, <=, DMU_MAX_ACCESS);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END);
+ if (txh != NULL) {
+ dmu_tx_count_append(txh, off, len);
+ dmu_tx_count_dnode(txh);
+ }
+}
+
+/*
* This function marks the transaction as being a "net free". The end
* result is that refquotas will be disabled for this transaction, and
* this transaction will be able to use half of the pool space overhead
@@ -638,6 +721,26 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
if (blkid == 0)
match_offset = TRUE;
break;
+ case THT_APPEND:
+ if (blkid >= beginblk && (blkid <= endblk ||
+ txh->txh_arg2 == DMU_OBJECT_END))
+ match_offset = TRUE;
+
+ /*
+ * THT_WRITE is used for bonus and spill blocks.
+ */
+ ASSERT(blkid != DMU_BONUS_BLKID &&
+ blkid != DMU_SPILL_BLKID);
+
+ /*
+ * They might have to increase nlevels,
+ * thus dirtying the new TLIBs. Or they
+ * might have to change the block size,
+ * thus dirtying the new lvl=0 blk=0.
+ */
+ if (blkid == 0)
+ match_offset = TRUE;
+ break;
case THT_FREE:
/*
* We will dirty all the level 1 blocks in
@@ -1421,6 +1524,8 @@ dmu_tx_fini(void)
EXPORT_SYMBOL(dmu_tx_create);
EXPORT_SYMBOL(dmu_tx_hold_write);
EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
+EXPORT_SYMBOL(dmu_tx_hold_append);
+EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_free);
EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_zap);
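
A hedged usage sketch of the new hold type (kernel context such as the objset_t handle is assumed; append_record() itself is hypothetical and not part of this patch): the caller knows only a lower bound for where the append will land, so it holds with dmu_tx_hold_append() instead of dmu_tx_hold_write().

/*
 * Sketch only: append when the exact offset is not known at open-tx time.
 * The write must land at or beyond eof_hint; only that L0 block is checked
 * for I/O errors, per dmu_tx_count_append() above.
 */
static int
append_record(objset_t *os, uint64_t object, uint64_t eof_hint,
    const void *buf, int len)
{
	dmu_tx_t *tx = dmu_tx_create(os);

	dmu_tx_hold_append(tx, object, eof_hint, len);

	int err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_write(os, object, eof_hint, len, buf, tx);
	dmu_tx_commit(tx);
	return (0);
}
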
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index ed75c3bdf698..efebc443a210 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1773,7 +1773,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
}
/*
- * Checks if the dnode contains any uncommitted dirty records.
+ * Checks if the dnode itself is dirty, or is carrying any uncommitted records.
+ * It is important to check both conditions, as some operations (e.g. appending
+ * to a file) can dirty both as a single logical unit, but they are not synced
+ * out atomically, so checking one and not the other can result in an object
+ * appearing to be clean mid-way through a commit.
+ *
+ * Do not change this lightly! If you get it wrong, dmu_offset_next() can
+ * detect a hole where there is really data, leading to silent corruption.
*/
boolean_t
dnode_is_dirty(dnode_t *dn)
@@ -1781,7 +1788,8 @@ dnode_is_dirty(dnode_t *dn)
mutex_enter(&dn->dn_mtx);
for (int i = 0; i < TXG_SIZE; i++) {
- if (multilist_link_active(&dn->dn_dirty_link[i])) {
+ if (multilist_link_active(&dn->dn_dirty_link[i]) ||
+ !list_is_empty(&dn->dn_dirty_records[i])) {
mutex_exit(&dn->dn_mtx);
return (B_TRUE);
}
@@ -1891,7 +1899,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
if (ibs == dn->dn_indblkshift)
ibs = 0;
- if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+ if (size == dn->dn_datablksz && ibs == 0)
return (0);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
@@ -1914,24 +1922,25 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
if (ibs && dn->dn_nlevels != 1)
goto fail;
- /* resize the old block */
- err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
- if (err == 0) {
- dbuf_new_size(db, size, tx);
- } else if (err != ENOENT) {
- goto fail;
- }
-
- dnode_setdblksz(dn, size);
dnode_setdirty(dn, tx);
- dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
+ if (size != dn->dn_datablksz) {
+ /* resize the old block */
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
+ if (err == 0) {
+ dbuf_new_size(db, size, tx);
+ } else if (err != ENOENT) {
+ goto fail;
+ }
+
+ dnode_setdblksz(dn, size);
+ dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size;
+ if (db)
+ dbuf_rele(db, FTAG);
+ }
if (ibs) {
dn->dn_indblkshift = ibs;
- dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+ dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
}
- /* release after we have fixed the blocksize in the dnode */
- if (db)
- dbuf_rele(db, FTAG);
rw_exit(&dn->dn_struct_rwlock);
return (0);
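
The warning above is aimed at dmu_offset_next()-style callers; as a purely illustrative sketch (not the actual dmu_offset_next() implementation), the safe pattern is to refuse to report hole/data information while the dnode is dirty and let the caller force a sync first:

/*
 * Illustrative only.  If the dnode is dirty in any txg, its on-disk block
 * pointers may not yet reflect a pending append, so hole information cannot
 * be trusted until the dirty state has synced out.
 */
static int
offset_next_sketch(dnode_t *dn, boolean_t *hole_info_valid)
{
	if (dnode_is_dirty(dn)) {
		/* caller should txg_wait_synced() and retry */
		*hole_info_valid = B_FALSE;
		return (SET_ERROR(EBUSY));
	}
	*hole_info_valid = B_TRUE;
	return (0);
}
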
diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c
index d5fe2ee56804..9827eb14728d 100644
--- a/module/zfs/dsl_deadlist.c
+++ b/module/zfs/dsl_deadlist.c
@@ -859,7 +859,7 @@ void
dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
{
zap_cursor_t zc, pzc;
- zap_attribute_t za, pza;
+ zap_attribute_t *za, *pza;
dmu_buf_t *bonus;
dsl_deadlist_phys_t *dlp;
dmu_object_info_t doi;
@@ -874,28 +874,31 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
return;
}
+ za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ pza = kmem_alloc(sizeof (*pza), KM_SLEEP);
+
mutex_enter(&dl->dl_lock);
/*
* Prefetch up to 128 deadlists first and then more as we progress.
* The limit is a balance between ARC use and diminishing returns.
*/
for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0;
- (perror = zap_cursor_retrieve(&pzc, &pza)) == 0 && i < 128;
+ (perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128;
zap_cursor_advance(&pzc), i++) {
- dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer,
- zfs_strtonum(pza.za_name, NULL));
+ dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
+ zfs_strtonum(pza->za_name, NULL));
}
for (zap_cursor_init(&zc, dl->dl_os, obj);
- (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ (error = zap_cursor_retrieve(&zc, za)) == 0;
zap_cursor_advance(&zc)) {
- uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
- dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
+ uint64_t mintxg = zfs_strtonum(za->za_name, NULL);
+ dsl_deadlist_insert_bpobj(dl, za->za_first_integer, mintxg, tx);
VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
if (perror == 0) {
- dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer,
- zfs_strtonum(pza.za_name, NULL));
+ dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
+ zfs_strtonum(pza->za_name, NULL));
zap_cursor_advance(&pzc);
- perror = zap_cursor_retrieve(&pzc, &pza);
+ perror = zap_cursor_retrieve(&pzc, pza);
}
}
VERIFY3U(error, ==, ENOENT);
@@ -908,6 +911,9 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
bzero(dlp, sizeof (*dlp));
dmu_buf_rele(bonus, FTAG);
mutex_exit(&dl->dl_lock);
+
+ kmem_free(za, sizeof (*za));
+ kmem_free(pza, sizeof (*pza));
}
/*
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index f3c639b0d04e..f0a851ff53a9 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -37,6 +37,7 @@
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
+#include <sys/arc_impl.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
@@ -126,12 +127,21 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
-static uint64_t dsl_scan_count_data_disks(vdev_t *vd);
+static uint64_t dsl_scan_count_data_disks(spa_t *spa);
extern int zfs_vdev_async_write_active_min_dirty_percent;
static int zfs_scan_blkstats = 0;
/*
+ * 'zpool status' uses bytes processed per pass to report throughput and
+ * estimate time remaining. We define a pass to start when the scanning
+ * phase completes for a sequential resilver. Optionally, this value
+ * may be used to reset the pass statistics every N txgs to provide an
+ * estimated completion time based on currently observed performance.
+ */
+static uint_t zfs_scan_report_txgs = 0;
+
+/*
* By default zfs will check to ensure it is not over the hard memory
* limit before each txg. If finer-grained control of this is needed
* this value can be set to 1 to enable checking before scanning each
@@ -147,7 +157,7 @@ int zfs_scan_strict_mem_lim = B_FALSE;
* overload the drives with I/O, since that is protected by
* zfs_vdev_scrub_max_active.
*/
-unsigned long zfs_scan_vdev_limit = 4 << 20;
+unsigned long zfs_scan_vdev_limit = 16 << 20;
int zfs_scan_issue_strategy = 0;
int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */
@@ -450,11 +460,12 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
/*
* Calculate the max number of in-flight bytes for pool-wide
- * scanning operations (minimum 1MB). Limits for the issuing
- * phase are done per top-level vdev and are handled separately.
+ * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
+ * Limits for the issuing phase are done per top-level vdev and
+ * are handled separately.
*/
- scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
- dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
+ scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
+ zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
offsetof(scan_ds_t, sds_node));
@@ -584,6 +595,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
}
spa_scan_stat_init(spa);
+ vdev_scan_stat_init(spa->spa_root_vdev);
+
return (0);
}
@@ -742,6 +755,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
scn->scn_last_checkpoint = 0;
scn->scn_checkpointing = B_FALSE;
spa_scan_stat_init(spa);
+ vdev_scan_stat_init(spa->spa_root_vdev);
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
@@ -2797,8 +2811,9 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
}
static uint64_t
-dsl_scan_count_data_disks(vdev_t *rvd)
+dsl_scan_count_data_disks(spa_t *spa)
{
+ vdev_t *rvd = spa->spa_root_vdev;
uint64_t i, leaves = 0;
for (i = 0; i < rvd->vdev_children; i++) {
@@ -3638,6 +3653,16 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
}
/*
+ * Disabled by default, set zfs_scan_report_txgs to report
+ * average performance over the last zfs_scan_report_txgs TXGs.
+ */
+ if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 &&
+ tx->tx_txg % zfs_scan_report_txgs == 0) {
+ scn->scn_issued_before_pass += spa->spa_scan_pass_issued;
+ spa_scan_stat_init(spa);
+ }
+
+ /*
* It is possible to switch from unsorted to sorted at any time,
* but afterwards the scan will remain sorted unless reloaded from
* a checkpoint after a reboot.
@@ -3693,12 +3718,13 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
taskqid_t prefetch_tqid;
/*
- * Recalculate the max number of in-flight bytes for pool-wide
- * scanning operations (minimum 1MB). Limits for the issuing
- * phase are done per top-level vdev and are handled separately.
+ * Calculate the max number of in-flight bytes for pool-wide
+ * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
+ * Limits for the issuing phase are done per top-level vdev and
+ * are handled separately.
*/
- scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
- dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
+ scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
+ zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
if (scnp->scn_ddt_bookmark.ddb_class <=
scnp->scn_ddt_class_max) {
@@ -3759,6 +3785,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
if (scn->scn_is_sorted) {
scn->scn_checkpointing = B_TRUE;
scn->scn_clearing = B_TRUE;
+ scn->scn_issued_before_pass +=
+ spa->spa_scan_pass_issued;
+ spa_scan_stat_init(spa);
}
zfs_dbgmsg("scan complete txg %llu",
(longlong_t)tx->tx_txg);
@@ -4485,6 +4514,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, INT, ZMOD_RW,
"Tunable to adjust bias towards more filled segments during scans");
+ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW,
+ "Tunable to report resilver performance over the last N txgs");
+
ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
"Process all resilvers immediately");
/* END CSTYLED */
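
A small standalone model of the new in-flight byte clamp (illustrative numbers only): the pool-wide limit is zfs_scan_vdev_limit per data disk, floored at 1MB and now capped at a quarter of arc_c_max.

#include <stdint.h>
#include <stdio.h>

#define	MiB	(1ULL << 20)
#define	GiB	(1ULL << 30)

static uint64_t
scan_maxinflight(uint64_t arc_c_max, uint64_t vdev_limit, uint64_t data_disks)
{
	uint64_t want = vdev_limit * data_disks;
	if (want < 1 * MiB)
		want = 1 * MiB;		/* floor */
	if (want > arc_c_max / 4)
		want = arc_c_max / 4;	/* new cap */
	return (want);
}

int
main(void)
{
	/* 16MB per disk * 90 data disks = 1440MB, capped at arc_c_max/4 = 1GB. */
	printf("%llu MiB\n", (unsigned long long)
	    (scan_maxinflight(4 * GiB, 16 * MiB, 90) / MiB));
	return (0);
}
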
diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c
index f67a4eb22a2d..139bb0acd277 100644
--- a/module/zfs/mmp.c
+++ b/module/zfs/mmp.c
@@ -444,7 +444,7 @@ mmp_write_uberblock(spa_t *spa)
uint64_t offset;
hrtime_t lock_acquire_time = gethrtime();
- spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
+ spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER);
lock_acquire_time = gethrtime() - lock_acquire_time;
if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 1ed79eed3e8b..81a6547896ac 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -33,6 +33,7 @@
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
*/
/*
@@ -150,7 +151,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
* need to be handled with minimum delay.
*/
-const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
@@ -1110,6 +1111,275 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
tqs->stqs_taskq = NULL;
}
+#ifdef _KERNEL
+/*
+ * The READ and WRITE rows of zio_taskqs are configurable at module load time
+ * by setting zio_taskq_read or zio_taskq_write.
+ *
+ * Example (the defaults for READ and WRITE)
+ * zio_taskq_read='fixed,1,8 null scale null'
+ * zio_taskq_write='batch fixed,1,5 scale fixed,1,5'
+ *
+ * Each sets the entire row at a time.
+ *
+ * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
+ * of threads per taskq.
+ *
+ * 'null' can only be set on the high-priority queues (queue selection for
+ * high-priority queues will fall back to the regular queue if the high-pri
+ * is NULL).
+ */
+static const char *const modes[ZTI_NMODES] = {
+ "fixed", "batch", "scale", "null"
+};
+
+/* Parse the incoming config string. Modifies cfg */
+static int
+spa_taskq_param_set(zio_type_t t, char *cfg)
+{
+ int err = 0;
+
+ zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};
+
+ char *next = cfg, *tok, *c;
+
+ /*
+ * Parse out each element from the string and fill `row`. The entire
+ * row has to be set at once, so any errors are flagged by just
+ * breaking out of this loop early.
+ */
+ uint_t q;
+ for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ /* `next` is the start of the config */
+ if (next == NULL)
+ break;
+
+ /* Eat up leading space */
+ while (isspace(*next))
+ next++;
+ if (*next == '\0')
+ break;
+
+ /* Mode ends at space or end of string */
+ tok = next;
+ next = strchr(tok, ' ');
+ if (next != NULL) *next++ = '\0';
+
+ /* Parameters start after a comma */
+ c = strchr(tok, ',');
+ if (c != NULL) *c++ = '\0';
+
+ /* Match mode string */
+ uint_t mode;
+ for (mode = 0; mode < ZTI_NMODES; mode++)
+ if (strcmp(tok, modes[mode]) == 0)
+ break;
+ if (mode == ZTI_NMODES)
+ break;
+
+ /* Invalid canary */
+ row[q].zti_mode = ZTI_NMODES;
+
+ /* Per-mode setup */
+ switch (mode) {
+
+ /*
+ * FIXED is parameterised: number of queues, and number of
+ * threads per queue.
+ */
+ case ZTI_MODE_FIXED: {
+ /* No parameters? */
+ if (c == NULL || *c == '\0')
+ break;
+
+ /* Find next parameter */
+ tok = c;
+ c = strchr(tok, ',');
+ if (c == NULL)
+ break;
+
+ /* Take digits and convert */
+ unsigned long long nq;
+ if (!(isdigit(*tok)))
+ break;
+ err = ddi_strtoull(tok, &tok, 10, &nq);
+ /* Must succeed and also end at the next param sep */
+ if (err != 0 || tok != c)
+ break;
+
+ /* Move past the comma */
+ tok++;
+ /* Need another number */
+ if (!(isdigit(*tok)))
+ break;
+ /* Remember start to make sure we moved */
+ c = tok;
+
+ /* Take digits */
+ unsigned long long ntpq;
+ err = ddi_strtoull(tok, &tok, 10, &ntpq);
+ /* Must succeed, and moved forward */
+ if (err != 0 || tok == c || *tok != '\0')
+ break;
+
+ /*
+ * sanity; zero queues/threads make no sense, and
+ * 16K is almost certainly more than anyone will ever
+ * need and avoids silly numbers like UINT32_MAX
+ */
+ if (nq == 0 || nq >= 16384 ||
+ ntpq == 0 || ntpq >= 16384)
+ break;
+
+ const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
+ row[q] = zti;
+ break;
+ }
+
+ case ZTI_MODE_BATCH: {
+ const zio_taskq_info_t zti = ZTI_BATCH;
+ row[q] = zti;
+ break;
+ }
+
+ case ZTI_MODE_SCALE: {
+ const zio_taskq_info_t zti = ZTI_SCALE;
+ row[q] = zti;
+ break;
+ }
+
+ case ZTI_MODE_NULL: {
+ /*
+ * Can only null the high-priority queues; the general-
+ * purpose ones have to exist.
+ */
+ if (q != ZIO_TASKQ_ISSUE_HIGH &&
+ q != ZIO_TASKQ_INTERRUPT_HIGH)
+ break;
+
+ const zio_taskq_info_t zti = ZTI_NULL;
+ row[q] = zti;
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ /* Ensure we set a mode */
+ if (row[q].zti_mode == ZTI_NMODES)
+ break;
+ }
+
+ /* Didn't get a full row, fail */
+ if (q < ZIO_TASKQ_TYPES)
+ return (SET_ERROR(EINVAL));
+
+ /* Eat trailing space */
+ if (next != NULL)
+ while (isspace(*next))
+ next++;
+
+ /* If there's anything left over then fail */
+ if (next != NULL && *next != '\0')
+ return (SET_ERROR(EINVAL));
+
+ /* Success! Copy it into the real config */
+ for (q = 0; q < ZIO_TASKQ_TYPES; q++)
+ zio_taskqs[t][q] = row[q];
+
+ return (0);
+}
+
+static int
+spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline)
+{
+ int pos = 0;
+
+ /* Build parameter string from live config */
+ const char *sep = "";
+ for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ const zio_taskq_info_t *zti = &zio_taskqs[t][q];
+ if (zti->zti_mode == ZTI_MODE_FIXED)
+ pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
+ modes[zti->zti_mode], zti->zti_count,
+ zti->zti_value);
+ else
+ pos += sprintf(&buf[pos], "%s%s", sep,
+ modes[zti->zti_mode]);
+ sep = " ";
+ }
+
+ if (add_newline)
+ buf[pos++] = '\n';
+ buf[pos] = '\0';
+
+ return (pos);
+}
+
+#ifdef __linux__
+static int
+spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
+{
+ char *cfg = kmem_strdup(val);
+ int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
+ kmem_free(cfg, strlen(val)+1);
+ return (-err);
+}
+static int
+spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
+{
+ return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE));
+}
+
+static int
+spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
+{
+ char *cfg = kmem_strdup(val);
+ int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
+ kmem_free(cfg, strlen(val)+1);
+ return (-err);
+}
+static int
+spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
+{
+ return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE));
+}
+#else
+/*
+ * On FreeBSD load-time parameters can be set up before malloc() is available,
+ * so we have to do all the parsing work on the stack.
+ */
+#define SPA_TASKQ_PARAM_MAX (128)
+
+static int
+spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
+{
+ char buf[SPA_TASKQ_PARAM_MAX];
+ int err;
+
+ (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE);
+ err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+ if (err || req->newptr == NULL)
+ return (err);
+ return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
+}
+
+static int
+spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
+{
+ char buf[SPA_TASKQ_PARAM_MAX];
+ int err;
+
+ (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE);
+ err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+ if (err || req->newptr == NULL)
+ return (err);
+ return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
+}
+#endif
+#endif /* _KERNEL */
+
/*
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
* Note that a type may have multiple discrete taskqs to avoid lock contention
@@ -6261,6 +6531,16 @@ spa_tryimport(nvlist_t *tryconfig)
spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
}
+ /*
+ * spa_import() relies on a pool config fetched by spa_tryimport()
+ * for spare/cache devices. Import flags are not passed to
+ * spa_tryimport(), which makes it return early due to a missing log
+ * device, so the cache and spare devices are never retrieved.
+ * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch
+ * the correct configuration regardless of the missing log device.
+ */
+ spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;
+
error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
/*
@@ -6747,9 +7027,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- if (dsl_scan_resilvering(spa_get_dsl(spa)))
+ if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
+ dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
return (spa_vdev_exit(spa, NULL, txg,
ZFS_ERR_RESILVER_IN_PROGRESS));
+ }
} else {
if (vdev_rebuild_active(rvd))
return (spa_vdev_exit(spa, NULL, txg,
@@ -6987,7 +7269,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
* Detach a device from a mirror or replacing vdev.
*
* If 'replace_done' is specified, only detach if the parent
- * is a replacing vdev.
+ * is a replacing or a spare vdev.
*/
int
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
@@ -7294,6 +7576,10 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
mutex_exit(&vd->vdev_initialize_lock);
return (SET_ERROR(ESRCH));
+ } else if (cmd_type == POOL_INITIALIZE_UNINIT &&
+ vd->vdev_initialize_thread != NULL) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ return (SET_ERROR(EBUSY));
}
switch (cmd_type) {
@@ -7306,6 +7592,9 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
case POOL_INITIALIZE_SUSPEND:
vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
break;
+ case POOL_INITIALIZE_UNINIT:
+ vdev_uninitialize(vd);
+ break;
default:
panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
}
@@ -8210,7 +8499,8 @@ spa_async_thread(void *arg)
* If any devices are done replacing, detach them.
*/
if (tasks & SPA_ASYNC_RESILVER_DONE ||
- tasks & SPA_ASYNC_REBUILD_DONE) {
+ tasks & SPA_ASYNC_REBUILD_DONE ||
+ tasks & SPA_ASYNC_DETACH_SPARE) {
spa_vdev_resilver_done(spa);
}
@@ -9986,4 +10276,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT
ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW,
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
"was being condensed");
+
+#ifdef _KERNEL
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
+ spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
+ "Configure IO queues for read IO");
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
+ spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
+ "Configure IO queues for write IO");
+#endif
/* END CSTYLED */
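
For reference, the accepted values for the new zio_taskq_read / zio_taskq_write parameters are plain strings with one token per taskq type (ISSUE, ISSUE_HIGH, INTR, INTR_HIGH); the examples below are taken verbatim from the comment in the patch and annotated as C constants. They are registered ZMOD_RD, so they are meant to be supplied at module load time rather than changed on a live system.

/* Defaults, as documented above.  "fixed,Q,T" means Q taskqs of T threads;
 * "null" is only valid for the two high-priority slots and makes selection
 * fall back to the corresponding regular queue. */
static const char *const zio_taskq_read_default =
    "fixed,1,8 null scale null";
static const char *const zio_taskq_write_default =
    "batch fixed,1,5 scale fixed,1,5";
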
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index a57f0727db31..113943026d59 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -494,8 +494,9 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
return (1);
}
-void
-spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
+static void
+spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
+ int mmp_flag)
{
(void) tag;
int wlocks_held = 0;
@@ -510,7 +511,8 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
continue;
mutex_enter(&scl->scl_lock);
if (rw == RW_READER) {
- while (scl->scl_writer || scl->scl_write_wanted) {
+ while (scl->scl_writer ||
+ (!mmp_flag && scl->scl_write_wanted)) {
cv_wait(&scl->scl_cv, &scl->scl_lock);
}
} else {
@@ -529,6 +531,27 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
}
void
+spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
+{
+ spa_config_enter_impl(spa, locks, tag, rw, 0);
+}
+
+/*
+ * spa_config_enter_mmp() allows the mmp thread to cut in front of
+ * outstanding write lock requests. This is needed since the mmp updates are
+ * time sensitive and failure to service them promptly will result in a
+ * suspended pool. This pool suspension has been seen in practice when there is
+ * a single disk in a pool that is responding slowly and presumably about to
+ * fail.
+ */
+
+void
+spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw)
+{
+ spa_config_enter_impl(spa, locks, tag, rw, 1);
+}
+
+void
spa_config_exit(spa_t *spa, int locks, const void *tag)
{
(void) tag;
@@ -2564,7 +2587,6 @@ spa_scan_stat_init(spa_t *spa)
spa->spa_scan_pass_scrub_spent_paused = 0;
spa->spa_scan_pass_exam = 0;
spa->spa_scan_pass_issued = 0;
- vdev_scan_stat_init(spa->spa_root_vdev);
}
/*
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 4b9d7e7c0506..57259b8ce88e 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -28,7 +28,7 @@
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Datto Inc. All rights reserved.
- * Copyright [2021] Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
*/
#include <sys/zfs_context.h>
@@ -2646,6 +2646,17 @@ vdev_reopen(vdev_t *vd)
}
/*
+ * Recheck if resilver is still needed and cancel any
+ * scheduled resilver if resilver is unneeded.
+ */
+ if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
+ spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
+ mutex_exit(&spa->spa_async_lock);
+ }
+
+ /*
* Reassess parent vdev's health.
*/
vdev_propagate_state(vd);
@@ -3983,11 +3994,18 @@ vdev_remove_wanted(spa_t *spa, uint64_t guid)
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
/*
- * If the vdev is already removed, then don't do anything.
+ * If the vdev is already removed, or expanding which can trigger
+ * repartition add/remove events, then don't do anything.
*/
- if (vd->vdev_removed)
+ if (vd->vdev_removed || vd->vdev_expanding)
return (spa_vdev_state_exit(spa, NULL, 0));
+ /*
+ * Confirm the vdev has been removed; otherwise don't do anything.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL)))
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));
+
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_REMOVE);
@@ -4085,9 +4103,19 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
if (wasoffline ||
(oldstate < VDEV_STATE_DEGRADED &&
- vd->vdev_state >= VDEV_STATE_DEGRADED))
+ vd->vdev_state >= VDEV_STATE_DEGRADED)) {
spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
+ /*
+ * Asynchronously detach spare vdev if resilver or
+ * rebuild is not required
+ */
+ if (vd->vdev_unspare &&
+ !dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) &&
+ !vdev_rebuild_active(tvd))
+ spa_async_request(spa, SPA_ASYNC_DETACH_SPARE);
+ }
return (spa_vdev_state_exit(spa, vd, 0));
}
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 8762855d46aa..9e4c115f212c 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -270,7 +270,7 @@ typedef struct indirect_split {
*/
indirect_child_t *is_good_child;
- indirect_child_t is_child[1]; /* variable-length */
+ indirect_child_t is_child[];
} indirect_split_t;
/*
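
The is_child[1] to is_child[] change swaps the old one-element-array idiom for a C99 flexible array member; below is a generic, self-contained illustration of the allocation pattern that goes with it (this is not the actual indirect_split_t code, which lives in the vdev_indirect I/O path):

#include <stdlib.h>

typedef struct child {
	int	c_state;
} child_t;

typedef struct split {
	int	s_children;
	child_t	s_child[];	/* flexible array member, sized at allocation */
} split_t;

static split_t *
split_alloc(int n)
{
	/* sizeof (split_t) no longer counts a phantom first element. */
	split_t *s = calloc(1, sizeof (split_t) + (size_t)n * sizeof (child_t));
	if (s != NULL)
		s->s_children = n;
	return (s);
}
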
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index 6ffd0d618fdd..5d90fd67cc2f 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -101,6 +101,39 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
}
static void
+vdev_initialize_zap_remove_sync(void *arg, dmu_tx_t *tx)
+{
+ uint64_t guid = *(uint64_t *)arg;
+
+ kmem_free(arg, sizeof (uint64_t));
+
+ vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ return;
+
+ ASSERT3S(vd->vdev_initialize_state, ==, VDEV_INITIALIZE_NONE);
+ ASSERT3U(vd->vdev_leaf_zap, !=, 0);
+
+ vd->vdev_initialize_last_offset = 0;
+ vd->vdev_initialize_action_time = 0;
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ int error;
+
+ error = zap_remove(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, tx);
+ VERIFY(error == 0 || error == ENOENT);
+
+ error = zap_remove(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_STATE, tx);
+ VERIFY(error == 0 || error == ENOENT);
+
+ error = zap_remove(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, tx);
+ VERIFY(error == 0 || error == ENOENT);
+}
+
+static void
vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
{
ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
@@ -127,8 +160,14 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
- dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
- guid, tx);
+
+ if (new_state != VDEV_INITIALIZE_NONE) {
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_initialize_zap_update_sync, guid, tx);
+ } else {
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_initialize_zap_remove_sync, guid, tx);
+ }
switch (new_state) {
case VDEV_INITIALIZE_ACTIVE:
@@ -149,6 +188,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
spa_history_log_internal(spa, "initialize", tx,
"vdev=%s complete", vd->vdev_path);
break;
+ case VDEV_INITIALIZE_NONE:
+ spa_history_log_internal(spa, "uninitialize", tx,
+ "vdev=%s", vd->vdev_path);
+ break;
default:
panic("invalid state %llu", (unsigned long long)new_state);
}
@@ -605,6 +648,24 @@ vdev_initialize(vdev_t *vd)
}
/*
+ * Uninitializes a device. Caller must hold vdev_initialize_lock.
+ * Device must be a leaf and not already be initializing.
+ */
+void
+vdev_uninitialize(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_initialize_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_NONE);
+}
+
+/*
* Wait for the initialize thread to be terminated (cancelled or stopped).
*/
static void
@@ -760,6 +821,7 @@ vdev_initialize_restart(vdev_t *vd)
}
EXPORT_SYMBOL(vdev_initialize);
+EXPORT_SYMBOL(vdev_uninitialize);
EXPORT_SYMBOL(vdev_initialize_stop);
EXPORT_SYMBOL(vdev_initialize_stop_all);
EXPORT_SYMBOL(vdev_initialize_stop_wait);
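
Putting the pieces together, a hedged caller sketch for the new uninitialize path (uninit_leaf() is hypothetical; the real entry point is zfs_ioc_pool_initialize() with POOL_INITIALIZE_UNINIT, which goes through spa_vdev_initialize_impl()):

/*
 * Sketch only: uninitializing a leaf vdev.  Mirrors the locking and state
 * checks above; spa_vdev_initialize_impl() additionally returns EBUSY when
 * an initialize thread is still running.
 */
static void
uninit_leaf(vdev_t *vd)
{
	mutex_enter(&vd->vdev_initialize_lock);
	if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd) &&
	    vd->vdev_initialize_thread == NULL) {
		/* Resets state to VDEV_INITIALIZE_NONE and schedules
		 * vdev_initialize_zap_remove_sync() to drop the leaf ZAP keys. */
		vdev_uninitialize(vd);
	}
	mutex_exit(&vd->vdev_initialize_lock);
}
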
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index ec6bbc6fc610..277c14ec1ad7 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -468,6 +468,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_isspare)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
+ if (flags & VDEV_CONFIG_L2CACHE)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
+
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
vd == vd->vdev_top) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
@@ -1100,6 +1103,16 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
POOL_STATE_L2CACHE) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
vd->vdev_guid) == 0);
+
+ /*
+ * This is merely to facilitate reporting the ashift of the
+ * cache device through zdb. The actual retrieval of the
+ * ashift (in vdev_alloc()) uses the nvlist
+ * spa->spa_l2cache->sav_config (populated in
+ * spa_ld_open_aux_vdevs()).
+ */
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT,
+ vd->vdev_ashift) == 0);
} else {
uint64_t txg = 0ULL;
diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c
index 9dfbe0cf6f30..b180fa14682e 100644
--- a/module/zfs/vdev_rebuild.c
+++ b/module/zfs/vdev_rebuild.c
@@ -34,6 +34,7 @@
#include <sys/zio.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
+#include <sys/arc_impl.h>
#include <sys/zap.h>
/*
@@ -116,13 +117,12 @@ unsigned long zfs_rebuild_max_segment = 1024 * 1024;
* segment size is also large (zfs_rebuild_max_segment=1M). This helps keep
* the queue depth short.
*
- * 32MB was selected as the default value to achieve good performance with
- * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential
- * rebuild was unable to saturate all of the drives using smaller values.
- * With a value of 32MB the sequential resilver write rate was measured at
- * 800MB/s sustained while rebuilding to a distributed spare.
+ * 64MB was observed to deliver the best performance and is set as the default.
+ * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c)
+ * and a rebuild rate of 1.2GB/s was measured to the distributed spare.
+ * Smaller values were unable to fully saturate the available pool I/O.
*/
-unsigned long zfs_rebuild_vdev_limit = 32 << 20;
+unsigned long zfs_rebuild_vdev_limit = 64 << 20;
/*
* Automatically start a pool scrub when the last active sequential resilver
@@ -754,6 +754,7 @@ vdev_rebuild_thread(void *arg)
{
vdev_t *vd = arg;
spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
int error = 0;
/*
@@ -786,9 +787,6 @@ vdev_rebuild_thread(void *arg)
vr->vr_pass_bytes_scanned = 0;
vr->vr_pass_bytes_issued = 0;
- vr->vr_bytes_inflight_max = MAX(1ULL << 20,
- zfs_rebuild_vdev_limit * vd->vdev_children);
-
uint64_t update_est_time = gethrtime();
vdev_rebuild_update_bytes_est(vd, 0);
@@ -805,6 +803,17 @@ vdev_rebuild_thread(void *arg)
vr->vr_scan_msp = msp;
/*
+ * Calculate the max number of in-flight bytes for top-level
+ * vdev scanning operations (minimum 1MB, maximum 1/4 of
+ * arc_c_max shared by all top-level vdevs). Limits for the
+ * issuing phase are done per top-level vdev and are handled
+ * separately.
+ */
+ uint64_t limit = (arc_c_max / 4) / MAX(rvd->vdev_children, 1);
+ vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20,
+ zfs_rebuild_vdev_limit * vd->vdev_children));
+
+ /*
* Removal of vdevs from the vdev tree may eliminate the need
* for the rebuild, in which case it should be canceled. The
* vdev_rebuild_cancel_wanted flag is set until the sync task
diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c
index 92daed48f3d5..c0ce2ac28dc5 100644
--- a/module/zfs/vdev_trim.c
+++ b/module/zfs/vdev_trim.c
@@ -23,6 +23,7 @@
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
* Copyright (c) 2021 Hewlett Packard Enterprise Development LP
+ * Copyright 2023 RackTop Systems, Inc.
*/
#include <sys/spa.h>
@@ -572,6 +573,7 @@ vdev_trim_ranges(trim_args_t *ta)
uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
spa_t *spa = vd->vdev_spa;
+ int error = 0;
ta->trim_start_time = gethrtime();
ta->trim_bytes_done = 0;
@@ -591,19 +593,32 @@ vdev_trim_ranges(trim_args_t *ta)
uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;
for (uint64_t w = 0; w < writes_required; w++) {
- int error;
-
error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
rs_get_start(rs, ta->trim_tree) +
(w *extent_bytes_max), MIN(size -
(w * extent_bytes_max), extent_bytes_max));
if (error != 0) {
- return (error);
+ goto done;
}
}
}
- return (0);
+done:
+ /*
+ * Make sure all TRIMs for this metaslab have completed before
+ * returning. TRIM zios have lower priority than regular or syncing
+ * zios, so all TRIM zios for this metaslab must complete before the
+ * metaslab is re-enabled. Otherwise it's possible write zios to
+ * this metaslab could cut ahead of still queued TRIM zios for this
+ * metaslab causing corruption if the ranges overlap.
+ */
+ mutex_enter(&vd->vdev_trim_io_lock);
+ while (vd->vdev_trim_inflight[0] > 0) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ return (error);
}
static void
@@ -922,11 +937,6 @@ vdev_trim_thread(void *arg)
}
spa_config_exit(spa, SCL_CONFIG, FTAG);
- mutex_enter(&vd->vdev_trim_io_lock);
- while (vd->vdev_trim_inflight[0] > 0) {
- cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
- }
- mutex_exit(&vd->vdev_trim_io_lock);
range_tree_destroy(ta.trim_tree);
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index a4b391cbea12..f441328f3018 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3985,7 +3985,8 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
cmd_type == POOL_INITIALIZE_START ||
- cmd_type == POOL_INITIALIZE_SUSPEND)) {
+ cmd_type == POOL_INITIALIZE_SUSPEND ||
+ cmd_type == POOL_INITIALIZE_UNINIT)) {
return (SET_ERROR(EINVAL));
}
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index b9498d17ee2f..0987fd0f7bb7 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -68,7 +68,9 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
+ atomic_inc_32(&zp->z_sync_writes_cnt);
zil_commit(zfsvfs->z_log, zp->z_id);
+ atomic_dec_32(&zp->z_sync_writes_cnt);
ZFS_EXIT(zfsvfs);
}
tsd_set(zfs_fsyncer_key, NULL);
@@ -102,7 +104,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
hole = B_FALSE;
/* Flush any mmap()'d data to disk */
- if (zn_has_cached_data(zp))
+ if (zn_has_cached_data(zp, 0, file_sz - 1))
zn_flush_cached_data(zp, B_FALSE);
lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
@@ -275,7 +277,8 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
error = mappedread_sf(zp, nbytes, uio);
else
#endif
- if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) {
+ if (zn_has_cached_data(zp, zfs_uio_offset(uio),
+ zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) {
error = mappedread(zp, nbytes, uio);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
@@ -686,7 +689,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
zfs_uioskip(uio, nbytes);
tx_bytes = nbytes;
}
- if (tx_bytes && zn_has_cached_data(zp) &&
+ if (tx_bytes &&
+ zn_has_cached_data(zp, woff, woff + tx_bytes - 1) &&
!(ioflag & O_DIRECT)) {
update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
}
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index aaf509a2fc73..a4f7c008935d 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -226,11 +226,10 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
*/
static int
zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
- blkptr_t *nbp, void *dst, char **end)
+ blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf)
{
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t aflags = ARC_FLAG_WAIT;
- arc_buf_t *abuf = NULL;
zbookmark_phys_t zb;
int error;
@@ -247,7 +246,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
- &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+ abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
zio_cksum_t cksum = bp->blk_cksum;
@@ -262,23 +261,23 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
*/
cksum.zc_word[ZIL_ZC_SEQ]++;
+ uint64_t size = BP_GET_LSIZE(bp);
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
- zil_chain_t *zilc = abuf->b_data;
+ zil_chain_t *zilc = (*abuf)->b_data;
char *lr = (char *)(zilc + 1);
- uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
- sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
+ zilc->zc_nused < sizeof (*zilc) ||
+ zilc->zc_nused > size) {
error = SET_ERROR(ECKSUM);
} else {
- ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
- bcopy(lr, dst, len);
- *end = (char *)dst + len;
+ *begin = lr;
+ *end = lr + zilc->zc_nused - sizeof (*zilc);
*nbp = zilc->zc_next_blk;
}
} else {
- char *lr = abuf->b_data;
- uint64_t size = BP_GET_LSIZE(bp);
+ char *lr = (*abuf)->b_data;
zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
@@ -286,15 +285,11 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
(zilc->zc_nused > (size - sizeof (*zilc)))) {
error = SET_ERROR(ECKSUM);
} else {
- ASSERT3U(zilc->zc_nused, <=,
- SPA_OLD_MAXBLOCKSIZE);
- bcopy(lr, dst, zilc->zc_nused);
- *end = (char *)dst + zilc->zc_nused;
+ *begin = lr;
+ *end = lr + zilc->zc_nused;
*nbp = zilc->zc_next_blk;
}
}
-
- arc_buf_destroy(abuf, &abuf);
}
return (error);
@@ -362,7 +357,6 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
uint64_t blk_count = 0;
uint64_t lr_count = 0;
blkptr_t blk, next_blk;
- char *lrbuf, *lrp;
int error = 0;
bzero(&next_blk, sizeof (blkptr_t));
@@ -382,13 +376,13 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
- lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
zil_bp_tree_init(zilog);
for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
int reclen;
- char *end = NULL;
+ char *lrp, *end;
+ arc_buf_t *abuf = NULL;
if (blk_seq > claim_blk_seq)
break;
@@ -404,8 +398,10 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
break;
error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
- lrbuf, &end);
+ &lrp, &end, &abuf);
if (error != 0) {
+ if (abuf)
+ arc_buf_destroy(abuf, &abuf);
if (claimed) {
char name[ZFS_MAX_DATASET_NAME_LEN];
@@ -418,20 +414,25 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
break;
}
- for (lrp = lrbuf; lrp < end; lrp += reclen) {
+ for (; lrp < end; lrp += reclen) {
lr_t *lr = (lr_t *)lrp;
reclen = lr->lrc_reclen;
ASSERT3U(reclen, >=, sizeof (lr_t));
- if (lr->lrc_seq > claim_lr_seq)
+ if (lr->lrc_seq > claim_lr_seq) {
+ arc_buf_destroy(abuf, &abuf);
goto done;
+ }
error = parse_lr_func(zilog, lr, arg, txg);
- if (error != 0)
+ if (error != 0) {
+ arc_buf_destroy(abuf, &abuf);
goto done;
+ }
ASSERT3U(max_lr_seq, <, lr->lrc_seq);
max_lr_seq = lr->lrc_seq;
lr_count++;
}
+ arc_buf_destroy(abuf, &abuf);
}
done:
zilog->zl_parse_error = error;
@@ -441,7 +442,6 @@ done:
zilog->zl_parse_lr_count = lr_count;
zil_bp_tree_fini(zilog);
- zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
return (error);
}
@@ -1593,6 +1593,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
ASSERT3U(wsz, <=, lwb->lwb_sz);
zio_shrink(lwb->lwb_write_zio, wsz);
+ wsz = lwb->lwb_write_zio->io_size;
} else {
wsz = lwb->lwb_sz;
@@ -2848,7 +2849,14 @@ static void
zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
{
dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
- VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ /*
+ * Since we are not going to create any new dirty data, and we
+ * can even help with clearing the existing dirty data, we
+ * should not be subject to the dirty data based delays. We
+ * use TXG_NOTHROTTLE to bypass the delay mechanism.
+ */
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
itx->itx_sync = B_TRUE;
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 700f8791045f..c367ef7211aa 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2287,7 +2287,7 @@ zio_nowait(zio_t *zio)
ASSERT3P(zio->io_executor, ==, NULL);
if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
- zio_unique_parent(zio) == NULL) {
+ list_is_empty(&zio->io_parent_list)) {
zio_t *pio;
/*