Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/Makefile.in        |   9
-rw-r--r--  module/zfs/abd.c              |  17
-rw-r--r--  module/zfs/arc.c              |  73
-rw-r--r--  module/zfs/dmu_recv.c         |  48
-rw-r--r--  module/zfs/dmu_send.c         |   8
-rw-r--r--  module/zfs/dmu_tx.c           | 105
-rw-r--r--  module/zfs/dnode.c            |  43
-rw-r--r--  module/zfs/dsl_deadlist.c     |  26
-rw-r--r--  module/zfs/dsl_scan.c         |  56
-rw-r--r--  module/zfs/mmp.c              |   2
-rw-r--r--  module/zfs/spa.c              | 307
-rw-r--r--  module/zfs/spa_misc.c         |  30
-rw-r--r--  module/zfs/vdev.c             |  36
-rw-r--r--  module/zfs/vdev_indirect.c    |   2
-rw-r--r--  module/zfs/vdev_initialize.c  |  66
-rw-r--r--  module/zfs/vdev_label.c       |  13
-rw-r--r--  module/zfs/vdev_rebuild.c     |  27
-rw-r--r--  module/zfs/vdev_trim.c        |  28
-rw-r--r--  module/zfs/zfs_ioctl.c        |   3
-rw-r--r--  module/zfs/zfs_vnops.c        |  10
-rw-r--r--  module/zfs/zil.c              |  60
-rw-r--r--  module/zfs/zio.c              |   2
22 files changed, 825 insertions(+), 146 deletions(-)
| diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 653ea0da9bcc..d9b86890b5f5 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -154,4 +154,13 @@ ifeq ($(CONFIG_ALTIVEC),y)  $(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec  endif +ifeq ($(CONFIG_ARM64),y) +CFLAGS_REMOVE_vdev_raidz_math_aarch64_neon.o += -mgeneral-regs-only +CFLAGS_REMOVE_vdev_raidz_math_aarch64_neonx2.o += -mgeneral-regs-only +endif + +UBSAN_SANITIZE_zap_leaf.o := n +UBSAN_SANITIZE_zap_micro.o := n +UBSAN_SANITIZE_sa.o := n +  include $(mfdir)/../os/linux/zfs/Makefile diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 8ee8e7e57420..754974a559b6 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -109,7 +109,6 @@ void  abd_verify(abd_t *abd)  {  #ifdef ZFS_DEBUG -	ASSERT3U(abd->abd_size, >, 0);  	ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);  	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |  	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | @@ -118,6 +117,7 @@ abd_verify(abd_t *abd)  	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));  	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);  	if (abd_is_linear(abd)) { +		ASSERT3U(abd->abd_size, >, 0);  		ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL);  	} else if (abd_is_gang(abd)) {  		uint_t child_sizes = 0; @@ -130,6 +130,7 @@ abd_verify(abd_t *abd)  		}  		ASSERT3U(abd->abd_size, ==, child_sizes);  	} else { +		ASSERT3U(abd->abd_size, >, 0);  		abd_verify_scatter(abd);  	}  #endif @@ -369,7 +370,20 @@ abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)  		 * will retain all the free_on_free settings after being  		 * added to the parents list.  		 */ +#ifdef ZFS_DEBUG +		/* +		 * If cabd had abd_parent, we have to drop it here.  We can't +		 * transfer it to pabd, nor we can clear abd_size leaving it. +		 */ +		if (cabd->abd_parent != NULL) { +			(void) zfs_refcount_remove_many( +			    &cabd->abd_parent->abd_children, +			    cabd->abd_size, cabd); +			cabd->abd_parent = NULL; +		} +#endif  		pabd->abd_size += cabd->abd_size; +		cabd->abd_size = 0;  		list_move_tail(&ABD_GANG(pabd).abd_gang_chain,  		    &ABD_GANG(cabd).abd_gang_chain);  		ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); @@ -407,7 +421,6 @@ abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)  	 */  	if (abd_is_gang(cabd)) {  		ASSERT(!list_link_active(&cabd->abd_gang_link)); -		ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain));  		return (abd_gang_add_gang(pabd, cabd, free_on_free));  	}  	ASSERT(!abd_is_gang(cabd)); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 6900b6b134d9..1180853da038 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -946,7 +946,7 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,      l2arc_dev_t *dev);  /* L2ARC persistence write I/O routines. */ -static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, +static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,      l2arc_write_callback_t *cb);  /* L2ARC persistence auxiliary routines. */ @@ -8415,7 +8415,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)  static uint64_t  l2arc_write_size(l2arc_dev_t *dev)  { -	uint64_t size, dev_size, tsize; +	uint64_t size;  	/*  	 * Make sure our globals have meaningful values in case the user @@ -8432,18 +8432,23 @@ l2arc_write_size(l2arc_dev_t *dev)  	if (arc_warm == B_FALSE)  		size += l2arc_write_boost; +	/* We need to add in the worst case scenario of log block overhead. 
*/ +	size += l2arc_log_blk_overhead(size, dev); +	if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { +		/* +		 * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) +		 * times the writesize, whichever is greater. +		 */ +		size += MAX(64 * 1024 * 1024, +		    (size * l2arc_trim_ahead) / 100); +	} +  	/*  	 * Make sure the write size does not exceed the size of the cache  	 * device. This is important in l2arc_evict(), otherwise infinite  	 * iteration can occur.  	 */ -	dev_size = dev->l2ad_end - dev->l2ad_start; -	tsize = size + l2arc_log_blk_overhead(size, dev); -	if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) -		tsize += MAX(64 * 1024 * 1024, -		    (tsize * l2arc_trim_ahead) / 100); - -	if (tsize >= dev_size) { +	if (size > dev->l2ad_end - dev->l2ad_start) {  		cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "  		    "plus the overhead of log blocks (persistent L2ARC, "  		    "%llu bytes) exceeds the size of the cache device " @@ -8452,8 +8457,19 @@ l2arc_write_size(l2arc_dev_t *dev)  		    dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);  		size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; +		if (l2arc_trim_ahead > 1) { +			cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1"); +			l2arc_trim_ahead = 1; +		} +  		if (arc_warm == B_FALSE)  			size += l2arc_write_boost; + +		size += l2arc_log_blk_overhead(size, dev); +		if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { +			size += MAX(64 * 1024 * 1024, +			    (size * l2arc_trim_ahead) / 100); +		}  	}  	return (size); @@ -9074,22 +9090,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)  	buflist = &dev->l2ad_buflist; -	/* -	 * We need to add in the worst case scenario of log block overhead. -	 */ -	distance += l2arc_log_blk_overhead(distance, dev); -	if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { -		/* -		 * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) -		 * times the write size, whichever is greater. -		 */ -		distance += MAX(64 * 1024 * 1024, -		    (distance * l2arc_trim_ahead) / 100); -	} -  top:  	rerun = B_FALSE; -	if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { +	if (dev->l2ad_hand + distance > dev->l2ad_end) {  		/*  		 * When there is no space to accommodate upcoming writes,  		 * evict to the end. Then bump the write and evict hands @@ -9283,7 +9286,7 @@ out:  		 */  		ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);  		if (!dev->l2ad_first) -			ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); +			ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);  	}  } @@ -9549,7 +9552,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)  			uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,  			    psize); -			if ((write_asize + asize) > target_sz) { +			/* +			 * If the allocated size of this buffer plus the max +			 * size for the pending log block exceeds the evicted +			 * target size, terminate writing buffers for this run. +			 */ +			if (write_asize + asize + +			    sizeof (l2arc_log_blk_phys_t) > target_sz) {  				full = B_TRUE;  				mutex_exit(hash_lock);  				break; @@ -9669,8 +9678,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)  			 * arcstat_l2_{size,asize} kstats are updated  			 * internally.  			 */ -			if (l2arc_log_blk_insert(dev, hdr)) -				l2arc_log_blk_commit(dev, pio, cb); +			if (l2arc_log_blk_insert(dev, hdr)) { +				/* +				 * l2ad_hand will be adjusted in +				 * l2arc_log_blk_commit(). 
+				 */ +				write_asize += +				    l2arc_log_blk_commit(dev, pio, cb); +			}  			zio_nowait(wzio);  		} @@ -10820,7 +10835,7 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev)   * This function allocates some memory to temporarily hold the serialized   * buffer to be written. This is then released in l2arc_write_done.   */ -static void +static uint64_t  l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)  {  	l2arc_log_blk_phys_t	*lb = &dev->l2ad_log_blk; @@ -10933,6 +10948,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)  	dev->l2ad_log_ent_idx = 0;  	dev->l2ad_log_blk_payload_asize = 0;  	dev->l2ad_log_blk_payload_start = 0; + +	return (asize);  }  /* diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 98ca2b3bcec1..6eb1009a788b 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -71,6 +71,12 @@ int zfs_recv_write_batch_size = 1024 * 1024;  static char *dmu_recv_tag = "dmu_recv_tag";  const char *recv_clone_name = "%recv"; +typedef enum { +	ORNS_NO, +	ORNS_YES, +	ORNS_MAYBE +} or_need_sync_t; +  static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len,      void *buf); @@ -121,6 +127,9 @@ struct receive_writer_arg {  	uint8_t or_iv[ZIO_DATA_IV_LEN];  	uint8_t or_mac[ZIO_DATA_MAC_LEN];  	boolean_t or_byteorder; + +	/* Keep track of DRR_FREEOBJECTS right after DRR_OBJECT_RANGE */ +	or_need_sync_t or_need_sync;  };  typedef struct dmu_recv_begin_arg { @@ -1524,17 +1533,19 @@ receive_handle_existing_object(const struct receive_writer_arg *rwa,  	}  	/* -	 * The dmu does not currently support decreasing nlevels -	 * or changing the number of dnode slots on an object. For -	 * non-raw sends, this does not matter and the new object -	 * can just use the previous one's nlevels. For raw sends, -	 * however, the structure of the received dnode (including -	 * nlevels and dnode slots) must match that of the send -	 * side. Therefore, instead of using dmu_object_reclaim(), -	 * we must free the object completely and call -	 * dmu_object_claim_dnsize() instead. +	 * The dmu does not currently support decreasing nlevels or changing +	 * indirect block size if there is already one, same as changing the +	 * number of of dnode slots on an object.  For non-raw sends this +	 * does not matter and the new object can just use the previous one's +	 * parameters.  For raw sends, however, the structure of the received +	 * dnode (including indirects and dnode slots) must match that of the +	 * send side.  Therefore, instead of using dmu_object_reclaim(), we +	 * must free the object completely and call dmu_object_claim_dnsize() +	 * instead.  	 */ -	if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) || +	if ((rwa->raw && ((doi->doi_indirection > 1 && +	    indblksz != doi->doi_metadata_block_size) || +	    drro->drr_nlevels < doi->doi_indirection)) ||  	    dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) {  		err = dmu_free_long_object(rwa->os, drro->drr_object);  		if (err != 0) @@ -1658,10 +1669,22 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,  		/* object was freed and we are about to allocate a new one */  		object_to_hold = DMU_NEW_OBJECT;  	} else { +		/* +		 * If the only record in this range so far was DRR_FREEOBJECTS +		 * with at least one actually freed object, it's possible that +		 * the block will now be converted to a hole. We need to wait +		 * for the txg to sync to prevent races. 
+		 */ +		if (rwa->or_need_sync == ORNS_YES) +			txg_wait_synced(dmu_objset_pool(rwa->os), 0); +  		/* object is free and we are about to allocate a new one */  		object_to_hold = DMU_NEW_OBJECT;  	} +	/* Only relevant for the first object in the range */ +	rwa->or_need_sync = ORNS_NO; +  	/*  	 * If this is a multi-slot dnode there is a chance that this  	 * object will expand into a slot that is already used by @@ -1856,6 +1879,9 @@ receive_freeobjects(struct receive_writer_arg *rwa,  		if (err != 0)  			return (err); + +		if (rwa->or_need_sync == ORNS_MAYBE) +			rwa->or_need_sync = ORNS_YES;  	}  	if (next_err != ESRCH)  		return (next_err); @@ -2298,6 +2324,8 @@ receive_object_range(struct receive_writer_arg *rwa,  	bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN);  	rwa->or_byteorder = byteorder; +	rwa->or_need_sync = ORNS_MAYBE; +  	return (0);  } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index cd9ecc07fd5c..0dd1ec210a1d 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -2797,6 +2797,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,  			}  			if (err == 0) { +				owned = B_TRUE;  				err = zap_lookup(dspp.dp->dp_meta_objset,  				    dspp.to_ds->ds_object,  				    DS_FIELD_RESUME_TOGUID, 8, 1, @@ -2810,21 +2811,24 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,  				    sizeof (dspp.saved_toname),  				    dspp.saved_toname);  			} -			if (err != 0) +			/* Only disown if there was an error in the lookups */ +			if (owned && (err != 0))  				dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);  			kmem_strfree(name);  		} else {  			err = dsl_dataset_own(dspp.dp, tosnap, dsflags,  			    FTAG, &dspp.to_ds); +			if (err == 0) +				owned = B_TRUE;  		} -		owned = B_TRUE;  	} else {  		err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG,  		    &dspp.to_ds);  	}  	if (err != 0) { +		/* Note: dsl dataset is not owned at this point */  		dsl_pool_rele(dspp.dp, FTAG);  		return (err);  	} diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 1eed0526b51d..063934f39493 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -291,6 +291,53 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)  }  static void +dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ +	dnode_t *dn = txh->txh_dnode; +	int err = 0; + +	if (len == 0) +		return; + +	(void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); + +	if (dn == NULL) +		return; + +	/* +	 * For i/o error checking, read the blocks that will be needed +	 * to perform the append; first level-0 block (if not aligned, i.e. +	 * if they are partial-block writes), no additional blocks are read. 
+	 */ +	if (dn->dn_maxblkid == 0) { +		if (off < dn->dn_datablksz && +		    (off > 0 || len < dn->dn_datablksz)) { +			err = dmu_tx_check_ioerr(NULL, dn, 0, 0); +			if (err != 0) { +				txh->txh_tx->tx_err = err; +			} +		} +	} else { +		zio_t *zio = zio_root(dn->dn_objset->os_spa, +		    NULL, NULL, ZIO_FLAG_CANFAIL); + +		/* first level-0 block */ +		uint64_t start = off >> dn->dn_datablkshift; +		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { +			err = dmu_tx_check_ioerr(zio, dn, 0, start); +			if (err != 0) { +				txh->txh_tx->tx_err = err; +			} +		} + +		err = zio_wait(zio); +		if (err != 0) { +			txh->txh_tx->tx_err = err; +		} +	} +} + +static void  dmu_tx_count_dnode(dmu_tx_hold_t *txh)  {  	(void) zfs_refcount_add_many(&txh->txh_space_towrite, @@ -331,6 +378,42 @@ dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)  }  /* + * Should be used when appending to an object and the exact offset is unknown. + * The write must occur at or beyond the specified offset.  Only the L0 block + * at provided offset will be prefetched. + */ +void +dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ +	dmu_tx_hold_t *txh; + +	ASSERT0(tx->tx_txg); +	ASSERT3U(len, <=, DMU_MAX_ACCESS); + +	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, +	    object, THT_APPEND, off, DMU_OBJECT_END); +	if (txh != NULL) { +		dmu_tx_count_append(txh, off, len); +		dmu_tx_count_dnode(txh); +	} +} + +void +dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ +	dmu_tx_hold_t *txh; + +	ASSERT0(tx->tx_txg); +	ASSERT3U(len, <=, DMU_MAX_ACCESS); + +	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END); +	if (txh != NULL) { +		dmu_tx_count_append(txh, off, len); +		dmu_tx_count_dnode(txh); +	} +} + +/*   * This function marks the transaction as being a "net free".  The end   * result is that refquotas will be disabled for this transaction, and   * this transaction will be able to use half of the pool space overhead @@ -638,6 +721,26 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)  				if (blkid == 0)  					match_offset = TRUE;  				break; +			case THT_APPEND: +				if (blkid >= beginblk && (blkid <= endblk || +				    txh->txh_arg2 == DMU_OBJECT_END)) +					match_offset = TRUE; + +				/* +				 * THT_WRITE used for bonus and spill blocks. +				 */ +				ASSERT(blkid != DMU_BONUS_BLKID && +				    blkid != DMU_SPILL_BLKID); + +				/* +				 * They might have to increase nlevels, +				 * thus dirtying the new TLIBs.  Or the +				 * might have to change the block size, +				 * thus dirying the new lvl=0 blk=0. +				 */ +				if (blkid == 0) +					match_offset = TRUE; +				break;  			case THT_FREE:  				/*  				 * We will dirty all the level 1 blocks in @@ -1421,6 +1524,8 @@ dmu_tx_fini(void)  EXPORT_SYMBOL(dmu_tx_create);  EXPORT_SYMBOL(dmu_tx_hold_write);  EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode); +EXPORT_SYMBOL(dmu_tx_hold_append); +EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode);  EXPORT_SYMBOL(dmu_tx_hold_free);  EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);  EXPORT_SYMBOL(dmu_tx_hold_zap); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index ed75c3bdf698..efebc443a210 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1773,7 +1773,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)  }  /* - * Checks if the dnode contains any uncommitted dirty records. + * Checks if the dnode itself is dirty, or is carrying any uncommitted records. 
+ * It is important to check both conditions, as some operations (eg appending + * to a file) can dirty both as a single logical unit, but they are not synced + * out atomically, so checking one and not the other can result in an object + * appearing to be clean mid-way through a commit. + * + * Do not change this lightly! If you get it wrong, dmu_offset_next() can + * detect a hole where there is really data, leading to silent corruption.   */  boolean_t  dnode_is_dirty(dnode_t *dn) @@ -1781,7 +1788,8 @@ dnode_is_dirty(dnode_t *dn)  	mutex_enter(&dn->dn_mtx);  	for (int i = 0; i < TXG_SIZE; i++) { -		if (multilist_link_active(&dn->dn_dirty_link[i])) { +		if (multilist_link_active(&dn->dn_dirty_link[i]) || +		    !list_is_empty(&dn->dn_dirty_records[i])) {  			mutex_exit(&dn->dn_mtx);  			return (B_TRUE);  		} @@ -1891,7 +1899,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)  	if (ibs == dn->dn_indblkshift)  		ibs = 0; -	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) +	if (size == dn->dn_datablksz && ibs == 0)  		return (0);  	rw_enter(&dn->dn_struct_rwlock, RW_WRITER); @@ -1914,24 +1922,25 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)  	if (ibs && dn->dn_nlevels != 1)  		goto fail; -	/* resize the old block */ -	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); -	if (err == 0) { -		dbuf_new_size(db, size, tx); -	} else if (err != ENOENT) { -		goto fail; -	} - -	dnode_setdblksz(dn, size);  	dnode_setdirty(dn, tx); -	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; +	if (size != dn->dn_datablksz) { +		/* resize the old block */ +		err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); +		if (err == 0) { +			dbuf_new_size(db, size, tx); +		} else if (err != ENOENT) { +			goto fail; +		} + +		dnode_setdblksz(dn, size); +		dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size; +		if (db) +			dbuf_rele(db, FTAG); +	}  	if (ibs) {  		dn->dn_indblkshift = ibs; -		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; +		dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;  	} -	/* release after we have fixed the blocksize in the dnode */ -	if (db) -		dbuf_rele(db, FTAG);  	rw_exit(&dn->dn_struct_rwlock);  	return (0); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index d5fe2ee56804..9827eb14728d 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -859,7 +859,7 @@ void  dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)  {  	zap_cursor_t zc, pzc; -	zap_attribute_t za, pza; +	zap_attribute_t *za, *pza;  	dmu_buf_t *bonus;  	dsl_deadlist_phys_t *dlp;  	dmu_object_info_t doi; @@ -874,28 +874,31 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)  		return;  	} +	za = kmem_alloc(sizeof (*za), KM_SLEEP); +	pza = kmem_alloc(sizeof (*pza), KM_SLEEP); +  	mutex_enter(&dl->dl_lock);  	/*  	 * Prefetch up to 128 deadlists first and then more as we progress.  	 * The limit is a balance between ARC use and diminishing returns.  	 
*/  	for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0; -	    (perror = zap_cursor_retrieve(&pzc, &pza)) == 0 && i < 128; +	    (perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128;  	    zap_cursor_advance(&pzc), i++) { -		dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer, -		    zfs_strtonum(pza.za_name, NULL)); +		dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, +		    zfs_strtonum(pza->za_name, NULL));  	}  	for (zap_cursor_init(&zc, dl->dl_os, obj); -	    (error = zap_cursor_retrieve(&zc, &za)) == 0; +	    (error = zap_cursor_retrieve(&zc, za)) == 0;  	    zap_cursor_advance(&zc)) { -		uint64_t mintxg = zfs_strtonum(za.za_name, NULL); -		dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); +		uint64_t mintxg = zfs_strtonum(za->za_name, NULL); +		dsl_deadlist_insert_bpobj(dl, za->za_first_integer, mintxg, tx);  		VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));  		if (perror == 0) { -			dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer, -			    zfs_strtonum(pza.za_name, NULL)); +			dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, +			    zfs_strtonum(pza->za_name, NULL));  			zap_cursor_advance(&pzc); -			perror = zap_cursor_retrieve(&pzc, &pza); +			perror = zap_cursor_retrieve(&pzc, pza);  		}  	}  	VERIFY3U(error, ==, ENOENT); @@ -908,6 +911,9 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)  	bzero(dlp, sizeof (*dlp));  	dmu_buf_rele(bonus, FTAG);  	mutex_exit(&dl->dl_lock); + +	kmem_free(za, sizeof (*za)); +	kmem_free(pza, sizeof (*pza));  }  /* diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f3c639b0d04e..f0a851ff53a9 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -37,6 +37,7 @@  #include <sys/dmu_tx.h>  #include <sys/dmu_objset.h>  #include <sys/arc.h> +#include <sys/arc_impl.h>  #include <sys/zap.h>  #include <sys/zio.h>  #include <sys/zfs_context.h> @@ -126,12 +127,21 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,  static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);  static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);  static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); -static uint64_t dsl_scan_count_data_disks(vdev_t *vd); +static uint64_t dsl_scan_count_data_disks(spa_t *spa);  extern int zfs_vdev_async_write_active_min_dirty_percent;  static int zfs_scan_blkstats = 0;  /* + * 'zpool status' uses bytes processed per pass to report throughput and + * estimate time remaining.  We define a pass to start when the scanning + * phase completes for a sequential resilver.  Optionally, this value + * may be used to reset the pass statistics every N txgs to provide an + * estimated completion time based on currently observed performance. + */ +static uint_t zfs_scan_report_txgs = 0; + +/*   * By default zfs will check to ensure it is not over the hard memory   * limit before each txg. If finer-grained control of this is needed   * this value can be set to 1 to enable checking before scanning each @@ -147,7 +157,7 @@ int zfs_scan_strict_mem_lim = B_FALSE;   * overload the drives with I/O, since that is protected by   * zfs_vdev_scrub_max_active.   
*/ -unsigned long zfs_scan_vdev_limit = 4 << 20; +unsigned long zfs_scan_vdev_limit = 16 << 20;  int zfs_scan_issue_strategy = 0;  int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ @@ -450,11 +460,12 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)  	/*  	 * Calculate the max number of in-flight bytes for pool-wide -	 * scanning operations (minimum 1MB). Limits for the issuing -	 * phase are done per top-level vdev and are handled separately. +	 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). +	 * Limits for the issuing phase are done per top-level vdev and +	 * are handled separately.  	 */ -	scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * -	    dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20); +	scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, +	    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));  	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),  	    offsetof(scan_ds_t, sds_node)); @@ -584,6 +595,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)  	}  	spa_scan_stat_init(spa); +	vdev_scan_stat_init(spa->spa_root_vdev); +  	return (0);  } @@ -742,6 +755,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)  	scn->scn_last_checkpoint = 0;  	scn->scn_checkpointing = B_FALSE;  	spa_scan_stat_init(spa); +	vdev_scan_stat_init(spa->spa_root_vdev);  	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {  		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; @@ -2797,8 +2811,9 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)  }  static uint64_t -dsl_scan_count_data_disks(vdev_t *rvd) +dsl_scan_count_data_disks(spa_t *spa)  { +	vdev_t *rvd = spa->spa_root_vdev;  	uint64_t i, leaves = 0;  	for (i = 0; i < rvd->vdev_children; i++) { @@ -3638,6 +3653,16 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)  	}  	/* +	 * Disabled by default, set zfs_scan_report_txgs to report +	 * average performance over the last zfs_scan_report_txgs TXGs. +	 */ +	if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 && +	    tx->tx_txg % zfs_scan_report_txgs == 0) { +		scn->scn_issued_before_pass += spa->spa_scan_pass_issued; +		spa_scan_stat_init(spa); +	} + +	/*  	 * It is possible to switch from unsorted to sorted at any time,  	 * but afterwards the scan will remain sorted unless reloaded from  	 * a checkpoint after a reboot. @@ -3693,12 +3718,13 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)  		taskqid_t prefetch_tqid;  		/* -		 * Recalculate the max number of in-flight bytes for pool-wide -		 * scanning operations (minimum 1MB). Limits for the issuing -		 * phase are done per top-level vdev and are handled separately. +		 * Calculate the max number of in-flight bytes for pool-wide +		 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). +		 * Limits for the issuing phase are done per top-level vdev and +		 * are handled separately.  		 
*/ -		scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * -		    dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20); +		scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, +		    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));  		if (scnp->scn_ddt_bookmark.ddb_class <=  		    scnp->scn_ddt_class_max) { @@ -3759,6 +3785,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)  			if (scn->scn_is_sorted) {  				scn->scn_checkpointing = B_TRUE;  				scn->scn_clearing = B_TRUE; +				scn->scn_issued_before_pass += +				    spa->spa_scan_pass_issued; +				spa_scan_stat_init(spa);  			}  			zfs_dbgmsg("scan complete txg %llu",  			    (longlong_t)tx->tx_txg); @@ -4485,6 +4514,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW,  ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, INT, ZMOD_RW,  	"Tunable to adjust bias towards more filled segments during scans"); +ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW, +	"Tunable to report resilver performance over the last N txgs"); +  ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,  	"Process all resilvers immediately");  /* END CSTYLED */ diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index f67a4eb22a2d..139bb0acd277 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -444,7 +444,7 @@ mmp_write_uberblock(spa_t *spa)  	uint64_t offset;  	hrtime_t lock_acquire_time = gethrtime(); -	spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER); +	spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER);  	lock_acquire_time = gethrtime() - lock_acquire_time;  	if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))  		zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1ed79eed3e8b..81a6547896ac 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -33,6 +33,7 @@   * Copyright 2017 Joyent, Inc.   * Copyright (c) 2017, Intel Corporation.   * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.   */  /* @@ -150,7 +151,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {   * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that   * need to be handled with minimum delay.   */ -const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { +static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {  	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */  	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */  	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* READ */ @@ -1110,6 +1111,275 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)  	tqs->stqs_taskq = NULL;  } +#ifdef _KERNEL +/* + * The READ and WRITE rows of zio_taskqs are configurable at module load time + * by setting zio_taskq_read or zio_taskq_write. + * + * Example (the defaults for READ and WRITE) + *   zio_taskq_read='fixed,1,8 null scale null' + *   zio_taskq_write='batch fixed,1,5 scale fixed,1,5' + * + * Each sets the entire row at a time. + * + * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number + * of threads per taskq. + * + * 'null' can only be set on the high-priority queues (queue selection for + * high-priority queues will fall back to the regular queue if the high-pri + * is NULL. + */ +static const char *const modes[ZTI_NMODES] = { +	"fixed", "batch", "scale", "null" +}; + +/* Parse the incoming config string. 
Modifies cfg */ +static int +spa_taskq_param_set(zio_type_t t, char *cfg) +{ +	int err = 0; + +	zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; + +	char *next = cfg, *tok, *c; + +	/* +	 * Parse out each element from the string and fill `row`. The entire +	 * row has to be set at once, so any errors are flagged by just +	 * breaking out of this loop early. +	 */ +	uint_t q; +	for (q = 0; q < ZIO_TASKQ_TYPES; q++) { +		/* `next` is the start of the config */ +		if (next == NULL) +			break; + +		/* Eat up leading space */ +		while (isspace(*next)) +			next++; +		if (*next == '\0') +			break; + +		/* Mode ends at space or end of string */ +		tok = next; +		next = strchr(tok, ' '); +		if (next != NULL) *next++ = '\0'; + +		/* Parameters start after a comma */ +		c = strchr(tok, ','); +		if (c != NULL) *c++ = '\0'; + +		/* Match mode string */ +		uint_t mode; +		for (mode = 0; mode < ZTI_NMODES; mode++) +			if (strcmp(tok, modes[mode]) == 0) +				break; +		if (mode == ZTI_NMODES) +			break; + +		/* Invalid canary */ +		row[q].zti_mode = ZTI_NMODES; + +		/* Per-mode setup */ +		switch (mode) { + +		/* +		 * FIXED is parameterised: number of queues, and number of +		 * threads per queue. +		 */ +		case ZTI_MODE_FIXED: { +			/* No parameters? */ +			if (c == NULL || *c == '\0') +				break; + +			/* Find next parameter */ +			tok = c; +			c = strchr(tok, ','); +			if (c == NULL) +				break; + +			/* Take digits and convert */ +			unsigned long long nq; +			if (!(isdigit(*tok))) +				break; +			err = ddi_strtoull(tok, &tok, 10, &nq); +			/* Must succeed and also end at the next param sep */ +			if (err != 0 || tok != c) +				break; + +			/* Move past the comma */ +			tok++; +			/* Need another number */ +			if (!(isdigit(*tok))) +				break; +			/* Remember start to make sure we moved */ +			c = tok; + +			/* Take digits */ +			unsigned long long ntpq; +			err = ddi_strtoull(tok, &tok, 10, &ntpq); +			/* Must succeed, and moved forward */ +			if (err != 0 || tok == c || *tok != '\0') +				break; + +			/* +			 * sanity; zero queues/threads make no sense, and +			 * 16K is almost certainly more than anyone will ever +			 * need and avoids silly numbers like UINT32_MAX +			 */ +			if (nq == 0 || nq >= 16384 || +			    ntpq == 0 || ntpq >= 16384) +				break; + +			const zio_taskq_info_t zti = ZTI_P(ntpq, nq); +			row[q] = zti; +			break; +		} + +		case ZTI_MODE_BATCH: { +			const zio_taskq_info_t zti = ZTI_BATCH; +			row[q] = zti; +			break; +		} + +		case ZTI_MODE_SCALE: { +			const zio_taskq_info_t zti = ZTI_SCALE; +			row[q] = zti; +			break; +		} + +		case ZTI_MODE_NULL: { +			/* +			 * Can only null the high-priority queues; the general- +			 * purpose ones have to exist. +			 */ +			if (q != ZIO_TASKQ_ISSUE_HIGH && +			    q != ZIO_TASKQ_INTERRUPT_HIGH) +				break; + +			const zio_taskq_info_t zti = ZTI_NULL; +			row[q] = zti; +			break; +		} + +		default: +			break; +		} + +		/* Ensure we set a mode */ +		if (row[q].zti_mode == ZTI_NMODES) +			break; +	} + +	/* Didn't get a full row, fail */ +	if (q < ZIO_TASKQ_TYPES) +		return (SET_ERROR(EINVAL)); + +	/* Eat trailing space */ +	if (next != NULL) +		while (isspace(*next)) +			next++; + +	/* If there's anything left over then fail */ +	if (next != NULL && *next != '\0') +		return (SET_ERROR(EINVAL)); + +	/* Success! 
Copy it into the real config */ +	for (q = 0; q < ZIO_TASKQ_TYPES; q++) +		zio_taskqs[t][q] = row[q]; + +	return (0); +} + +static int +spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) +{ +	int pos = 0; + +	/* Build paramater string from live config */ +	const char *sep = ""; +	for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { +		const zio_taskq_info_t *zti = &zio_taskqs[t][q]; +		if (zti->zti_mode == ZTI_MODE_FIXED) +			pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, +			    modes[zti->zti_mode], zti->zti_count, +			    zti->zti_value); +		else +			pos += sprintf(&buf[pos], "%s%s", sep, +			    modes[zti->zti_mode]); +		sep = " "; +	} + +	if (add_newline) +		buf[pos++] = '\n'; +	buf[pos] = '\0'; + +	return (pos); +} + +#ifdef __linux__ +static int +spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) +{ +	char *cfg = kmem_strdup(val); +	int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); +	kmem_free(cfg, strlen(val)+1); +	return (-err); +} +static int +spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) +{ +	return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); +} + +static int +spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) +{ +	char *cfg = kmem_strdup(val); +	int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); +	kmem_free(cfg, strlen(val)+1); +	return (-err); +} +static int +spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) +{ +	return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); +} +#else +/* + * On FreeBSD load-time parameters can be set up before malloc() is available, + * so we have to do all the parsing work on the stack. + */ +#define	SPA_TASKQ_PARAM_MAX	(128) + +static int +spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) +{ +	char buf[SPA_TASKQ_PARAM_MAX]; +	int err; + +	(void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); +	err = sysctl_handle_string(oidp, buf, sizeof (buf), req); +	if (err || req->newptr == NULL) +		return (err); +	return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); +} + +static int +spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) +{ +	char buf[SPA_TASKQ_PARAM_MAX]; +	int err; + +	(void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); +	err = sysctl_handle_string(oidp, buf, sizeof (buf), req); +	if (err || req->newptr == NULL) +		return (err); +	return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); +} +#endif +#endif /* _KERNEL */ +  /*   * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.   * Note that a type may have multiple discrete taskqs to avoid lock contention @@ -6261,6 +6531,16 @@ spa_tryimport(nvlist_t *tryconfig)  		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;  	} +	/* +	 * spa_import() relies on a pool config fetched by spa_try_import() +	 * for spare/cache devices. Import flags are not passed to +	 * spa_tryimport(), which makes it return early due to a missing log +	 * device and missing retrieving the cache device and spare eventually. +	 * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch +	 * the correct configuration regardless of the missing log device. 
+	 */ +	spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; +  	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);  	/* @@ -6747,9 +7027,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,  		if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))  			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); -		if (dsl_scan_resilvering(spa_get_dsl(spa))) +		if (dsl_scan_resilvering(spa_get_dsl(spa)) || +		    dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {  			return (spa_vdev_exit(spa, NULL, txg,  			    ZFS_ERR_RESILVER_IN_PROGRESS)); +		}  	} else {  		if (vdev_rebuild_active(rvd))  			return (spa_vdev_exit(spa, NULL, txg, @@ -6987,7 +7269,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,   * Detach a device from a mirror or replacing vdev.   *   * If 'replace_done' is specified, only detach if the parent - * is a replacing vdev. + * is a replacing or a spare vdev.   */  int  spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) @@ -7294,6 +7576,10 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,  	    vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {  		mutex_exit(&vd->vdev_initialize_lock);  		return (SET_ERROR(ESRCH)); +	} else if (cmd_type == POOL_INITIALIZE_UNINIT && +	    vd->vdev_initialize_thread != NULL) { +		mutex_exit(&vd->vdev_initialize_lock); +		return (SET_ERROR(EBUSY));  	}  	switch (cmd_type) { @@ -7306,6 +7592,9 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,  	case POOL_INITIALIZE_SUSPEND:  		vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);  		break; +	case POOL_INITIALIZE_UNINIT: +		vdev_uninitialize(vd); +		break;  	default:  		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);  	} @@ -8210,7 +8499,8 @@ spa_async_thread(void *arg)  	 * If any devices are done replacing, detach them.  	 
*/  	if (tasks & SPA_ASYNC_RESILVER_DONE || -	    tasks & SPA_ASYNC_REBUILD_DONE) { +	    tasks & SPA_ASYNC_REBUILD_DONE || +	    tasks & SPA_ASYNC_DETACH_SPARE) {  		spa_vdev_resilver_done(spa);  	} @@ -9986,4 +10276,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT  ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW,  	"Whether extra ALLOC blkptrs were added to a livelist entry while it "  	"was being condensed"); + +#ifdef _KERNEL +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, +	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD, +	"Configure IO queues for read IO"); +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, +	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD, +	"Configure IO queues for write IO"); +#endif  /* END CSTYLED */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index a57f0727db31..113943026d59 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -494,8 +494,9 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)  	return (1);  } -void -spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +static void +spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, +    int mmp_flag)  {  	(void) tag;  	int wlocks_held = 0; @@ -510,7 +511,8 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)  			continue;  		mutex_enter(&scl->scl_lock);  		if (rw == RW_READER) { -			while (scl->scl_writer || scl->scl_write_wanted) { +			while (scl->scl_writer || +			    (!mmp_flag && scl->scl_write_wanted)) {  				cv_wait(&scl->scl_cv, &scl->scl_lock);  			}  		} else { @@ -529,6 +531,27 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)  }  void +spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +{ +	spa_config_enter_impl(spa, locks, tag, rw, 0); +} + +/* + * The spa_config_enter_mmp() allows the mmp thread to cut in front of + * outstanding write lock requests. This is needed since the mmp updates are + * time sensitive and failure to service them promptly will result in a + * suspended pool. This pool suspension has been seen in practice when there is + * a single disk in a pool that is responding slowly and presumably about to + * fail. + */ + +void +spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) +{ +	spa_config_enter_impl(spa, locks, tag, rw, 1); +} + +void  spa_config_exit(spa_t *spa, int locks, const void *tag)  {  	(void) tag; @@ -2564,7 +2587,6 @@ spa_scan_stat_init(spa_t *spa)  	spa->spa_scan_pass_scrub_spent_paused = 0;  	spa->spa_scan_pass_exam = 0;  	spa->spa_scan_pass_issued = 0; -	vdev_scan_stat_init(spa->spa_root_vdev);  }  /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 4b9d7e7c0506..57259b8ce88e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -28,7 +28,7 @@   * Copyright 2017 Joyent, Inc.   * Copyright (c) 2017, Intel Corporation.   * Copyright (c) 2019, Datto Inc. All rights reserved. - * Copyright [2021] Hewlett Packard Enterprise Development LP + * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.   */  #include <sys/zfs_context.h> @@ -2646,6 +2646,17 @@ vdev_reopen(vdev_t *vd)  	}  	/* +	 * Recheck if resilver is still needed and cancel any +	 * scheduled resilver if resilver is unneeded. 
+	 */ +	if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && +	    spa->spa_async_tasks & SPA_ASYNC_RESILVER) { +		mutex_enter(&spa->spa_async_lock); +		spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; +		mutex_exit(&spa->spa_async_lock); +	} + +	/*  	 * Reassess parent vdev's health.  	 */  	vdev_propagate_state(vd); @@ -3983,11 +3994,18 @@ vdev_remove_wanted(spa_t *spa, uint64_t guid)  		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));  	/* -	 * If the vdev is already removed, then don't do anything. +	 * If the vdev is already removed, or expanding which can trigger +	 * repartition add/remove events, then don't do anything.  	 */ -	if (vd->vdev_removed) +	if (vd->vdev_removed || vd->vdev_expanding)  		return (spa_vdev_state_exit(spa, NULL, 0)); +	/* +	 * Confirm the vdev has been removed, otherwise don't do anything. +	 */ +	if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) +		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); +  	vd->vdev_remove_wanted = B_TRUE;  	spa_async_request(spa, SPA_ASYNC_REMOVE); @@ -4085,9 +4103,19 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)  	if (wasoffline ||  	    (oldstate < VDEV_STATE_DEGRADED && -	    vd->vdev_state >= VDEV_STATE_DEGRADED)) +	    vd->vdev_state >= VDEV_STATE_DEGRADED)) {  		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); +		/* +		 * Asynchronously detach spare vdev if resilver or +		 * rebuild is not required +		 */ +		if (vd->vdev_unspare && +		    !dsl_scan_resilvering(spa->spa_dsl_pool) && +		    !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && +		    !vdev_rebuild_active(tvd)) +			spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); +	}  	return (spa_vdev_state_exit(spa, vd, 0));  } diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 8762855d46aa..9e4c115f212c 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -270,7 +270,7 @@ typedef struct indirect_split {  	 */  	indirect_child_t *is_good_child; -	indirect_child_t is_child[1]; /* variable-length */ +	indirect_child_t is_child[];  } indirect_split_t;  /* diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 6ffd0d618fdd..5d90fd67cc2f 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -101,6 +101,39 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)  }  static void +vdev_initialize_zap_remove_sync(void *arg, dmu_tx_t *tx) +{ +	uint64_t guid = *(uint64_t *)arg; + +	kmem_free(arg, sizeof (uint64_t)); + +	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); +	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) +		return; + +	ASSERT3S(vd->vdev_initialize_state, ==, VDEV_INITIALIZE_NONE); +	ASSERT3U(vd->vdev_leaf_zap, !=, 0); + +	vd->vdev_initialize_last_offset = 0; +	vd->vdev_initialize_action_time = 0; + +	objset_t *mos = vd->vdev_spa->spa_meta_objset; +	int error; + +	error = zap_remove(mos, vd->vdev_leaf_zap, +	    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, tx); +	VERIFY(error == 0 || error == ENOENT); + +	error = zap_remove(mos, vd->vdev_leaf_zap, +	    VDEV_LEAF_ZAP_INITIALIZE_STATE, tx); +	VERIFY(error == 0 || error == ENOENT); + +	error = zap_remove(mos, vd->vdev_leaf_zap, +	    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, tx); +	VERIFY(error == 0 || error == ENOENT); +} + +static void  vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)  {  	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); @@ -127,8 +160,14 @@ 
vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)  	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);  	VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); -	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, -	    guid, tx); + +	if (new_state != VDEV_INITIALIZE_NONE) { +		dsl_sync_task_nowait(spa_get_dsl(spa), +		    vdev_initialize_zap_update_sync, guid, tx); +	} else { +		dsl_sync_task_nowait(spa_get_dsl(spa), +		    vdev_initialize_zap_remove_sync, guid, tx); +	}  	switch (new_state) {  	case VDEV_INITIALIZE_ACTIVE: @@ -149,6 +188,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)  		spa_history_log_internal(spa, "initialize", tx,  		    "vdev=%s complete", vd->vdev_path);  		break; +	case VDEV_INITIALIZE_NONE: +		spa_history_log_internal(spa, "uninitialize", tx, +		    "vdev=%s", vd->vdev_path); +		break;  	default:  		panic("invalid state %llu", (unsigned long long)new_state);  	} @@ -605,6 +648,24 @@ vdev_initialize(vdev_t *vd)  }  /* + * Uninitializes a device. Caller must hold vdev_initialize_lock. + * Device must be a leaf and not already be initializing. + */ +void +vdev_uninitialize(vdev_t *vd) +{ +	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); +	ASSERT(vd->vdev_ops->vdev_op_leaf); +	ASSERT(vdev_is_concrete(vd)); +	ASSERT3P(vd->vdev_initialize_thread, ==, NULL); +	ASSERT(!vd->vdev_detached); +	ASSERT(!vd->vdev_initialize_exit_wanted); +	ASSERT(!vd->vdev_top->vdev_removing); + +	vdev_initialize_change_state(vd, VDEV_INITIALIZE_NONE); +} + +/*   * Wait for the initialize thread to be terminated (cancelled or stopped).   */  static void @@ -760,6 +821,7 @@ vdev_initialize_restart(vdev_t *vd)  }  EXPORT_SYMBOL(vdev_initialize); +EXPORT_SYMBOL(vdev_uninitialize);  EXPORT_SYMBOL(vdev_initialize_stop);  EXPORT_SYMBOL(vdev_initialize_stop_all);  EXPORT_SYMBOL(vdev_initialize_stop_wait); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index ec6bbc6fc610..277c14ec1ad7 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -468,6 +468,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,  	if (vd->vdev_isspare)  		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); +	if (flags & VDEV_CONFIG_L2CACHE) +		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); +  	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&  	    vd == vd->vdev_top) {  		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, @@ -1100,6 +1103,16 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)  		    POOL_STATE_L2CACHE) == 0);  		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,  		    vd->vdev_guid) == 0); + +		/* +		 * This is merely to facilitate reporting the ashift of the +		 * cache device through zdb. The actual retrieval of the +		 * ashift (in vdev_alloc()) uses the nvlist +		 * spa->spa_l2cache->sav_config (populated in +		 * spa_ld_open_aux_vdevs()). +		 */ +		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, +		    vd->vdev_ashift) == 0);  	} else {  		uint64_t txg = 0ULL; diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 9dfbe0cf6f30..b180fa14682e 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -34,6 +34,7 @@  #include <sys/zio.h>  #include <sys/dmu_tx.h>  #include <sys/arc.h> +#include <sys/arc_impl.h>  #include <sys/zap.h>  /* @@ -116,13 +117,12 @@ unsigned long zfs_rebuild_max_segment = 1024 * 1024;   * segment size is also large (zfs_rebuild_max_segment=1M).  
This helps keep   * the queue depth short.   * - * 32MB was selected as the default value to achieve good performance with - * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential - * rebuild was unable to saturate all of the drives using smaller values. - * With a value of 32MB the sequential resilver write rate was measured at - * 800MB/s sustained while rebuilding to a distributed spare. + * 64MB was observed to deliver the best performance and set as the default. + * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c) + * and a rebuild rate of 1.2GB/s was measured to the distribute spare. + * Smaller values were unable to fully saturate the available pool I/O.   */ -unsigned long zfs_rebuild_vdev_limit = 32 << 20; +unsigned long zfs_rebuild_vdev_limit = 64 << 20;  /*   * Automatically start a pool scrub when the last active sequential resilver @@ -754,6 +754,7 @@ vdev_rebuild_thread(void *arg)  {  	vdev_t *vd = arg;  	spa_t *spa = vd->vdev_spa; +	vdev_t *rvd = spa->spa_root_vdev;  	int error = 0;  	/* @@ -786,9 +787,6 @@ vdev_rebuild_thread(void *arg)  	vr->vr_pass_bytes_scanned = 0;  	vr->vr_pass_bytes_issued = 0; -	vr->vr_bytes_inflight_max = MAX(1ULL << 20, -	    zfs_rebuild_vdev_limit * vd->vdev_children); -  	uint64_t update_est_time = gethrtime();  	vdev_rebuild_update_bytes_est(vd, 0); @@ -805,6 +803,17 @@ vdev_rebuild_thread(void *arg)  		vr->vr_scan_msp = msp;  		/* +		 * Calculate the max number of in-flight bytes for top-level +		 * vdev scanning operations (minimum 1MB, maximum 1/4 of +		 * arc_c_max shared by all top-level vdevs).  Limits for the +		 * issuing phase are done per top-level vdev and are handled +		 * separately. +		 */ +		uint64_t limit = (arc_c_max / 4) / MAX(rvd->vdev_children, 1); +		vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20, +		    zfs_rebuild_vdev_limit * vd->vdev_children)); + +		/*  		 * Removal of vdevs from the vdev tree may eliminate the need  		 * for the rebuild, in which case it should be canceled.  The  		 * vdev_rebuild_cancel_wanted flag is set until the sync task diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 92daed48f3d5..c0ce2ac28dc5 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -23,6 +23,7 @@   * Copyright (c) 2016 by Delphix. All rights reserved.   * Copyright (c) 2019 by Lawrence Livermore National Security, LLC.   * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + * Copyright 2023 RackTop Systems, Inc.   */  #include <sys/spa.h> @@ -572,6 +573,7 @@ vdev_trim_ranges(trim_args_t *ta)  	uint64_t extent_bytes_max = ta->trim_extent_bytes_max;  	uint64_t extent_bytes_min = ta->trim_extent_bytes_min;  	spa_t *spa = vd->vdev_spa; +	int error = 0;  	ta->trim_start_time = gethrtime();  	ta->trim_bytes_done = 0; @@ -591,19 +593,32 @@ vdev_trim_ranges(trim_args_t *ta)  		uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;  		for (uint64_t w = 0; w < writes_required; w++) { -			int error; -  			error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +  			    rs_get_start(rs, ta->trim_tree) +  			    (w *extent_bytes_max), MIN(size -  			    (w * extent_bytes_max), extent_bytes_max));  			if (error != 0) { -				return (error); +				goto done;  			}  		}  	} -	return (0); +done: +	/* +	 * Make sure all TRIMs for this metaslab have completed before +	 * returning. TRIM zios have lower priority over regular or syncing +	 * zios, so all TRIM zios for this metaslab must complete before the +	 * metaslab is re-enabled. 
Otherwise it's possible write zios to +	 * this metaslab could cut ahead of still queued TRIM zios for this +	 * metaslab causing corruption if the ranges overlap. +	 */ +	mutex_enter(&vd->vdev_trim_io_lock); +	while (vd->vdev_trim_inflight[0] > 0) { +		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); +	} +	mutex_exit(&vd->vdev_trim_io_lock); + +	return (error);  }  static void @@ -922,11 +937,6 @@ vdev_trim_thread(void *arg)  	}  	spa_config_exit(spa, SCL_CONFIG, FTAG); -	mutex_enter(&vd->vdev_trim_io_lock); -	while (vd->vdev_trim_inflight[0] > 0) { -		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); -	} -	mutex_exit(&vd->vdev_trim_io_lock);  	range_tree_destroy(ta.trim_tree); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index a4b391cbea12..f441328f3018 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3985,7 +3985,8 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)  	if (!(cmd_type == POOL_INITIALIZE_CANCEL ||  	    cmd_type == POOL_INITIALIZE_START || -	    cmd_type == POOL_INITIALIZE_SUSPEND)) { +	    cmd_type == POOL_INITIALIZE_SUSPEND || +	    cmd_type == POOL_INITIALIZE_UNINIT)) {  		return (SET_ERROR(EINVAL));  	} diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index b9498d17ee2f..0987fd0f7bb7 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -68,7 +68,9 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)  	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {  		ZFS_ENTER(zfsvfs);  		ZFS_VERIFY_ZP(zp); +		atomic_inc_32(&zp->z_sync_writes_cnt);  		zil_commit(zfsvfs->z_log, zp->z_id); +		atomic_dec_32(&zp->z_sync_writes_cnt);  		ZFS_EXIT(zfsvfs);  	}  	tsd_set(zfs_fsyncer_key, NULL); @@ -102,7 +104,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)  		hole = B_FALSE;  	/* Flush any mmap()'d data to disk */ -	if (zn_has_cached_data(zp)) +	if (zn_has_cached_data(zp, 0, file_sz - 1))  		zn_flush_cached_data(zp, B_FALSE);  	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); @@ -275,7 +277,8 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)  			error = mappedread_sf(zp, nbytes, uio);  		else  #endif -		if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { +		if (zn_has_cached_data(zp, zfs_uio_offset(uio), +		    zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) {  			error = mappedread(zp, nbytes, uio);  		} else {  			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), @@ -686,7 +689,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)  			zfs_uioskip(uio, nbytes);  			tx_bytes = nbytes;  		} -		if (tx_bytes && zn_has_cached_data(zp) && +		if (tx_bytes && +		    zn_has_cached_data(zp, woff, woff + tx_bytes - 1) &&  		    !(ioflag & O_DIRECT)) {  			update_pages(zp, woff, tx_bytes, zfsvfs->z_os);  		} diff --git a/module/zfs/zil.c b/module/zfs/zil.c index aaf509a2fc73..a4f7c008935d 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -226,11 +226,10 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)   */  static int  zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, -    blkptr_t *nbp, void *dst, char **end) +    blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf)  {  	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;  	arc_flags_t aflags = ARC_FLAG_WAIT; -	arc_buf_t *abuf = NULL;  	zbookmark_phys_t zb;  	int error; @@ -247,7 +246,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,  	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);  	
error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, -	    &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); +	    abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);  	if (error == 0) {  		zio_cksum_t cksum = bp->blk_cksum; @@ -262,23 +261,23 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,  		 */  		cksum.zc_word[ZIL_ZC_SEQ]++; +		uint64_t size = BP_GET_LSIZE(bp);  		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { -			zil_chain_t *zilc = abuf->b_data; +			zil_chain_t *zilc = (*abuf)->b_data;  			char *lr = (char *)(zilc + 1); -			uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);  			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, -			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { +			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || +			    zilc->zc_nused < sizeof (*zilc) || +			    zilc->zc_nused > size) {  				error = SET_ERROR(ECKSUM);  			} else { -				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); -				bcopy(lr, dst, len); -				*end = (char *)dst + len; +				*begin = lr; +				*end = lr + zilc->zc_nused - sizeof (*zilc);  				*nbp = zilc->zc_next_blk;  			}  		} else { -			char *lr = abuf->b_data; -			uint64_t size = BP_GET_LSIZE(bp); +			char *lr = (*abuf)->b_data;  			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;  			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, @@ -286,15 +285,11 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,  			    (zilc->zc_nused > (size - sizeof (*zilc)))) {  				error = SET_ERROR(ECKSUM);  			} else { -				ASSERT3U(zilc->zc_nused, <=, -				    SPA_OLD_MAXBLOCKSIZE); -				bcopy(lr, dst, zilc->zc_nused); -				*end = (char *)dst + zilc->zc_nused; +				*begin = lr; +				*end = lr + zilc->zc_nused;  				*nbp = zilc->zc_next_blk;  			}  		} - -		arc_buf_destroy(abuf, &abuf);  	}  	return (error); @@ -362,7 +357,6 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,  	uint64_t blk_count = 0;  	uint64_t lr_count = 0;  	blkptr_t blk, next_blk; -	char *lrbuf, *lrp;  	int error = 0;  	bzero(&next_blk, sizeof (blkptr_t)); @@ -382,13 +376,13 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,  	 * If the log has been claimed, stop if we encounter a sequence  	 * number greater than the highest claimed sequence number.  	 
*/ -	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);  	zil_bp_tree_init(zilog);  	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {  		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];  		int reclen; -		char *end = NULL; +		char *lrp, *end; +		arc_buf_t *abuf = NULL;  		if (blk_seq > claim_blk_seq)  			break; @@ -404,8 +398,10 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,  			break;  		error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, -		    lrbuf, &end); +		    &lrp, &end, &abuf);  		if (error != 0) { +			if (abuf) +				arc_buf_destroy(abuf, &abuf);  			if (claimed) {  				char name[ZFS_MAX_DATASET_NAME_LEN]; @@ -418,20 +414,25 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,  			break;  		} -		for (lrp = lrbuf; lrp < end; lrp += reclen) { +		for (; lrp < end; lrp += reclen) {  			lr_t *lr = (lr_t *)lrp;  			reclen = lr->lrc_reclen;  			ASSERT3U(reclen, >=, sizeof (lr_t)); -			if (lr->lrc_seq > claim_lr_seq) +			if (lr->lrc_seq > claim_lr_seq) { +				arc_buf_destroy(abuf, &abuf);  				goto done; +			}  			error = parse_lr_func(zilog, lr, arg, txg); -			if (error != 0) +			if (error != 0) { +				arc_buf_destroy(abuf, &abuf);  				goto done; +			}  			ASSERT3U(max_lr_seq, <, lr->lrc_seq);  			max_lr_seq = lr->lrc_seq;  			lr_count++;  		} +		arc_buf_destroy(abuf, &abuf);  	}  done:  	zilog->zl_parse_error = error; @@ -441,7 +442,6 @@ done:  	zilog->zl_parse_lr_count = lr_count;  	zil_bp_tree_fini(zilog); -	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);  	return (error);  } @@ -1593,6 +1593,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)  		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);  		ASSERT3U(wsz, <=, lwb->lwb_sz);  		zio_shrink(lwb->lwb_write_zio, wsz); +		wsz = lwb->lwb_write_zio->io_size;  	} else {  		wsz = lwb->lwb_sz; @@ -2848,7 +2849,14 @@ static void  zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)  {  	dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); -	VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + +	/* +	 * Since we are not going to create any new dirty data, and we +	 * can even help with clearing the existing dirty data, we +	 * should not be subject to the dirty data based delays. We +	 * use TXG_NOTHROTTLE to bypass the delay mechanism. +	 */ +	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));  	itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));  	itx->itx_sync = B_TRUE; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 700f8791045f..c367ef7211aa 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2287,7 +2287,7 @@ zio_nowait(zio_t *zio)  	ASSERT3P(zio->io_executor, ==, NULL);  	if (zio->io_child_type == ZIO_CHILD_LOGICAL && -	    zio_unique_parent(zio) == NULL) { +	    list_is_empty(&zio->io_parent_list)) {  		zio_t *pio;  		/* | 
