Diffstat (limited to 'sys/contrib/openzfs/module/zfs/zio.c')
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio.c | 326
1 file changed, 221 insertions, 105 deletions
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 6d7bce8b0e10..218aec6093e2 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -692,7 +692,7 @@ error:
zio->io_error = SET_ERROR(EIO);
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(spa, &zio->io_bookmark,
- BP_GET_LOGICAL_BIRTH(zio->io_bp));
+ BP_GET_PHYSICAL_BIRTH(zio->io_bp));
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, &zio->io_bookmark, zio, 0);
}
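
/*
 * A minimal sketch of the birth-time encoding assumed by the
 * BP_GET_PHYSICAL_BIRTH() changes in this patch (the real accessors
 * live in the spa headers): the stored physical birth is zero whenever
 * it equals the logical birth, so the derived accessor falls back to
 * the logical birth, while BP_GET_RAW_PHYSICAL_BIRTH() returns the
 * stored field verbatim; that is why the bp-copying hunks below use
 * the RAW form.
 */
static inline uint64_t
sketch_physical_birth(const blkptr_t *bp)
{
	uint64_t raw = BP_GET_RAW_PHYSICAL_BIRTH(bp);
	return (raw != 0 ? raw : BP_GET_LOGICAL_BIRTH(bp));
}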
@@ -850,15 +850,9 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
mutex_enter(&pio->io_lock);
if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
*errorp = zio_worst_error(*errorp, zio->io_error);
- pio->io_reexecute |= zio->io_reexecute;
+ pio->io_post |= zio->io_post;
ASSERT3U(*countp, >, 0);
- /*
- * Propogate the Direct I/O checksum verify failure to the parent.
- */
- if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
- pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
-
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
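
/*
 * A sketch of the io_post bits assumed throughout this patch. They
 * replace the old io_reexecute field and the ZIO_FLAG_DIO_CHKSUM_ERR
 * flag with a single post-processing mask that the hunk above
 * propagates to the parent unconditionally, which is what lets the
 * hand-rolled Direct I/O checksum-error propagation be deleted. The
 * real definition lives in the zio headers; the bit values here are
 * illustrative.
 */
typedef enum zio_post {
	ZIO_POST_REEXECUTE	= 1 << 0,	/* retry the I/O from scratch */
	ZIO_POST_SUSPEND	= 1 << 1,	/* suspend, resume when healthy */
	ZIO_POST_DIO_CHKSUM_ERR	= 1 << 2,	/* Direct I/O verify failed */
} zio_post_t;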
@@ -1110,7 +1104,8 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
"DVA[1]=%#llx/%#llx "
"DVA[2]=%#llx/%#llx "
"prop=%#llx "
- "pad=%#llx,%#llx "
+ "prop2=%#llx "
+ "pad=%#llx "
"phys_birth=%#llx "
"birth=%#llx "
"fill=%#llx "
@@ -1123,9 +1118,9 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
(long long)bp->blk_dva[2].dva_word[0],
(long long)bp->blk_dva[2].dva_word[1],
(long long)bp->blk_prop,
- (long long)bp->blk_pad[0],
- (long long)bp->blk_pad[1],
- (long long)BP_GET_PHYSICAL_BIRTH(bp),
+ (long long)bp->blk_prop2,
+ (long long)bp->blk_pad,
+ (long long)BP_GET_RAW_PHYSICAL_BIRTH(bp),
(long long)BP_GET_LOGICAL_BIRTH(bp),
(long long)bp->blk_fill,
(long long)bp->blk_cksum.zc_word[0],
@@ -1340,7 +1335,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
{
zio_t *zio;
- zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp,
+ zio = zio_create(pio, spa, BP_GET_PHYSICAL_BIRTH(bp), bp,
data, size, size, done, private,
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
@@ -1649,7 +1644,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
* through the mirror during self healing. See comment in
* vdev_mirror_io_done() for more details.
*/
- ASSERT0(pio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
+ ASSERT0(pio->io_post & ZIO_POST_DIO_CHKSUM_ERR);
} else if (type == ZIO_TYPE_WRITE &&
pio->io_prop.zp_direct_write == B_TRUE) {
/*
@@ -1685,7 +1680,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
* If this is a retried I/O then we ignore it since we will
* have already processed the original allocating I/O.
*/
- if (flags & ZIO_FLAG_IO_ALLOCATING &&
+ if (flags & ZIO_FLAG_ALLOC_THROTTLED &&
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
ASSERT(pio->io_metaslab_class != NULL);
ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
@@ -1695,7 +1690,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
pio->io_child_type == ZIO_CHILD_GANG);
- flags &= ~ZIO_FLAG_IO_ALLOCATING;
+ flags &= ~ZIO_FLAG_ALLOC_THROTTLED;
}
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
@@ -1860,7 +1855,7 @@ zio_write_bp_init(zio_t *zio)
blkptr_t *bp = zio->io_bp;
zio_prop_t *zp = &zio->io_prop;
- ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg);
+ ASSERT(BP_GET_BIRTH(bp) != zio->io_txg);
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
@@ -1948,7 +1943,7 @@ zio_write_compress(zio_t *zio)
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
ASSERT(zio->io_bp_override == NULL);
- if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) {
+ if (!BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) == zio->io_txg) {
/*
* We're rewriting an existing block, which means we're
* working on behalf of spa_sync(). For spa_sync() to
@@ -2085,7 +2080,7 @@ zio_write_compress(zio_t *zio)
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
- if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg &&
+ if (!BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) == zio->io_txg &&
BP_GET_PSIZE(bp) == psize &&
pass >= zfs_sync_pass_rewrite) {
VERIFY3U(psize, !=, 0);
@@ -2602,7 +2597,7 @@ zio_reexecute(void *arg)
pio->io_flags = pio->io_orig_flags;
pio->io_stage = pio->io_orig_stage;
pio->io_pipeline = pio->io_orig_pipeline;
- pio->io_reexecute = 0;
+ pio->io_post = 0;
pio->io_flags |= ZIO_FLAG_REEXECUTED;
pio->io_pipeline_trace = 0;
pio->io_error = 0;
@@ -2749,11 +2744,14 @@ zio_resume_wait(spa_t *spa)
* being nearly full, it calls zio_write_gang_block() to construct the
* block from smaller fragments.
*
- * A gang block consists of a gang header (zio_gbh_phys_t) and up to
- * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
- * an indirect block: it's an array of block pointers. It consumes
- * only one sector and hence is allocatable regardless of fragmentation.
- * The gang header's bps point to its gang members, which hold the data.
+ * A gang block consists of a gang header and up to gbh_nblkptrs(size)
+ * gang members. The gang header is like an indirect block: it's an array
+ * of block pointers, though the header has a small tail (a zio_eck_t)
+ * that stores an embedded checksum. It is allocated using only a single
+ * sector as the requested size, and hence is allocatable regardless of
+ * fragmentation. Its size is determined by the smallest allocatable
+ * asize of the vdevs it was allocated on. The gang header's bps point
+ * to its gang members, which hold the data.
*
* Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
* as the verifier to ensure uniqueness of the SHA256 checksum.
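
/*
 * A sketch of the header layout math assumed above; the real
 * gbh_nblkptrs()/gbh_eck() accessors are defined in the zio headers.
 * The header is an array of block pointers with a zio_eck_t tail that
 * holds the embedded checksum:
 */
static inline uint64_t
sketch_gbh_nblkptrs(uint64_t gangblocksize)
{
	return ((gangblocksize - sizeof (zio_eck_t)) / sizeof (blkptr_t));
}

static inline zio_eck_t *
sketch_gbh_eck(void *gbh, uint64_t gangblocksize)
{
	return ((zio_eck_t *)((char *)gbh + gangblocksize) - 1);
}
/*
 * With 128-byte block pointers and a 40-byte tail, a 512-byte legacy
 * header yields the familiar 3 bps, and a 4K header yields 31.
 */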
@@ -2832,10 +2830,10 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
if (gn != NULL) {
abd_t *gbh_abd =
- abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+ abd_get_from_buf(gn->gn_gbh, gn->gn_gangblocksize);
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
- gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
- pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ gbh_abd, gn->gn_gangblocksize, zio_gang_issue_func_done,
+ NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark);
/*
* As we rewrite each gang header, the pipeline will compute
@@ -2906,14 +2904,16 @@ static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
static void zio_gang_tree_assemble_done(zio_t *zio);
static zio_gang_node_t *
-zio_gang_node_alloc(zio_gang_node_t **gnpp)
+zio_gang_node_alloc(zio_gang_node_t **gnpp, uint64_t gangblocksize)
{
zio_gang_node_t *gn;
ASSERT(*gnpp == NULL);
- gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
- gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
+ gn = kmem_zalloc(sizeof (*gn) +
+ (gbh_nblkptrs(gangblocksize) * sizeof (gn)), KM_SLEEP);
+ gn->gn_gangblocksize = gn->gn_allocsize = gangblocksize;
+ gn->gn_gbh = zio_buf_alloc(gangblocksize);
*gnpp = gn;
return (gn);
@@ -2924,11 +2924,12 @@ zio_gang_node_free(zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = *gnpp;
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++)
ASSERT(gn->gn_child[g] == NULL);
- zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
- kmem_free(gn, sizeof (*gn));
+ zio_buf_free(gn->gn_gbh, gn->gn_allocsize);
+ kmem_free(gn, sizeof (*gn) +
+ (gbh_nblkptrs(gn->gn_allocsize) * sizeof (gn)));
*gnpp = NULL;
}
@@ -2940,7 +2941,7 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
if (gn == NULL)
return;
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++)
zio_gang_tree_free(&gn->gn_child[g]);
zio_gang_node_free(gnpp);
@@ -2949,13 +2950,28 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
- zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
- abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+ uint64_t gangblocksize = UINT64_MAX;
+ if (spa_feature_is_active(gio->io_spa,
+ SPA_FEATURE_DYNAMIC_GANG_HEADER)) {
+ spa_config_enter(gio->io_spa, SCL_VDEV, FTAG, RW_READER);
+ for (int dva = 0; dva < BP_GET_NDVAS(bp); dva++) {
+ vdev_t *vd = vdev_lookup_top(gio->io_spa,
+ DVA_GET_VDEV(&bp->blk_dva[dva]));
+ uint64_t psize = vdev_gang_header_psize(vd);
+ gangblocksize = MIN(gangblocksize, psize);
+ }
+ spa_config_exit(gio->io_spa, SCL_VDEV, FTAG);
+ } else {
+ gangblocksize = SPA_OLD_GANGBLOCKSIZE;
+ }
+ ASSERT3U(gangblocksize, !=, UINT64_MAX);
+ zio_gang_node_t *gn = zio_gang_node_alloc(gnpp, gangblocksize);
+ abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, gangblocksize);
ASSERT(gio->io_gang_leader == gio);
ASSERT(BP_IS_GANG(bp));
- zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, gangblocksize,
zio_gang_tree_assemble_done, gn, gio->io_priority,
ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
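
/*
 * Condensed, the size selection above: the header is a single buffer
 * that must be readable through any DVA, and each copy may live on a
 * vdev with a different allocation granularity, so the smallest
 * per-vdev header size wins. vdev_gang_header_psize() is not part of
 * this diff; presumably it reports how large a header the vdev's
 * smallest allocatable block can hold.
 */
static uint64_t
sketch_min_gang_header_size(spa_t *spa, const blkptr_t *bp)
{
	uint64_t size = UINT64_MAX;
	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
		vdev_t *vd = vdev_lookup_top(spa,
		    DVA_GET_VDEV(&bp->blk_dva[d]));
		size = MIN(size, vdev_gang_header_psize(vd));
	}
	return (size);
}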
@@ -2978,13 +2994,17 @@ zio_gang_tree_assemble_done(zio_t *zio)
byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
- ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
- ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+ /*
+ * If this was an old-style gangblock, the gangblocksize should have
+ * been updated in zio_checksum_error to reflect that.
+ */
+ ASSERT3U(gbh_eck(gn->gn_gbh, gn->gn_gangblocksize)->zec_magic,
+ ==, ZEC_MAGIC);
abd_free(zio->io_abd);
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
- blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
+ blkptr_t *gbp = gbh_bp(gn->gn_gbh, g);
if (!BP_IS_GANG(gbp))
continue;
zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
@@ -3009,10 +3029,11 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
if (gn != NULL) {
- ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+ ASSERT3U(gbh_eck(gn->gn_gbh,
+ gn->gn_gangblocksize)->zec_magic, ==, ZEC_MAGIC);
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
- blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
+ blkptr_t *gbp = gbh_bp(gn->gn_gbh, g);
if (BP_IS_HOLE(gbp))
continue;
zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
@@ -3119,6 +3140,13 @@ zio_write_gang_done(zio_t *zio)
abd_free(zio->io_abd);
}
+static void
+zio_update_feature(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_feature_incr(spa, (spa_feature_t)(uintptr_t)arg, tx);
+}
+
static zio_t *
zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
{
@@ -3157,20 +3185,24 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
ASSERT(ZIO_HAS_ALLOCATOR(pio));
int flags = METASLAB_GANG_HEADER;
- if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ if (pio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(has_data);
flags |= METASLAB_ASYNC_ALLOC;
}
- error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+ uint64_t gangblocksize = SPA_OLD_GANGBLOCKSIZE;
+ uint64_t candidate = gangblocksize;
+ error = metaslab_alloc_range(spa, mc, gangblocksize, gangblocksize,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
- &pio->io_alloc_list, pio->io_allocator, pio);
+ &pio->io_alloc_list, pio->io_allocator, pio, &candidate);
if (error) {
pio->io_error = error;
return (pio);
}
+ if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER))
+ gangblocksize = candidate;
if (pio == gio) {
gnpp = &gio->io_gang_tree;
@@ -3179,23 +3211,24 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
ASSERT(pio->io_ready == zio_write_gang_member_ready);
}
- gn = zio_gang_node_alloc(gnpp);
+ gn = zio_gang_node_alloc(gnpp, gangblocksize);
gbh = gn->gn_gbh;
- memset(gbh, 0, SPA_GANGBLOCKSIZE);
- gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
+ memset(gbh, 0, gangblocksize);
+ gbh_abd = abd_get_from_buf(gbh, gangblocksize);
/*
* Create the gang header.
*/
- zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, gangblocksize,
zio_write_gang_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
zio_gang_inherit_allocator(pio, zio);
- if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ if (pio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) {
boolean_t more;
- VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies,
- zio, B_TRUE, &more));
+ VERIFY(metaslab_class_throttle_reserve(mc, zio->io_allocator,
+ gbh_copies, zio->io_size, B_TRUE, &more));
+ zio->io_flags |= ZIO_FLAG_ALLOC_THROTTLED;
}
/*
@@ -3203,7 +3236,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
* opportunistic allocations. If that fails to generate enough
* space, we fall back to normal zio_write calls for nested gang.
*/
- for (int g = 0; resid != 0; g++) {
+ int g;
+ boolean_t any_failed = B_FALSE;
+ for (g = 0; resid != 0; g++) {
flags &= METASLAB_ASYNC_ALLOC;
flags |= METASLAB_GANG_CHILD;
zp.zp_checksum = gio->io_prop.zp_checksum;
@@ -3224,9 +3259,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
uint64_t min_size = zio_roundup_alloc_size(spa,
- resid / (SPA_GBH_NBLKPTRS - g));
+ resid / (gbh_nblkptrs(gangblocksize) - g));
min_size = MIN(min_size, resid);
- bp = &gbh->zg_blkptr[g];
+ bp = &((blkptr_t *)gbh)[g];
zio_alloc_list_t cio_list;
metaslab_trace_init(&cio_list);
@@ -3236,6 +3271,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
flags, &cio_list, zio->io_allocator, NULL, &allocated_size);
boolean_t allocated = error == 0;
+ any_failed |= !allocated;
uint64_t psize = allocated ? MIN(resid, allocated_size) :
min_size;
@@ -3268,6 +3304,29 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
}
/*
+ * If we used more gang children than the old limit, we must already be
+ * using the new headers. No need to update anything, just move on.
+ *
+ * Otherwise, we might be in a case where we need to turn on the new
+ * feature, so we check that. We enable the new feature if we didn't
+ * manage to fit everything into 3 gang children and we could have
+ * written more than that.
+ */
+ if (g > gbh_nblkptrs(SPA_OLD_GANGBLOCKSIZE)) {
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_DYNAMIC_GANG_HEADER));
+ } else if (any_failed && candidate > SPA_OLD_GANGBLOCKSIZE &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) &&
+ !spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) {
+ dmu_tx_t *tx =
+ dmu_tx_create_assigned(spa->spa_dsl_pool, txg + 1);
+ dsl_sync_task_nowait(spa->spa_dsl_pool,
+ zio_update_feature,
+ (void *)SPA_FEATURE_DYNAMIC_GANG_HEADER, tx);
+ dmu_tx_commit(tx);
+ }
+
+ /*
* Set pio's pipeline to just wait for zio to finish.
*/
pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
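
/*
 * The activation rule above, condensed (gbh_nblkptrs() of the old
 * 512-byte header is 3): turn the feature on only when it would
 * actually have helped and it is legal to do so.
 */
boolean_t should_activate =
    any_failed &&				/* ran out of bp slots */
    candidate > SPA_OLD_GANGBLOCKSIZE &&	/* larger header was possible */
    spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) &&
    !spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER);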
@@ -3836,7 +3895,7 @@ zio_ddt_write(zio_t *zio)
* block and leave.
*/
if (have_dvas == 0) {
- ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
+ ASSERT(BP_GET_BIRTH(bp) == txg);
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
ddt_phys_extend(ddp, v, bp);
ddt_phys_addref(ddp, v);
@@ -3864,6 +3923,23 @@ zio_ddt_write(zio_t *zio)
* then we can just use them as-is.
*/
if (have_dvas >= need_dvas) {
+ /*
+ * For rewrite operations, try preserving the original
+ * logical birth time. If the result matches the
+ * original BP, this becomes a NOP.
+ */
+ if (zp->zp_rewrite) {
+ uint64_t orig_logical_birth =
+ BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig);
+ ddt_bp_fill(ddp, v, bp, orig_logical_birth);
+ if (BP_EQUAL(bp, &zio->io_bp_orig)) {
+ /* We can skip accounting. */
+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
+ ddt_exit(ddt);
+ return (zio);
+ }
+ }
+
ddt_bp_fill(ddp, v, bp, txg);
ddt_phys_addref(ddp, v);
ddt_exit(ddt);
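
/*
 * Why the BP_EQUAL() shortcut above can fire: ddt_bp_fill() repopulates
 * the bp from the DVAs already recorded in the DDT entry, so once the
 * original logical birth is restored, a rewrite of an unchanged dedup
 * block reproduces io_bp_orig bit for bit; there is nothing to write
 * and the original bp already holds its DDT reference, hence
 * ZIO_FLAG_NOPWRITE with no additional accounting.
 */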
@@ -4078,9 +4154,11 @@ zio_io_to_allocate(metaslab_class_allocator_t *mca, boolean_t *more)
* reserve then we throttle.
*/
if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
- zio->io_prop.zp_copies, zio, B_FALSE, more)) {
+ zio->io_allocator, zio->io_prop.zp_copies, zio->io_size,
+ B_FALSE, more)) {
return (NULL);
}
+ zio->io_flags |= ZIO_FLAG_ALLOC_THROTTLED;
avl_remove(&mca->mca_tree, zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
@@ -4164,8 +4242,10 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_GANG);
memcpy(zio->io_bp->blk_dva, zio->io_bp_orig.blk_dva,
3 * sizeof (dva_t));
- BP_SET_BIRTH(zio->io_bp, BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig),
- BP_GET_PHYSICAL_BIRTH(&zio->io_bp_orig));
+ BP_SET_LOGICAL_BIRTH(zio->io_bp,
+ BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig));
+ BP_SET_PHYSICAL_BIRTH(zio->io_bp,
+ BP_GET_RAW_PHYSICAL_BIRTH(&zio->io_bp_orig));
return (zio);
}
@@ -4236,13 +4316,14 @@ again:
* If we are holding old class reservation, drop it.
* Dispatch the next ZIO(s) there if some are waiting.
*/
- if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) {
if (metaslab_class_throttle_unreserve(mc,
- zio->io_prop.zp_copies, zio)) {
+ zio->io_allocator, zio->io_prop.zp_copies,
+ zio->io_size)) {
zio_allocate_dispatch(zio->io_metaslab_class,
zio->io_allocator);
}
- zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
+ zio->io_flags &= ~ZIO_FLAG_ALLOC_THROTTLED;
}
if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
@@ -4291,6 +4372,15 @@ again:
error);
}
zio->io_error = error;
+ } else if (zio->io_prop.zp_rewrite) {
+ /*
+ * For rewrite operations, preserve the logical birth time
+ * but set the physical birth time to the current txg.
+ */
+ uint64_t logical_birth = BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig);
+ ASSERT3U(logical_birth, <=, zio->io_txg);
+ BP_SET_BIRTH(zio->io_bp, logical_birth, zio->io_txg);
+ BP_SET_REWRITE(zio->io_bp, 1);
}
return (zio);
@@ -4324,18 +4414,17 @@ zio_dva_claim(zio_t *zio)
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
- ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp));
+ ASSERT(BP_GET_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp));
ASSERT(zio->io_bp_override == NULL);
if (!BP_IS_HOLE(bp)) {
- metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp),
- B_TRUE);
+ metaslab_free(zio->io_spa, bp, BP_GET_BIRTH(bp), B_TRUE);
}
if (gn != NULL) {
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
zio_dva_unallocate(zio, gn->gn_child[g],
- &gn->gn_gbh->zg_blkptr[g]);
+ gbh_bp(gn->gn_gbh, g));
}
}
}
@@ -4347,7 +4436,7 @@ int
zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
uint64_t size, boolean_t *slog)
{
- int error = 1;
+ int error;
zio_alloc_list_t io_alloc_list;
ASSERT(txg > spa_syncing_txg(spa));
@@ -4372,14 +4461,34 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object)
% spa->spa_alloc_count;
ZIOSTAT_BUMP(ziostat_total_allocations);
+
+ /* Try log class (dedicated slog devices) first */
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
txg, NULL, flags, &io_alloc_list, allocator, NULL);
*slog = (error == 0);
+
+ /* Try special_embedded_log class (reserved on special vdevs) */
+ if (error != 0) {
+ error = metaslab_alloc(spa, spa_special_embedded_log_class(spa),
+ size, new_bp, 1, txg, NULL, flags, &io_alloc_list,
+ allocator, NULL);
+ }
+
+ /* Try special class (general special vdev allocation) */
+ if (error != 0) {
+ error = metaslab_alloc(spa, spa_special_class(spa), size,
+ new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
+ NULL);
+ }
+
+ /* Try embedded_log class (reserved on normal vdevs) */
if (error != 0) {
error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
NULL);
}
+
+ /* Finally fall back to normal class */
if (error != 0) {
ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
error = metaslab_alloc(spa, spa_normal_class(spa), size,
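
/*
 * The fallback chain above, condensed into its allocation-policy order.
 * Illustrative only; the real code spells each call out so it can bump
 * ziostat_alloc_class_fallbacks at the right point:
 *
 *	spa_log_class()				dedicated slog devices
 *	spa_special_embedded_log_class()	reserved on special vdevs
 *	spa_special_class()			general special vdev space
 *	spa_embedded_log_class()		reserved on normal vdevs
 *	spa_normal_class()			main pool storage
 *
 * The first class whose metaslab_alloc() succeeds supplies the ZIL
 * block; *slog is set only if the log class itself succeeded.
 */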
@@ -4722,7 +4831,7 @@ zio_vdev_io_assess(zio_t *zio)
* If a Direct I/O operation has a checksum verify error then this I/O
* should not attempt to be issued again.
*/
- if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
+ if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) {
if (zio->io_type == ZIO_TYPE_WRITE) {
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
ASSERT3U(zio->io_error, ==, EIO);
@@ -5031,7 +5140,7 @@ zio_checksum_verify(zio_t *zio)
ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
}
- ASSERT0(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
+ ASSERT0(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR);
IMPLY(zio->io_flags & ZIO_FLAG_DIO_READ,
!(zio->io_flags & ZIO_FLAG_SPECULATIVE));
@@ -5040,7 +5149,7 @@ zio_checksum_verify(zio_t *zio)
if (error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
if (zio->io_flags & ZIO_FLAG_DIO_READ) {
- zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+ zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
zio_t *pio = zio_unique_parent(zio);
/*
* Any Direct I/O read that has a checksum
@@ -5090,7 +5199,7 @@ zio_dio_checksum_verify(zio_t *zio)
if ((error = zio_checksum_error(zio, NULL)) != 0) {
zio->io_error = error;
if (error == ECKSUM) {
- zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+ zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
zio_dio_chksum_verify_error_report(zio);
}
}
@@ -5115,7 +5224,7 @@ zio_checksum_verified(zio_t *zio)
void
zio_dio_chksum_verify_error_report(zio_t *zio)
{
- ASSERT(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
+ ASSERT(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR);
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
return;
@@ -5187,7 +5296,7 @@ zio_ready(zio_t *zio)
if (zio->io_ready) {
ASSERT(IO_IS_ALLOCATING(zio));
- ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg ||
+ ASSERT(BP_GET_BIRTH(bp) == zio->io_txg ||
BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
@@ -5202,7 +5311,7 @@ zio_ready(zio_t *zio)
if (zio->io_error != 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
- if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(zio->io_metaslab_class != NULL);
@@ -5213,8 +5322,8 @@ zio_ready(zio_t *zio)
* issue the next I/O to allocate.
*/
if (metaslab_class_throttle_unreserve(
- zio->io_metaslab_class, zio->io_prop.zp_copies,
- zio)) {
+ zio->io_metaslab_class, zio->io_allocator,
+ zio->io_prop.zp_copies, zio->io_size)) {
zio_allocate_dispatch(zio->io_metaslab_class,
zio->io_allocator);
}
@@ -5264,6 +5373,7 @@ zio_dva_throttle_done(zio_t *zio)
vdev_t *vd = zio->io_vd;
int flags = METASLAB_ASYNC_ALLOC;
const void *tag = pio;
+ uint64_t size = pio->io_size;
ASSERT3P(zio->io_bp, !=, NULL);
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
@@ -5273,16 +5383,19 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT3P(vd, ==, vd->vdev_top);
ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
- ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
+ ASSERT(zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED);
/*
* Parents of gang children can have two flavors -- ones that allocated
* the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that
* allocated the constituent blocks. The first use their parent as tag.
+ * We set the size to match the original allocation call for that case.
*/
if (pio->io_child_type == ZIO_CHILD_GANG &&
- (pio->io_flags & ZIO_FLAG_IO_REWRITE))
+ (pio->io_flags & ZIO_FLAG_IO_REWRITE)) {
tag = zio_unique_parent(pio);
+ size = SPA_OLD_GANGBLOCKSIZE;
+ }
ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG &&
(pio->io_flags & ZIO_FLAG_IO_REWRITE)));
@@ -5295,9 +5408,10 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id,
- pio->io_allocator, flags, pio->io_size, tag);
+ pio->io_allocator, flags, size, tag);
- if (metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio)) {
+ if (metaslab_class_throttle_unreserve(pio->io_metaslab_class,
+ pio->io_allocator, 1, pio->io_size)) {
zio_allocate_dispatch(zio->io_metaslab_class,
pio->io_allocator);
}
@@ -5328,7 +5442,7 @@ zio_done(zio_t *zio)
* write. We must do this since the allocation is performed
* by the logical I/O but the actual write is done by child I/Os.
*/
- if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
+ if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED &&
zio->io_child_type == ZIO_CHILD_VDEV)
zio_dva_throttle_done(zio);
@@ -5337,8 +5451,6 @@ zio_done(zio_t *zio)
ASSERT(zio->io_children[c][w] == 0);
if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
- ASSERT(zio->io_bp->blk_pad[0] == 0);
- ASSERT(zio->io_bp->blk_pad[1] == 0);
ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy,
sizeof (blkptr_t)) == 0 ||
(zio->io_bp == zio_unique_parent(zio)->io_bp));
@@ -5431,7 +5543,7 @@ zio_done(zio_t *zio)
*/
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
!vdev_is_dead(zio->io_vd) &&
- !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
+ !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) {
int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
if (ret != EALREADY) {
@@ -5446,14 +5558,14 @@ zio_done(zio_t *zio)
if ((zio->io_error == EIO || !(zio->io_flags &
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
- !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) &&
+ !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) &&
zio == zio->io_logical) {
/*
* For logical I/O requests, tell the SPA to log the
* error and generate a logical data ereport.
*/
spa_log_error(zio->io_spa, &zio->io_bookmark,
- BP_GET_LOGICAL_BIRTH(zio->io_bp));
+ BP_GET_PHYSICAL_BIRTH(zio->io_bp));
(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
}
@@ -5467,7 +5579,7 @@ zio_done(zio_t *zio)
*/
if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) &&
zio->io_prop.zp_dedup) {
- zio->io_reexecute |= ZIO_REEXECUTE_NOW;
+ zio->io_post |= ZIO_POST_REEXECUTE;
zio->io_prop.zp_dedup = B_FALSE;
}
/*
@@ -5479,11 +5591,11 @@ zio_done(zio_t *zio)
if (IO_IS_ALLOCATING(zio) &&
!(zio->io_flags & ZIO_FLAG_CANFAIL) &&
- !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
+ !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) {
if (zio->io_error != ENOSPC)
- zio->io_reexecute |= ZIO_REEXECUTE_NOW;
+ zio->io_post |= ZIO_POST_REEXECUTE;
else
- zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+ zio->io_post |= ZIO_POST_SUSPEND;
}
if ((zio->io_type == ZIO_TYPE_READ ||
@@ -5492,10 +5604,11 @@ zio_done(zio_t *zio)
zio->io_error == ENXIO &&
spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
- zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+ zio->io_post |= ZIO_POST_SUSPEND;
- if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
- zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+ if (!(zio->io_flags & ZIO_FLAG_CANFAIL) &&
+ !(zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND)))
+ zio->io_post |= ZIO_POST_SUSPEND;
/*
* Here is a possibly good place to attempt to do
@@ -5514,7 +5627,8 @@ zio_done(zio_t *zio)
*/
zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
- if ((zio->io_error || zio->io_reexecute) &&
+ if ((zio->io_error ||
+ (zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND))) &&
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
!(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
@@ -5525,16 +5639,16 @@ zio_done(zio_t *zio)
* Godfather I/Os should never suspend.
*/
if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
- (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
- zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
+ (zio->io_post & ZIO_POST_SUSPEND))
+ zio->io_post &= ~ZIO_POST_SUSPEND;
- if (zio->io_reexecute) {
+ if (zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND)) {
/*
* A Direct I/O operation that has a checksum verify error
* should not attempt to reexecute. Instead, the error should
* just be propagated back.
*/
- ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));
+ ASSERT0(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR);
/*
* This is a logical I/O that wants to reexecute.
@@ -5571,7 +5685,7 @@ zio_done(zio_t *zio)
pio_next = zio_walk_parents(zio, &zl);
if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
- (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
+ (zio->io_post & ZIO_POST_SUSPEND)) {
zio_remove_child(pio, zio, remove_zl);
/*
* This is a rare code path, so we don't
@@ -5595,13 +5709,14 @@ zio_done(zio_t *zio)
* "next_to_execute".
*/
zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
- } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
+ } else if (zio->io_post & ZIO_POST_SUSPEND) {
/*
* We'd fail again if we reexecuted now, so suspend
* until conditions improve (e.g. device comes online).
*/
zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
} else {
+ ASSERT(zio->io_post & ZIO_POST_REEXECUTE);
/*
* Reexecution is potentially a huge amount of work.
* Hand it off to the otherwise-unused claim taskq.
@@ -5614,7 +5729,8 @@ zio_done(zio_t *zio)
}
ASSERT(list_is_empty(&zio->io_child_list));
- ASSERT(zio->io_reexecute == 0);
+ ASSERT0(zio->io_post & ZIO_POST_REEXECUTE);
+ ASSERT0(zio->io_post & ZIO_POST_SUSPEND);
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
/*