From 7420e614fe3019e7af744adc5a1677b9a87c5d77 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 2 Aug 2018 23:40:28 +0000 Subject: Fix build after r337196 mismerge. --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/cddl/contrib') diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c index e7f30d4a6fda..48675909365b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -3552,7 +3552,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; int d; - count_block(dp->dp_blkstats, bp); + count_block(scn, dp->dp_blkstats, bp); if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) -- cgit v1.2.3 From f58c851d32650a377fc4bdaeac5df8121bbb9abb Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 3 Aug 2018 02:16:45 +0000 Subject: Reduce taskq and context-switch cost of zio pipe When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the logical zio_read(), and then a physical zio. Currently, each of these results in a separate taskq_dispatch(zio_execute). On high-read-iops workloads, this causes a significant performance impact. By processing all 3 ZIO's in a single taskq entry, we reduce the overhead on taskq locking and context switching. We accomplish this by allowing zio_done() to return a "next zio to execute" to zio_execute(). This results in a ~12% performance increase for random reads, from 96,000 iops to 108,000 iops (with recordsize=8k, on SSD's). Reviewed by: Pavel Zakharov Reviewed-by: Brian Behlendorf Reviewed by: George Wilson Signed-off-by: Matthew Ahrens External-issue: DLPX-59292 Closes #7736 zfsonlinux/zfs@62840030a7dceaee013ddbcc1eebcfc7922edf7c --- .../opensolaris/uts/common/fs/zfs/sys/zio.h | 4 +- .../contrib/opensolaris/uts/common/fs/zfs/zio.c | 254 +++++++++++---------- 2 files changed, 139 insertions(+), 119 deletions(-) (limited to 'sys/cddl/contrib') diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index 5f377734bced..80a24b436a01 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -217,7 +217,7 @@ enum zio_child { #define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT) #define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL) #define ZIO_CHILD_ALL_BITS \ - (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ + (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT) enum zio_wait_type { @@ -356,7 +356,7 @@ typedef struct zio_transform { struct zio_transform *zt_next; } zio_transform_t; -typedef int zio_pipe_stage_t(zio_t *zio); +typedef zio_t *zio_pipe_stage_t(zio_t *zio); /* * The io_reexecute flags are distinct from io_flags because the child must diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index ccf745543c94..53d0f4d27b08 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -100,9 +100,6 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; extern vmem_t *zio_alloc_arena; #endif -#define ZIO_PIPELINE_CONTINUE 0x100 -#define ZIO_PIPELINE_STOP 0x101 - #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul @@ -539,7 +536,8 @@ zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) } static void -zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) +zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, + zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; @@ -558,13 +556,33 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); + /* - * Dispatch the parent zio in its own taskq so that - * the child can continue to make progress. This also - * prevents overflowing the stack when we have deeply nested - * parent-child relationships. + * If we can tell the caller to execute this parent next, do + * so. Otherwise dispatch the parent zio as its own task. + * + * Having the caller execute the parent when possible reduces + * locking on the zio taskq's, reduces context switch + * overhead, and has no recursion penalty. Note that one + * read from disk typically causes at least 3 zio's: a + * zio_null(), the logical zio_read(), and then a physical + * zio. When the physical ZIO completes, we are able to call + * zio_done() on all 3 of these zio's from one invocation of + * zio_execute() by returning the parent back to + * zio_execute(). Since the parent isn't executed until this + * thread returns back to zio_execute(), the caller should do + * so promptly. + * + * In other cases, dispatching the parent prevents + * overflowing the stack when we have deeply nested + * parent-child relationships, as we do with the "mega zio" + * of writes for spa_sync(), and the chain of ZIL blocks. */ - zio_taskq_dispatch(pio, type, B_FALSE); + if (next_to_executep != NULL && *next_to_executep == NULL) { + *next_to_executep = pio; + } else { + zio_taskq_dispatch(pio, type, B_FALSE); + } } else { mutex_exit(&pio->io_lock); } @@ -1275,7 +1293,7 @@ zio_shrink(zio_t *zio, uint64_t size) * ========================================================================== */ -static int +static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -1312,14 +1330,14 @@ zio_read_bp_init(zio_t *zio) if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); @@ -1334,7 +1352,7 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (BP_IS_EMBEDDED(bp)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); /* * If we've been overridden and nopwrite is set then @@ -1345,13 +1363,13 @@ zio_write_bp_init(zio_t *zio) ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) - return (ZIO_PIPELINE_CONTINUE); + return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); @@ -1359,7 +1377,7 @@ zio_write_bp_init(zio_t *zio) if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -1371,10 +1389,10 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline = zio->io_orig_pipeline; } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -1393,11 +1411,11 @@ zio_write_compress(zio_t *zio) */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } if (!IO_IS_ALLOCATING(zio)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); if (zio->io_children_ready != NULL) { /* @@ -1456,7 +1474,7 @@ zio_write_compress(zio_t *zio) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } else { /* * Round up compressed size up to the ashift @@ -1544,10 +1562,10 @@ zio_write_compress(zio_t *zio) zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -1559,7 +1577,7 @@ zio_free_bp_init(zio_t *zio) ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -1633,12 +1651,12 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q) return (B_FALSE); } -static int +static zio_t * zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (ZIO_PIPELINE_STOP); + return (NULL); } void @@ -1720,14 +1738,13 @@ static zio_pipe_stage_t *zio_pipeline[]; void zio_execute(zio_t *zio) { - zio->io_executor = curthread; - ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; - int rv; + + zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); @@ -1758,12 +1775,16 @@ zio_execute(zio_t *zio) zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; - rv = zio_pipeline[highbit64(stage) - 1](zio); - if (rv == ZIO_PIPELINE_STOP) - return; + /* + * The zio pipeline stage returns the next zio to execute + * (typically the same as this one), or NULL if we should + * stop. + */ + zio = zio_pipeline[highbit64(stage) - 1](zio); - ASSERT(rv == ZIO_PIPELINE_CONTINUE); + if (zio == NULL) + return; } } @@ -2226,7 +2247,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, zio_nowait(zio); } -static int +static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -2238,16 +2259,16 @@ zio_gang_assemble(zio_t *zio) zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); @@ -2261,7 +2282,7 @@ zio_gang_issue(zio_t *zio) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } static void @@ -2300,7 +2321,7 @@ zio_write_gang_done(zio_t *zio) abd_put(zio->io_abd); } -static int +static zio_t * zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; @@ -2359,7 +2380,7 @@ zio_write_gang_block(zio_t *pio) gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; - return (ZIO_PIPELINE_CONTINUE); + return (pio); } if (pio == gio) { @@ -2426,7 +2447,7 @@ zio_write_gang_block(zio_t *pio) zio_nowait(zio); - return (ZIO_PIPELINE_CONTINUE); + return (pio); } /* @@ -2447,7 +2468,7 @@ zio_write_gang_block(zio_t *pio) * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ -static int +static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -2474,7 +2495,7 @@ zio_nop_write(zio_t *zio) BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); /* * If the checksums match then reset the pipeline so that we @@ -2494,7 +2515,7 @@ zio_nop_write(zio_t *zio) zio->io_flags |= ZIO_FLAG_NOPWRITE; } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -2522,7 +2543,7 @@ zio_ddt_child_read_done(zio_t *zio) mutex_exit(&pio->io_lock); } -static int +static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -2542,7 +2563,7 @@ zio_ddt_read_start(zio_t *zio) zio->io_vsd = dde; if (ddp_self == NULL) - return (ZIO_PIPELINE_CONTINUE); + return (zio); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) @@ -2555,23 +2576,23 @@ zio_ddt_read_start(zio_t *zio) zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } ASSERT(BP_GET_DEDUP(bp)); @@ -2583,12 +2604,12 @@ zio_ddt_read_done(zio_t *zio) ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (ZIO_PIPELINE_STOP); + return (NULL); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, @@ -2601,7 +2622,7 @@ zio_ddt_read_done(zio_t *zio) ASSERT(zio->io_vsd == NULL); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } static boolean_t @@ -2759,7 +2780,7 @@ zio_ddt_ditto_write_done(zio_t *zio) ddt_exit(ddt); } -static int +static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -2803,7 +2824,7 @@ zio_ddt_write(zio_t *zio) ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); @@ -2829,7 +2850,7 @@ zio_ddt_write(zio_t *zio) zio->io_bp_override = NULL; BP_ZERO(bp); ddt_exit(ddt); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, @@ -2871,12 +2892,12 @@ zio_ddt_write(zio_t *zio) if (dio) zio_nowait(dio); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ddt_entry_t *freedde; /* for debugging */ -static int +static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -2894,7 +2915,7 @@ zio_ddt_free(zio_t *zio) ddt_phys_decref(ddp); ddt_exit(ddt); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -2932,7 +2953,7 @@ zio_io_to_allocate(spa_t *spa, int allocator) return (zio); } -static int +static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -2942,7 +2963,7 @@ zio_dva_throttle(zio_t *zio) !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ASSERT(zio->io_child_type > ZIO_CHILD_GANG); @@ -2968,22 +2989,7 @@ zio_dva_throttle(zio_t *zio) nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator); mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); - if (nio == zio) - return (ZIO_PIPELINE_CONTINUE); - - if (nio != NULL) { - ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); - /* - * We are passing control to a new zio so make sure that - * it is processed by a different thread. We do this to - * avoid stack overflows that can occur when parents are - * throttled and children are making progress. We allow - * it to go to the head of the taskq since it's already - * been waiting. - */ - zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE); - } - return (ZIO_PIPELINE_STOP); + return (nio); } void @@ -3002,7 +3008,7 @@ zio_allocate_dispatch(spa_t *spa, int allocator) zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } -static int +static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -3045,18 +3051,18 @@ zio_dva_allocate(zio_t *zio) zio->io_error = error; } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_dva_claim(zio_t *zio) { int error; @@ -3065,7 +3071,7 @@ zio_dva_claim(zio_t *zio) if (error) zio->io_error = error; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -3160,7 +3166,7 @@ zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp, * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ -static int +static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -3179,13 +3185,13 @@ zio_vdev_io_start(zio_t *zio) * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); - return (ZIO_PIPELINE_STOP); + return (NULL); } if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && zio->io_priority == ZIO_PRIORITY_NOW) { trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ASSERT3P(zio->io_logical, !=, zio); @@ -3299,24 +3305,24 @@ zio_vdev_io_start(zio_t *zio) !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } if (vd->vdev_ops->vdev_op_leaf) { switch (zio->io_type) { case ZIO_TYPE_READ: if (vdev_cache_read(zio)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); /* FALLTHROUGH */ case ZIO_TYPE_WRITE: case ZIO_TYPE_FREE: if ((zio = vdev_queue_io(zio)) == NULL) - return (ZIO_PIPELINE_STOP); + return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); - return (ZIO_PIPELINE_STOP); + return (NULL); } break; } @@ -3328,14 +3334,14 @@ zio_vdev_io_start(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE && !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && !trim_map_write_start(zio)) - return (ZIO_PIPELINE_STOP); + return (NULL); } vd->vdev_ops->vdev_op_io_start(zio); - return (ZIO_PIPELINE_STOP); + return (NULL); } -static int +static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -3343,7 +3349,7 @@ zio_vdev_io_done(zio_t *zio) boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || @@ -3386,7 +3392,7 @@ zio_vdev_io_done(zio_t *zio) if (unexpected_error) VERIFY(vdev_probe(vd, zio) == NULL); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -3444,13 +3450,13 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) zcr->zcr_free = zio_buf_free; } -static int +static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) @@ -3496,7 +3502,7 @@ zio_vdev_io_assess(zio_t *zio) zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); - return (ZIO_PIPELINE_STOP); + return (NULL); } /* @@ -3536,7 +3542,7 @@ zio_vdev_io_assess(zio_t *zio) zio->io_physdone(zio->io_logical); } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } void @@ -3571,7 +3577,7 @@ zio_vdev_io_bypass(zio_t *zio) * Generate and verify checksums * ========================================================================== */ -static int +static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -3585,7 +3591,7 @@ zio_checksum_generate(zio_t *zio) checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) - return (ZIO_PIPELINE_CONTINUE); + return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { @@ -3599,10 +3605,10 @@ zio_checksum_generate(zio_t *zio) zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; @@ -3617,7 +3623,7 @@ zio_checksum_verify(zio_t *zio) * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) - return (ZIO_PIPELINE_CONTINUE); + return (zio); ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); } @@ -3632,7 +3638,7 @@ zio_checksum_verify(zio_t *zio) } } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -3675,7 +3681,7 @@ zio_worst_error(int e1, int e2) * I/O completion * ========================================================================== */ -static int +static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -3684,7 +3690,7 @@ zio_ready(zio_t *zio) if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } if (zio->io_ready) { @@ -3730,7 +3736,7 @@ zio_ready(zio_t *zio) */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); - zio_notify_parent(pio, zio, ZIO_WAIT_READY); + zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { @@ -3746,7 +3752,7 @@ zio_ready(zio_t *zio) zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -3810,7 +3816,7 @@ zio_dva_throttle_done(zio_t *zio) zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } -static int +static zio_t * zio_done(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -3827,7 +3833,7 @@ zio_done(zio_t *zio) * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } /* @@ -4041,7 +4047,12 @@ zio_done(zio_t *zio) if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); - zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + /* + * This is a rare code path, so we don't + * bother with "next_to_execute". + */ + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, + NULL); } } @@ -4053,7 +4064,11 @@ zio_done(zio_t *zio) */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; - zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + /* + * This is a rare code path, so we don't bother with + * "next_to_execute". + */ + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend @@ -4074,7 +4089,7 @@ zio_done(zio_t *zio) ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); } - return (ZIO_PIPELINE_STOP); + return (NULL); } ASSERT(zio->io_child_count == 0); @@ -4104,12 +4119,17 @@ zio_done(zio_t *zio) zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); + /* + * We are done executing this zio. We may want to execute a parent + * next. See the comment in zio_notify_parent(). + */ + zio_t *next_to_execute = NULL; zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); - zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); } if (zio->io_waiter != NULL) { @@ -4121,7 +4141,7 @@ zio_done(zio_t *zio) zio_destroy(zio); } - return (ZIO_PIPELINE_STOP); + return (next_to_execute); } /* -- cgit v1.2.3 From 648cfe57fd8b4a4505383515dbcfd9185ad29484 Mon Sep 17 00:00:00 2001 From: Matt Macy Date: Fri, 10 Aug 2018 06:42:08 +0000 Subject: Performance optimization of AVL tree comparator functions MFV: commit ee36c709c3d5f7040e1bd11f5c75318aa03e789f Author: Gvozden Neskovic Date: Sat Aug 27 20:12:53 2016 +0200 perf: 2.75x faster ddt_entry_compare() First 256bits of ddt_key_t is a block checksum, which are expected to be close to random data. Hence, on average, comparison only needs to look at first few bytes of the keys. To reduce number of conditional jump instructions, the result is computed as: sign(memcmp(k1, k2)). Sign of an integer 'a' can be obtained as: `(0 < a) - (a < 0)` := {-1, 0, 1} , which is computed efficiently. Synthetic performance evaluation of original and new algorithm over 1G random keys on 2.6GHz Intel(R) Xeon(R) CPU E5-2660 v3: old 6.85789 s new 2.49089 s perf: 2.8x faster vdev_queue_offset_compare() and vdev_queue_timestamp_compare() Compute the result directly instead of using conditionals perf: zfs_range_compare() Speedup between 1.1x - 2.5x, depending on compiler version and optimization level. perf: spa_error_entry_compare() `bcmp()` is not suitable for comparator use. Use `memcmp()` instead. perf: 2.8x faster metaslab_compare() and metaslab_rangesize_compare() perf: 2.8x faster zil_bp_compare() perf: 2.8x faster mze_compare() perf: faster dbuf_compare() perf: faster compares in spa_misc perf: 2.8x faster layout_hash_compare() perf: 2.8x faster space_reftree_compare() perf: libzfs: faster avl tree comparators perf: guid_compare() perf: dsl_deadlist_compare() perf: perm_set_compare() perf: 2x faster range_tree_seg_compare() perf: faster unique_compare() perf: faster vdev_cache _compare() perf: faster vdev_uberblock_compare() perf: faster fuid _compare() perf: faster zfs_znode_hold_compare() Signed-off-by: Gvozden Neskovic Signed-off-by: Richard Elling Signed-off-by: Brian Behlendorf Closes #5033 --- .../opensolaris/uts/common/fs/zfs/dmu_send.c | 10 ++----- .../contrib/opensolaris/uts/common/fs/zfs/dnode.c | 26 +++++----------- .../opensolaris/uts/common/fs/zfs/dsl_deadlist.c | 11 ++----- .../opensolaris/uts/common/fs/zfs/dsl_deleg.c | 9 +++--- .../opensolaris/uts/common/fs/zfs/metaslab.c | 35 +++++++--------------- .../contrib/opensolaris/uts/common/fs/zfs/sa.c | 28 +++++++---------- .../contrib/opensolaris/uts/common/fs/zfs/spa.c | 13 +++----- .../opensolaris/uts/common/fs/zfs/spa_misc.c | 20 ++++--------- .../opensolaris/uts/common/fs/zfs/space_reftree.c | 18 ++++------- .../uts/common/fs/zfs/sys/zfs_context.h | 3 ++ .../contrib/opensolaris/uts/common/fs/zfs/unique.c | 12 +++----- .../opensolaris/uts/common/fs/zfs/vdev_label.c | 16 ++++------ .../opensolaris/uts/common/fs/zfs/vdev_queue.c | 17 ++++------- .../opensolaris/uts/common/fs/zfs/zap_micro.c | 14 ++++----- .../opensolaris/uts/common/fs/zfs/zfs_fuid.c | 19 +++++------- .../opensolaris/uts/common/fs/zfs/zfs_rlock.c | 12 +++----- .../contrib/opensolaris/uts/common/fs/zfs/zil.c | 28 ++++------------- sys/cddl/contrib/opensolaris/uts/common/sys/avl.h | 8 +++++ 18 files changed, 102 insertions(+), 197 deletions(-) (limited to 'sys/cddl/contrib') diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 09b8c954ea40..25c1fec0c146 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -1906,14 +1906,10 @@ typedef struct guid_map_entry { static int guid_compare(const void *arg1, const void *arg2) { - const guid_map_entry_t *gmep1 = arg1; - const guid_map_entry_t *gmep2 = arg2; + const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; + const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; - if (gmep1->guid < gmep2->guid) - return (-1); - else if (gmep1->guid > gmep2->guid) - return (1); - return (0); + return (AVL_CMP(gmep1->guid, gmep2->guid)); } static void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index c64f68545f71..4d72991b5ef6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -78,19 +78,13 @@ dbuf_compare(const void *x1, const void *x2) const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; - if (d1->db_level < d2->db_level) { - return (-1); - } - if (d1->db_level > d2->db_level) { - return (1); - } + int cmp = AVL_CMP(d1->db_level, d2->db_level); + if (likely(cmp)) + return (cmp); - if (d1->db_blkid < d2->db_blkid) { - return (-1); - } - if (d1->db_blkid > d2->db_blkid) { - return (1); - } + cmp = AVL_CMP(d1->db_blkid, d2->db_blkid); + if (likely(cmp)) + return (cmp); if (d1->db_state == DB_SEARCH) { ASSERT3S(d2->db_state, !=, DB_SEARCH); @@ -100,13 +94,7 @@ dbuf_compare(const void *x1, const void *x2) return (1); } - if ((uintptr_t)d1 < (uintptr_t)d2) { - return (-1); - } - if ((uintptr_t)d1 > (uintptr_t)d2) { - return (1); - } - return (0); + return (AVL_PCMP(d1, d2)); } /* ARGSUSED */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c index 356e5b51c3f4..2f3647bc8e86 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c @@ -55,15 +55,10 @@ static int dsl_deadlist_compare(const void *arg1, const void *arg2) { - const dsl_deadlist_entry_t *dle1 = arg1; - const dsl_deadlist_entry_t *dle2 = arg2; + const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1; + const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2; - if (dle1->dle_mintxg < dle2->dle_mintxg) - return (-1); - else if (dle1->dle_mintxg > dle2->dle_mintxg) - return (+1); - else - return (0); + return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); } static void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c index 7870b4951b29..0ad658f910ec 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c @@ -384,14 +384,13 @@ typedef struct perm_set { static int perm_set_compare(const void *arg1, const void *arg2) { - const perm_set_t *node1 = arg1; - const perm_set_t *node2 = arg2; + const perm_set_t *node1 = (const perm_set_t *)arg1; + const perm_set_t *node2 = (const perm_set_t *)arg2; int val; val = strcmp(node1->p_setname, node2->p_setname); - if (val == 0) - return (0); - return (val > 0 ? 1 : -1); + + return (AVL_ISIGN(val)); } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index c562675b1aec..e374cd356792 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -541,8 +541,8 @@ metaslab_class_expandable_space(metaslab_class_t *mc) static int metaslab_compare(const void *x1, const void *x2) { - const metaslab_t *m1 = x1; - const metaslab_t *m2 = x2; + const metaslab_t *m1 = (const metaslab_t *)x1; + const metaslab_t *m2 = (const metaslab_t *)x2; int sort1 = 0; int sort2 = 0; @@ -568,22 +568,13 @@ metaslab_compare(const void *x1, const void *x2) if (sort1 > sort2) return (1); - if (m1->ms_weight < m2->ms_weight) - return (1); - if (m1->ms_weight > m2->ms_weight) - return (-1); - - /* - * If the weights are identical, use the offset to force uniqueness. - */ - if (m1->ms_start < m2->ms_start) - return (-1); - if (m1->ms_start > m2->ms_start) - return (1); + int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); + if (likely(cmp)) + return (cmp); - ASSERT3P(m1, ==, m2); + IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); - return (0); + return (AVL_CMP(m1->ms_start, m2->ms_start)); } /* @@ -1195,18 +1186,14 @@ metaslab_rangesize_compare(const void *x1, const void *x2) uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; - if (rs_size1 < rs_size2) - return (-1); - if (rs_size1 > rs_size2) - return (1); + int cmp = AVL_CMP(rs_size1, rs_size2); + if (likely(cmp)) + return (cmp); if (r1->rs_start < r2->rs_start) return (-1); - if (r1->rs_start > r2->rs_start) - return (1); - - return (0); + return (AVL_CMP(r1->rs_start, r2->rs_start)); } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c index dd6e90c7796b..50f3c0ad822f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c @@ -242,31 +242,23 @@ sa_cache_fini(void) static int layout_num_compare(const void *arg1, const void *arg2) { - const sa_lot_t *node1 = arg1; - const sa_lot_t *node2 = arg2; + const sa_lot_t *node1 = (const sa_lot_t *)arg1; + const sa_lot_t *node2 = (const sa_lot_t *)arg2; - if (node1->lot_num > node2->lot_num) - return (1); - else if (node1->lot_num < node2->lot_num) - return (-1); - return (0); + return (AVL_CMP(node1->lot_num, node2->lot_num)); } static int layout_hash_compare(const void *arg1, const void *arg2) { - const sa_lot_t *node1 = arg1; - const sa_lot_t *node2 = arg2; + const sa_lot_t *node1 = (const sa_lot_t *)arg1; + const sa_lot_t *node2 = (const sa_lot_t *)arg2; - if (node1->lot_hash > node2->lot_hash) - return (1); - if (node1->lot_hash < node2->lot_hash) - return (-1); - if (node1->lot_instance > node2->lot_instance) - return (1); - if (node1->lot_instance < node2->lot_instance) - return (-1); - return (0); + int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash); + if (likely(cmp)) + return (cmp); + + return (AVL_CMP(node1->lot_instance, node2->lot_instance)); } boolean_t diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index cbb064b7d201..a716592358ee 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -905,19 +905,14 @@ spa_change_guid(spa_t *spa) static int spa_error_entry_compare(const void *a, const void *b) { - spa_error_entry_t *sa = (spa_error_entry_t *)a; - spa_error_entry_t *sb = (spa_error_entry_t *)b; + const spa_error_entry_t *sa = (const spa_error_entry_t *)a; + const spa_error_entry_t *sb = (const spa_error_entry_t *)b; int ret; - ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, + ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); - if (ret < 0) - return (-1); - else if (ret > 0) - return (1); - else - return (0); + return (AVL_ISIGN(ret)); } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index 0c580ee27b3b..804518ce22a0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -1016,18 +1016,13 @@ typedef struct spa_aux { int aux_count; } spa_aux_t; -static int +static inline int spa_aux_compare(const void *a, const void *b) { - const spa_aux_t *sa = a; - const spa_aux_t *sb = b; + const spa_aux_t *sa = (const spa_aux_t *)a; + const spa_aux_t *sb = (const spa_aux_t *)b; - if (sa->aux_guid < sb->aux_guid) - return (-1); - else if (sa->aux_guid > sb->aux_guid) - return (1); - else - return (0); + return (AVL_CMP(sa->aux_guid, sb->aux_guid)); } void @@ -2053,11 +2048,8 @@ spa_name_compare(const void *a1, const void *a2) int s; s = strcmp(s1->spa_name, s2->spa_name); - if (s > 0) - return (1); - if (s < 0) - return (-1); - return (0); + + return (AVL_ISIGN(s)); } int diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c index a866e65d54f7..aa289ba1061d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c @@ -54,20 +54,14 @@ static int space_reftree_compare(const void *x1, const void *x2) { - const space_ref_t *sr1 = x1; - const space_ref_t *sr2 = x2; + const space_ref_t *sr1 = (const space_ref_t *)x1; + const space_ref_t *sr2 = (const space_ref_t *)x2; - if (sr1->sr_offset < sr2->sr_offset) - return (-1); - if (sr1->sr_offset > sr2->sr_offset) - return (1); + int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset); + if (likely(cmp)) + return (cmp); - if (sr1 < sr2) - return (-1); - if (sr1 > sr2) - return (1); - - return (0); + return (AVL_PCMP(sr1, sr2)); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h index 3ea0da4a1d33..04606bda48db 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h @@ -146,4 +146,7 @@ extern struct mtx zfs_debug_mtx; #define sys_shutdown rebooting +#define noinline __attribute__((noinline)) +#define likely(x) __builtin_expect((x), 1) + #endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c index fbe7b619a29a..d33f451938b8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c @@ -42,14 +42,10 @@ typedef struct unique { static int unique_compare(const void *a, const void *b) { - const unique_t *una = a; - const unique_t *unb = b; - - if (una->un_value < unb->un_value) - return (-1); - if (una->un_value > unb->un_value) - return (+1); - return (0); + const unique_t *una = (const unique_t *)a; + const unique_t *unb = (const unique_t *)b; + + return (AVL_CMP(una->un_value, unb->un_value)); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c index ec8a869dc8c4..d66fa4ef822f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -1047,19 +1047,13 @@ retry: * among uberblocks with equal txg, choose the one with the latest timestamp. */ static int -vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) +vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { - if (ub1->ub_txg < ub2->ub_txg) - return (-1); - if (ub1->ub_txg > ub2->ub_txg) - return (1); + int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); + if (likely(cmp)) + return (cmp); - if (ub1->ub_timestamp < ub2->ub_timestamp) - return (-1); - if (ub1->ub_timestamp > ub2->ub_timestamp) - return (1); - - return (0); + return (AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp)); } struct ubl_cbdata { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index 0d60dfa1ac64..78b725a37b68 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -311,20 +311,15 @@ sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS) int vdev_queue_offset_compare(const void *x1, const void *x2) { - const zio_t *z1 = x1; - const zio_t *z2 = x2; + const zio_t *z1 = (const zio_t *)x1; + const zio_t *z2 = (const zio_t *)x2; - if (z1->io_offset < z2->io_offset) - return (-1); - if (z1->io_offset > z2->io_offset) - return (1); + int cmp = AVL_CMP(z1->io_offset, z2->io_offset); - if (z1 < z2) - return (-1); - if (z1 > z2) - return (1); + if (likely(cmp)) + return (cmp); - return (0); + return (AVL_PCMP(z1, z2)); } static inline avl_tree_t * diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c index c18fb92830b1..50d5fc48f0c8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -281,15 +281,11 @@ mze_compare(const void *arg1, const void *arg2) const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; - if (mze1->mze_hash > mze2->mze_hash) - return (+1); - if (mze1->mze_hash < mze2->mze_hash) - return (-1); - if (mze1->mze_cd > mze2->mze_cd) - return (+1); - if (mze1->mze_cd < mze2->mze_cd) - return (-1); - return (0); + int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash); + if (likely(cmp)) + return (cmp); + + return (AVL_CMP(mze1->mze_cd, mze2->mze_cd)); } static int diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c index e74799a70fe0..581b6b1bfb64 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c @@ -71,14 +71,10 @@ static char *nulldomain = ""; static int idx_compare(const void *arg1, const void *arg2) { - const fuid_domain_t *node1 = arg1; - const fuid_domain_t *node2 = arg2; + const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; + const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; - if (node1->f_idx < node2->f_idx) - return (-1); - else if (node1->f_idx > node2->f_idx) - return (1); - return (0); + return (AVL_CMP(node1->f_idx, node2->f_idx)); } /* @@ -87,14 +83,13 @@ idx_compare(const void *arg1, const void *arg2) static int domain_compare(const void *arg1, const void *arg2) { - const fuid_domain_t *node1 = arg1; - const fuid_domain_t *node2 = arg2; + const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; + const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; int val; val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); - if (val == 0) - return (0); - return (val > 0 ? 1 : -1); + + return (AVL_ISIGN(val)); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c index b40bdbea123c..7743e81dd5f1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c @@ -594,12 +594,8 @@ zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) int zfs_range_compare(const void *arg1, const void *arg2) { - const rl_t *rl1 = arg1; - const rl_t *rl2 = arg2; - - if (rl1->r_off > rl2->r_off) - return (1); - if (rl1->r_off < rl2->r_off) - return (-1); - return (0); + const rl_t *rl1 = (const rl_t *)arg1; + const rl_t *rl2 = (const rl_t *)arg2; + + return (AVL_CMP(rl1->r_off, rl2->r_off)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index 1a03d920b367..8cc65d6f31f8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -131,17 +131,11 @@ zil_bp_compare(const void *x1, const void *x2) const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; - if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) - return (-1); - if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) - return (1); - - if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) - return (-1); - if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) - return (1); + int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + if (likely(cmp)) + return (cmp); - return (0); + return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void @@ -503,12 +497,7 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; - if (v1 < v2) - return (-1); - if (v1 > v2) - return (1); - - return (0); + return (AVL_CMP(v1, v2)); } static lwb_t * @@ -1626,12 +1615,7 @@ zil_aitx_compare(const void *x1, const void *x2) const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; - if (o1 < o2) - return (-1); - if (o1 > o2) - return (1); - - return (0); + return (AVL_CMP(o1, o2)); } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h index 10e0ddaeef88..fea46c90481d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h @@ -106,6 +106,14 @@ extern "C" { */ +/* + * AVL comparator helpers + */ +#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define AVL_PCMP(a, b) \ + (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) + /* * Type used for the root of the AVL tree. */ -- cgit v1.2.3 From 90df93417e2d4ee27bcb05ea7dbaf4115b2e0908 Mon Sep 17 00:00:00 2001 From: Matt Macy Date: Fri, 10 Aug 2018 23:42:11 +0000 Subject: ZFS/MFV: Use cached feature info in spa_add_feature_stats() commit 417104bdd3c7ce07ec58674dd078f9891c3bc780 Author: Ned Bass Date: Thu Feb 26 12:24:11 2015 -0800 Use cached feature info in spa_add_feature_stats() Avoid issuing I/O to the pool when retrieving feature flags information. Trying to read the ZAPs from disk means that zpool clear would hang if the pool is suspended and recovery would require a reboot. To keep the feature stats resident in memory, we hang a cached nvlist off of the spa. It is built up from disk the first time spa_add_feature_stats() is called, and refreshed thereafter using the cached feature reference counts. spa_add_feature_stats() gets called at pool import time so we can be sure the cached nvlist will be available if the pool is later suspended. Signed-off-by: Ned Bass Signed-off-by: Brian Behlendorf Closes #3082 --- .../contrib/opensolaris/uts/common/fs/zfs/spa.c | 66 ++++++++++++++++++---- .../opensolaris/uts/common/fs/zfs/spa_misc.c | 3 + .../opensolaris/uts/common/fs/zfs/sys/spa_impl.h | 2 + .../opensolaris/uts/common/fs/zfs/zfeature.c | 2 +- 4 files changed, 60 insertions(+), 13 deletions(-) (limited to 'sys/cddl/contrib') diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index a716592358ee..c8a635ae54f3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -4371,18 +4371,14 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) } static void -spa_add_feature_stats(spa_t *spa, nvlist_t *config) +spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) { - nvlist_t *features; zap_cursor_t zc; zap_attribute_t za; - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); - /* We may be unable to read features if pool is suspended. */ if (spa_suspended(spa)) - goto out; + return; if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, @@ -4391,7 +4387,7 @@ spa_add_feature_stats(spa_t *spa, nvlist_t *config) zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); - VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); @@ -4404,16 +4400,62 @@ spa_add_feature_stats(spa_t *spa, nvlist_t *config) zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); - VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } +} -out: - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, - features) == 0); - nvlist_free(features); +static void +spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) +{ + int i; + + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t feature = spa_feature_table[i]; + uint64_t refcount; + + if (feature_get_refcount(spa, &feature, &refcount) != 0) + continue; + + VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); + } +} + +/* + * Store a list of pool features and their reference counts in the + * config. + * + * The first time this is called on a spa, allocate a new nvlist, fetch + * the pool features and reference counts from disk, then save the list + * in the spa. In subsequent calls on the same spa use the saved nvlist + * and refresh its values from the cached reference counts. This + * ensures we don't block here on I/O on a suspended pool so 'zpool + * clear' can resume the pool. + */ +static void +spa_add_feature_stats(spa_t *spa, nvlist_t *config) +{ + nvlist_t *features; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + mutex_enter(&spa->spa_feat_stats_lock); + features = spa->spa_feat_stats; + + if (features != NULL) { + spa_feature_stats_from_cache(spa, features); + } else { + VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); + spa->spa_feat_stats = features; + spa_feature_stats_from_disk(spa, features); + } + + VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + features)); + + mutex_exit(&spa->spa_feat_stats_lock); } int diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index 804518ce22a0..2ceed8dd8040 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -708,6 +708,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -881,6 +882,7 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); + nvlist_free(spa->spa_feat_stats); spa_config_set(spa, NULL); #ifdef illumos @@ -922,6 +924,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); + mutex_destroy(&spa->spa_feat_stats_lock); kmem_free(spa, sizeof (spa_t)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index 007e5e170427..f69dde66dd9b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -360,6 +360,8 @@ struct spa { uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */ + kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */ + nvlist_t *spa_feat_stats; /* Cache of enabled features */ /* cache feature refcounts */ uint64_t spa_feat_refcount_cache[SPA_FEATURES]; #ifdef illumos diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c index 78b2912df1d6..76003e3544f4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c @@ -220,7 +220,7 @@ spa_features_check(spa_t *spa, boolean_t for_write, * * Note: well-designed features will not need to use this; they should * use spa_feature_is_enabled() and spa_feature_is_active() instead. - * However, this is non-static for zdb and zhack. + * However, this is non-static for zdb, zhack, and spa_add_feature_stats(). */ int feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res) -- cgit v1.2.3