aboutsummaryrefslogtreecommitdiff
path: root/module/zfs/vdev_rebuild.c
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs/vdev_rebuild.c')
-rw-r--r--module/zfs/vdev_rebuild.c55
1 files changed, 40 insertions, 15 deletions
diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c
index aa7ed24deaeb..6503390f7973 100644
--- a/module/zfs/vdev_rebuild.c
+++ b/module/zfs/vdev_rebuild.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -22,6 +22,7 @@
*
* Copyright (c) 2018, Intel Corporation.
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
*/
#include <sys/vdev_impl.h>
@@ -33,6 +34,7 @@
#include <sys/zio.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
+#include <sys/arc_impl.h>
#include <sys/zap.h>
/*
@@ -103,7 +105,7 @@
* Size of rebuild reads; defaults to 1MiB per data disk and is capped at
* SPA_MAXBLOCKSIZE.
*/
-static unsigned long zfs_rebuild_max_segment = 1024 * 1024;
+static uint64_t zfs_rebuild_max_segment = 1024 * 1024;
/*
* Maximum number of parallelly executed bytes per leaf vdev caused by a
@@ -115,13 +117,12 @@ static unsigned long zfs_rebuild_max_segment = 1024 * 1024;
* segment size is also large (zfs_rebuild_max_segment=1M). This helps keep
* the queue depth short.
*
- * 32MB was selected as the default value to achieve good performance with
- * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential
- * rebuild was unable to saturate all of the drives using smaller values.
- * With a value of 32MB the sequential resilver write rate was measured at
- * 800MB/s sustained while rebuilding to a distributed spare.
+ * 64MB was observed to deliver the best performance and set as the default.
+ * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c)
+ * and a rebuild rate of 1.2GB/s was measured to the distribute spare.
+ * Smaller values were unable to fully saturate the available pool I/O.
*/
-static unsigned long zfs_rebuild_vdev_limit = 32 << 20;
+static uint64_t zfs_rebuild_vdev_limit = 64 << 20;
/*
* Automatically start a pool scrub when the last active sequential resilver
@@ -134,6 +135,7 @@ static int zfs_rebuild_scrub_enabled = 1;
* For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
*/
static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg);
+static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx);
/*
* Clear the per-vdev rebuild bytes value for a vdev tree.
@@ -307,6 +309,17 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
mutex_enter(&vd->vdev_rebuild_lock);
+
+ /*
+ * Handle a second device failure if it occurs after all rebuild I/O
+ * has completed but before this sync task has been executed.
+ */
+ if (vd->vdev_rebuild_reset_wanted) {
+ mutex_exit(&vd->vdev_rebuild_lock);
+ vdev_rebuild_reset_sync(arg, tx);
+ return;
+ }
+
vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE;
vrp->vrp_end_time = gethrestime_sec();
@@ -558,8 +571,10 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
vdev_rebuild_blkptr_init(&blk, vd, start, size);
uint64_t psize = BP_GET_PSIZE(&blk);
- if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
+ if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) {
+ vr->vr_pass_bytes_skipped += size;
return (0);
+ }
mutex_enter(&vr->vr_io_lock);
@@ -741,6 +756,7 @@ vdev_rebuild_thread(void *arg)
{
vdev_t *vd = arg;
spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
int error = 0;
/*
@@ -760,7 +776,6 @@ vdev_rebuild_thread(void *arg)
ASSERT(vd->vdev_rebuilding);
ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD));
ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE);
- ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE);
vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
@@ -773,9 +788,7 @@ vdev_rebuild_thread(void *arg)
vr->vr_pass_start_time = gethrtime();
vr->vr_pass_bytes_scanned = 0;
vr->vr_pass_bytes_issued = 0;
-
- vr->vr_bytes_inflight_max = MAX(1ULL << 20,
- zfs_rebuild_vdev_limit * vd->vdev_children);
+ vr->vr_pass_bytes_skipped = 0;
uint64_t update_est_time = gethrtime();
vdev_rebuild_update_bytes_est(vd, 0);
@@ -793,6 +806,17 @@ vdev_rebuild_thread(void *arg)
vr->vr_scan_msp = msp;
/*
+ * Calculate the max number of in-flight bytes for top-level
+ * vdev scanning operations (minimum 1MB, maximum 1/2 of
+ * arc_c_max shared by all top-level vdevs). Limits for the
+ * issuing phase are done per top-level vdev and are handled
+ * separately.
+ */
+ uint64_t limit = (arc_c_max / 2) / MAX(rvd->vdev_children, 1);
+ vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20,
+ zfs_rebuild_vdev_limit * vd->vdev_children));
+
+ /*
* Removal of vdevs from the vdev tree may eliminate the need
* for the rebuild, in which case it should be canceled. The
* vdev_rebuild_cancel_wanted flag is set until the sync task
@@ -1132,16 +1156,17 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
vr->vr_pass_start_time);
vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned;
vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued;
+ vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped;
mutex_exit(&tvd->vdev_rebuild_lock);
}
return (error);
}
-ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, U64, ZMOD_RW,
"Max segment size in bytes of rebuild reads");
-ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, U64, ZMOD_RW,
"Max bytes in flight per leaf vdev for sequential resilvers");
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,