diff options
Diffstat (limited to 'sys/contrib/openzfs/module/zfs/vdev_trim.c')
-rw-r--r-- | sys/contrib/openzfs/module/zfs/vdev_trim.c | 134 |
1 files changed, 98 insertions, 36 deletions
diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c index ed98df782918..9753d5a1ea04 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_trim.c +++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,9 +20,10 @@ */ /* - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved. * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + * Copyright 2023 RackTop Systems, Inc. */ #include <sys/spa.h> @@ -168,7 +169,8 @@ static boolean_t vdev_trim_should_stop(vdev_t *vd) { return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } /* @@ -179,10 +181,31 @@ vdev_autotrim_should_stop(vdev_t *tvd) { return (tvd->vdev_autotrim_exit_wanted || !vdev_writeable(tvd) || tvd->vdev_removing || + tvd->vdev_rz_expanding || spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); } /* + * Wait for given number of kicks, return true if the wait is aborted due to + * vdev_autotrim_exit_wanted. 
+ */ +static boolean_t +vdev_autotrim_wait_kick(vdev_t *vd, int num_of_kick) +{ + mutex_enter(&vd->vdev_autotrim_lock); + for (int i = 0; i < num_of_kick; i++) { + if (vd->vdev_autotrim_exit_wanted) + break; + cv_wait_idle(&vd->vdev_autotrim_kick_cv, + &vd->vdev_autotrim_lock); + } + boolean_t exit_wanted = vd->vdev_autotrim_exit_wanted; + mutex_exit(&vd->vdev_autotrim_lock); + + return (exit_wanted); +} + +/* * The sync task for updating the on-disk state of a manual TRIM. This * is scheduled by vdev_trim_change_state(). */ @@ -202,7 +225,8 @@ vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK]; @@ -572,6 +596,7 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t extent_bytes_max = ta->trim_extent_bytes_max; uint64_t extent_bytes_min = ta->trim_extent_bytes_min; spa_t *spa = vd->vdev_spa; + int error = 0; ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; @@ -591,19 +616,32 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1; for (uint64_t w = 0; w < writes_required; w++) { - int error; - error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + rs_get_start(rs, ta->trim_tree) + (w *extent_bytes_max), MIN(size - (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { - return (error); + goto done; } } } - return (0); +done: + /* + * Make sure all TRIMs for this metaslab have completed before + * returning. TRIM zios have lower priority than regular or syncing + * zios, so all TRIM zios for this metaslab must complete before the + * metaslab is re-enabled. 
Otherwise it's possible write zios to + * this metaslab could cut ahead of still queued TRIM zios for this + * metaslab causing corruption if the ranges overlap. + */ + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[0] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + return (error); } static void @@ -922,11 +960,6 @@ vdev_trim_thread(void *arg) } spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_enter(&vd->vdev_trim_io_lock); - while (vd->vdev_trim_inflight[0] > 0) { - cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); - } - mutex_exit(&vd->vdev_trim_io_lock); range_tree_destroy(ta.trim_tree); @@ -976,6 +1009,7 @@ vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_rz_expanding); vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure); vd->vdev_trim_thread = thread_create(NULL, 0, @@ -1114,7 +1148,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) void vdev_trim_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { @@ -1133,12 +1168,13 @@ vdev_trim_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_trim_action_time = timestamp; - if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_trim_load(vd)); } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_trim_thread == NULL) { VERIFY0(vdev_trim_load(vd)); vdev_trim(vd, 
vd->vdev_trim_rate, @@ -1188,12 +1224,10 @@ vdev_autotrim_thread(void *arg) mutex_exit(&vd->vdev_autotrim_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; - uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; - while (!vdev_autotrim_should_stop(vd)) { int txgs_per_trim = MAX(zfs_trim_txg_batch, 1); - boolean_t issued_trim = B_FALSE; + uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; + uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; /* * All of the metaslabs are divided into groups of size @@ -1225,6 +1259,8 @@ vdev_autotrim_thread(void *arg) i += txgs_per_trim) { metaslab_t *msp = vd->vdev_ms[i]; range_tree_t *trim_tree; + boolean_t issued_trim = B_FALSE; + boolean_t wait_aborted = B_FALSE; spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); @@ -1375,7 +1411,18 @@ vdev_autotrim_thread(void *arg) range_tree_vacate(trim_tree, NULL, NULL); range_tree_destroy(trim_tree); - metaslab_enable(msp, issued_trim, B_FALSE); + /* + * Wait for a couple of kicks to ensure the trim io is + * synced. If the wait is aborted due to + * vdev_autotrim_exit_wanted, we need to signal + * metaslab_enable() to wait for sync. + */ + if (issued_trim) { + wait_aborted = vdev_autotrim_wait_kick(vd, + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE); + } + + metaslab_enable(msp, wait_aborted, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (uint64_t c = 0; c < children; c++) { @@ -1389,17 +1436,14 @@ vdev_autotrim_thread(void *arg) } kmem_free(tap, sizeof (trim_args_t) * children); + + if (vdev_autotrim_should_stop(vd)) + break; } spa_config_exit(spa, SCL_CONFIG, FTAG); - /* - * After completing the group of metaslabs wait for the next - * open txg. This is done to make sure that a minimum of - * zfs_trim_txg_batch txgs will occur before these metaslabs - * are trimmed again. 
- */ - txg_wait_open(spa_get_dsl(spa), 0, issued_trim); + vdev_autotrim_wait_kick(vd, 1); shift++; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1455,7 +1499,8 @@ vdev_autotrim(spa_t *spa) mutex_enter(&tvd->vdev_autotrim_lock); if (vdev_writeable(tvd) && !tvd->vdev_removing && - tvd->vdev_autotrim_thread == NULL) { + tvd->vdev_autotrim_thread == NULL && + !tvd->vdev_rz_expanding) { ASSERT3P(tvd->vdev_top, ==, tvd); tvd->vdev_autotrim_thread = thread_create(NULL, 0, @@ -1477,11 +1522,9 @@ vdev_autotrim_stop_wait(vdev_t *tvd) mutex_enter(&tvd->vdev_autotrim_lock); if (tvd->vdev_autotrim_thread != NULL) { tvd->vdev_autotrim_exit_wanted = B_TRUE; - - while (tvd->vdev_autotrim_thread != NULL) { - cv_wait(&tvd->vdev_autotrim_cv, - &tvd->vdev_autotrim_lock); - } + cv_broadcast(&tvd->vdev_autotrim_kick_cv); + cv_wait(&tvd->vdev_autotrim_cv, + &tvd->vdev_autotrim_lock); ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL); tvd->vdev_autotrim_exit_wanted = B_FALSE; @@ -1489,6 +1532,24 @@ vdev_autotrim_stop_wait(vdev_t *tvd) mutex_exit(&tvd->vdev_autotrim_lock); } +void +vdev_autotrim_kick(spa_t *spa) +{ + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + vdev_t *root_vd = spa->spa_root_vdev; + vdev_t *tvd; + + for (uint64_t i = 0; i < root_vd->vdev_children; i++) { + tvd = root_vd->vdev_child[i]; + + mutex_enter(&tvd->vdev_autotrim_lock); + if (tvd->vdev_autotrim_thread != NULL) + cv_broadcast(&tvd->vdev_autotrim_kick_cv); + mutex_exit(&tvd->vdev_autotrim_lock); + } +} + /* * Wait for all of the vdev_autotrim_thread associated with the pool to * be terminated (canceled or stopped). 
@@ -1508,8 +1569,8 @@ vdev_autotrim_stop_all(spa_t *spa) void vdev_autotrim_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); if (spa->spa_autotrim) vdev_autotrim(spa); } @@ -1664,6 +1725,7 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); ta.trim_vdev = vd; ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); |