Diffstat (limited to 'sys/contrib/openzfs/module/zfs/spa.c')
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa.c  100
1 file changed, 63 insertions, 37 deletions
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
index c3800e018c73..638572996c3a 100644
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -34,6 +34,7 @@
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
* Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
+ * Copyright (c) 2024, Klara Inc.
*/
/*
@@ -170,14 +171,19 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
* that scales with the number of CPUs.
*
* The different taskq priorities are to handle the different contexts (issue
- * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
- * need to be handled with minimum delay.
+ * and interrupt) and then to reserve threads for high priority I/Os that
+ * need to be handled with minimum delay. Illumos taskq has unfair TQ_FRONT
+ * implementation, so separate high priority threads are used there.
*/
static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
+#ifdef illumos
{ ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */
+#else
+ { ZTI_SYNC, ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* WRITE */
+#endif
{ ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */
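For context, zio_taskqs[][] is indexed by I/O type and taskq kind, and each entry says how the queues for that slot are created: a fixed thread count, a count that scales with CPUs, the sync-write policy, or no taskq at all. The standalone sketch below only models that table lookup; the mode values and the resolve_threads() rule are illustrative stand-ins, not the real ZTI_* macros.

    /*
     * Standalone model of the zio_taskqs[][] lookup; the modes and the
     * resolve_threads() rule are stand-ins for the ZTI_* macros.
     */
    #include <stdio.h>

    typedef enum { MODE_NULL, MODE_FIXED, MODE_SCALE, MODE_SYNC } zti_mode_t;

    typedef struct {
        zti_mode_t mode;
        unsigned value;              /* thread count for MODE_FIXED */
    } zti_entry_t;

    enum { Q_ISSUE, Q_ISSUE_HIGH, Q_INTR, Q_INTR_HIGH, Q_COUNT };
    enum { T_READ, T_WRITE, T_COUNT };

    /* READ and WRITE rows, with the WRITE row in its non-illumos form. */
    static const zti_entry_t table[T_COUNT][Q_COUNT] = {
        [T_READ]  = { { MODE_FIXED, 8 }, { MODE_NULL, 0 },
                      { MODE_SCALE, 0 }, { MODE_NULL, 0 } },
        [T_WRITE] = { { MODE_SYNC, 0 },  { MODE_NULL, 0 },
                      { MODE_SCALE, 0 }, { MODE_NULL, 0 } },
    };

    /*
     * Toy sizing rule: fixed counts as given, "scale" and "sync" grow with
     * the CPU count, "null" means the slot gets no taskq at all.
     */
    static unsigned
    resolve_threads(const zti_entry_t *e, unsigned ncpus)
    {
        switch (e->mode) {
        case MODE_FIXED: return (e->value);
        case MODE_SCALE: return (ncpus);         /* placeholder scaling */
        case MODE_SYNC:  return (ncpus / 2 + 1); /* placeholder policy */
        default:         return (0);
        }
    }

    int
    main(void)
    {
        unsigned ncpus = 8;

        printf("WRITE/ISSUE: %u threads\n",
            resolve_threads(&table[T_WRITE][Q_ISSUE], ncpus));
        printf("WRITE/ISSUE_HIGH: %u threads\n",
            resolve_threads(&table[T_WRITE][Q_ISSUE_HIGH], ncpus));
        return (0);
    }

Under this change the non-illumos WRITE row resolves ISSUE_HIGH and INTR_HIGH to nothing, which matches the 'sync null scale null' default documented further down in the diff.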
@@ -1217,7 +1223,7 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
*
* Example (the defaults for READ and WRITE)
* zio_taskq_read='fixed,1,8 null scale null'
- * zio_taskq_write='sync fixed,1,5 scale fixed,1,5'
+ * zio_taskq_write='sync null scale null'
*
* Each sets the entire row at a time.
*
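To make "Each sets the entire row at a time" concrete: a row is four whitespace-separated entries, one per column (ISSUE, ISSUE_HIGH, INTR, INTR_HIGH). The following is a minimal, hypothetical user-space tokenizer for that shape, not the in-kernel spa_taskq_read/spa_taskq_write parameter handler.

    /*
     * Hypothetical tokenizer for a zio_taskq_write-style row such as
     * "sync null scale null" or "fixed,1,8 null scale null".  Illustrative
     * only; not the in-kernel spa_taskq_*_param code.
     */
    #include <stdio.h>
    #include <string.h>

    #define ZIO_TASKQ_TYPES 4    /* ISSUE, ISSUE_HIGH, INTR, INTR_HIGH */

    static int
    parse_taskq_row(const char *row, char cols[ZIO_TASKQ_TYPES][32])
    {
        char buf[128], *tok, *save = NULL;
        int i = 0;

        (void) strncpy(buf, row, sizeof (buf) - 1);
        buf[sizeof (buf) - 1] = '\0';

        for (tok = strtok_r(buf, " \t", &save); tok != NULL;
            tok = strtok_r(NULL, " \t", &save)) {
            if (i == ZIO_TASKQ_TYPES)
                return (-1);              /* too many columns */
            (void) snprintf(cols[i++], 32, "%s", tok);
        }
        return (i == ZIO_TASKQ_TYPES ? 0 : -1);   /* need exactly four */
    }

    int
    main(void)
    {
        char cols[ZIO_TASKQ_TYPES][32];

        if (parse_taskq_row("sync null scale null", cols) == 0)
            printf("ISSUE=%s ISSUE_HIGH=%s INTR=%s INTR_HIGH=%s\n",
                cols[0], cols[1], cols[2], cols[3]);
        return (0);
    }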
@@ -1484,9 +1490,9 @@ spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
* Note that a type may have multiple discrete taskqs to avoid lock contention
* on the taskq itself.
*/
-static taskq_t *
-spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
- zio_t *zio)
+void
+spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ task_func_t *func, zio_t *zio, boolean_t cutinline)
{
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
taskq_t *tq;
@@ -1494,37 +1500,25 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
ASSERT3P(tqs->stqs_taskq, !=, NULL);
ASSERT3U(tqs->stqs_count, !=, 0);
+ /*
+ * NB: We are assuming that the zio can only be dispatched
+ * to a single taskq at a time. It would be a grievous error
+ * to dispatch the zio to another taskq at the same time.
+ */
+ ASSERT(zio);
+ ASSERT(taskq_empty_ent(&zio->io_tqent));
+
if (tqs->stqs_count == 1) {
tq = tqs->stqs_taskq[0];
} else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
- (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) {
+ ZIO_HAS_ALLOCATOR(zio)) {
tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
} else {
tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
}
- return (tq);
-}
-
-void
-spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
- task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent,
- zio_t *zio)
-{
- taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio);
- taskq_dispatch_ent(tq, func, arg, flags, ent);
-}
-/*
- * Same as spa_taskq_dispatch_ent() but block on the task until completion.
- */
-void
-spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
- task_func_t *func, void *arg, uint_t flags)
-{
- taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL);
- taskqid_t id = taskq_dispatch(tq, func, arg, flags);
- if (id)
- taskq_wait_id(tq, id);
+ taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0,
+ &zio->io_tqent);
}
static void
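The merged spa_taskq_dispatch() keeps the old queue-selection rules: one queue means no choice, synchronous write issue with an allocator spreads work by zio->io_allocator, and every other case picks a queue pseudo-randomly from the current time. Below is a standalone model of just that selection; the struct and fields are simplified stand-ins for the real spa/zio types. In the real function the chosen taskq then receives the zio's embedded io_tqent via taskq_dispatch_ent(), with cutinline mapped to TQ_FRONT.

    /*
     * Standalone model of the queue selection in spa_taskq_dispatch();
     * types and fields are simplified stand-ins, not the real structures.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    typedef struct {
        int      is_write_issue;  /* t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE */
        int      has_allocator;   /* ZIO_HAS_ALLOCATOR(zio) */
        uint64_t io_allocator;
    } model_zio_t;

    static uint64_t
    now_ns(void)                  /* stands in for gethrtime() */
    {
        struct timespec ts;

        (void) clock_gettime(CLOCK_MONOTONIC, &ts);
        return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
    }

    /* Pick an index into stqs_taskq[], mirroring the rules in the diff. */
    static uint64_t
    select_taskq_index(const model_zio_t *zio, uint64_t stqs_count)
    {
        if (stqs_count == 1)
            return (0);
        if (zio->is_write_issue && zio->has_allocator)
            return (zio->io_allocator % stqs_count);
        return (now_ns() % stqs_count);
    }

    int
    main(void)
    {
        model_zio_t zio = { 1, 1, 5 };

        printf("selected queue: %llu of 4\n",
            (unsigned long long)select_taskq_index(&zio, 4));
        return (0);
    }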
@@ -1986,7 +1980,8 @@ spa_destroy_aux_threads(spa_t *spa)
static void
spa_unload(spa_t *spa)
{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_export_thread == curthread);
ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
spa_import_progress_remove(spa_guid(spa));
@@ -6950,7 +6945,7 @@ static int
spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
boolean_t force, boolean_t hardforce)
{
- int error;
+ int error = 0;
spa_t *spa;
hrtime_t export_start = gethrtime();
@@ -6974,8 +6969,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
spa->spa_is_exporting = B_TRUE;
/*
- * Put a hold on the pool, drop the namespace lock, stop async tasks,
- * reacquire the namespace lock, and see if we can export.
+ * Put a hold on the pool, drop the namespace lock, stop async tasks
+ * and see if we can export.
*/
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
@@ -6985,10 +6980,14 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
taskq_wait(spa->spa_zvol_taskq);
}
mutex_enter(&spa_namespace_lock);
+ spa->spa_export_thread = curthread;
spa_close(spa, FTAG);
- if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+ mutex_exit(&spa_namespace_lock);
goto export_spa;
+ }
+
/*
* The pool will be in core if it's openable, in which case we can
* modify its state. Objsets may be open only because they're dirty,
@@ -7009,6 +7008,14 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
goto fail;
}
+ mutex_exit(&spa_namespace_lock);
+ /*
+ * At this point we no longer hold the spa_namespace_lock and
+ * there were no references on the spa. Future spa_lookups will
+ * notice the spa->spa_export_thread and wait until we signal
+ * that we are finished.
+ */
+
if (spa->spa_sync_on) {
vdev_t *rvd = spa->spa_root_vdev;
/*
@@ -7020,6 +7027,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
if (!force && new_state == POOL_STATE_EXPORTED &&
spa_has_active_shared_spare(spa)) {
error = SET_ERROR(EXDEV);
+ mutex_enter(&spa_namespace_lock);
goto fail;
}
@@ -7084,6 +7092,13 @@ export_spa:
if (oldconfig && spa->spa_config)
*oldconfig = fnvlist_dup(spa->spa_config);
+ if (new_state == POOL_STATE_EXPORTED)
+ zio_handle_export_delay(spa, gethrtime() - export_start);
+
+ /*
+ * Take the namespace lock for the actual spa_t removal
+ */
+ mutex_enter(&spa_namespace_lock);
if (new_state != POOL_STATE_UNINITIALIZED) {
if (!hardforce)
spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
@@ -7095,17 +7110,25 @@ export_spa:
* we make sure to reset the exporting flag.
*/
spa->spa_is_exporting = B_FALSE;
+ spa->spa_export_thread = NULL;
}
- if (new_state == POOL_STATE_EXPORTED)
- zio_handle_export_delay(spa, gethrtime() - export_start);
-
+ /*
+ * Wake up any waiters in spa_lookup()
+ */
+ cv_broadcast(&spa_namespace_cv);
mutex_exit(&spa_namespace_lock);
return (0);
fail:
spa->spa_is_exporting = B_FALSE;
+ spa->spa_export_thread = NULL;
+
spa_async_resume(spa);
+ /*
+ * Wake up any waiters in spa_lookup()
+ */
+ cv_broadcast(&spa_namespace_cv);
mutex_exit(&spa_namespace_lock);
return (error);
}
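The two cv_broadcast(&spa_namespace_cv) calls pair with a wait on the lookup side: per the comments above, spa_lookup() is expected to notice spa->spa_export_thread and block until the exporting thread signals that it is finished. The pthread program below models that handshake in user space; the names mirror the kernel symbols, but the wait loop is an assumption about spa_lookup()'s behavior, since that function is not part of this diff.

    /*
     * User-space pthread model of the export/lookup handshake: the exporter
     * publishes itself, drops the namespace lock for the slow work, and
     * broadcasts when done; lookups wait while an export is in flight.
     * Names mirror the kernel code, but this is an illustrative model only.
     */
    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t spa_namespace_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t spa_namespace_cv = PTHREAD_COND_INITIALIZER;
    static pthread_t spa_export_thread;
    static int export_in_progress;

    static void *
    exporter(void *arg)
    {
        (void) arg;

        pthread_mutex_lock(&spa_namespace_lock);
        export_in_progress = 1;
        spa_export_thread = pthread_self();
        pthread_mutex_unlock(&spa_namespace_lock);

        sleep(1);   /* the part of the export done without the lock */

        pthread_mutex_lock(&spa_namespace_lock);
        export_in_progress = 0;
        pthread_cond_broadcast(&spa_namespace_cv);  /* wake the waiters */
        pthread_mutex_unlock(&spa_namespace_lock);
        return (NULL);
    }

    static void
    lookup(void)
    {
        pthread_mutex_lock(&spa_namespace_lock);
        while (export_in_progress &&
            !pthread_equal(spa_export_thread, pthread_self()))
            pthread_cond_wait(&spa_namespace_cv, &spa_namespace_lock);
        printf("lookup proceeds; export no longer in flight\n");
        pthread_mutex_unlock(&spa_namespace_lock);
    }

    int
    main(void)
    {
        pthread_t tid;

        pthread_create(&tid, NULL, exporter, NULL);
        usleep(100 * 1000);   /* let the export start first */
        lookup();
        pthread_join(tid, NULL);
        return (0);
    }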
@@ -10145,6 +10168,9 @@ spa_sync(spa_t *spa, uint64_t txg)
metaslab_class_evict_old(spa->spa_normal_class, txg);
metaslab_class_evict_old(spa->spa_log_class, txg);
+ /* spa_embedded_log_class has only one metaslab per vdev. */
+ metaslab_class_evict_old(spa->spa_special_class, txg);
+ metaslab_class_evict_old(spa->spa_dedup_class, txg);
spa_sync_close_syncing_log_sm(spa);