about summary refs log tree commit diff
path: root/sys/contrib/openzfs/module/zfs/spa.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/contrib/openzfs/module/zfs/spa.c')
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa.c  194
1 file changed, 182 insertions(+), 12 deletions(-)
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
index 6b52c6cb1f9e..5ecb175fbd63 100644
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -100,6 +100,7 @@
#include <sys/vmsystm.h>
#endif /* _KERNEL */
+#include "zfs_crrd.h"
#include "zfs_prop.h"
#include "zfs_comutil.h"
#include <cityhash.h>
@@ -311,6 +312,41 @@ static int zfs_livelist_condense_zthr_cancel = 0;
static int zfs_livelist_condense_new_alloc = 0;
/*
+ * Interval that controls how often the current txg is recorded in the
+ * database (in seconds).
+ * The smallest available resolution is in minutes, which means an update occurs
+ * each time we reach `spa_note_txg_time` and the txg has changed. We provide
+ * a 256-slot ring buffer for minute-level resolution. The number is limited by
+ * the size of the structure we use and the maximum amount of bytes we can write
+ * into ZAP. Setting `spa_note_txg_time` to 10 minutes results in approximately
+ * 144 records per day. Given the 256 slots, this provides roughly 1.5 days of
+ * high-resolution data.
+ *
+ * The user can decrease `spa_note_txg_time` to increase resolution within
+ * a day, at the cost of retaining fewer days of data. Alternatively, increasing
+ * the interval allows storing data over a longer period, but with lower
+ * frequency.
+ *
+ * This parameter does not affect the daily or monthly databases, as those only
+ * store one record per day and per month, respectively.
+ */
+static uint_t spa_note_txg_time = 10 * 60;
+
+/*
+ * How often to flush the txg database to disk (in seconds).
+ * We flush data every time we write to it, making it the most reliable option.
+ * Since this happens every 10 minutes, it shouldn't introduce any noticeable
+ * overhead for the system. In case of failure, we will always have an
+ * up-to-date version of the database.
+ *
+ * The user can adjust the flush interval to a lower value, but it probably
+ * doesn't make sense to flush more often than the database is updated.
+ * The user can also increase the interval if they're concerned about the
+ * performance of writing the entire database to disk.
+ */
+static uint_t spa_flush_txg_time = 10 * 60;
+
+/*
* ==========================================================================
* SPA properties routines
* ==========================================================================
@@ -417,11 +453,15 @@ spa_prop_get_config(spa_t *spa, nvlist_t *nv)
alloc += metaslab_class_get_alloc(spa_special_class(spa));
alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
+ alloc += metaslab_class_get_alloc(
+ spa_special_embedded_log_class(spa));
size = metaslab_class_get_space(mc);
size += metaslab_class_get_space(spa_special_class(spa));
size += metaslab_class_get_space(spa_dedup_class(spa));
size += metaslab_class_get_space(spa_embedded_log_class(spa));
+ size += metaslab_class_get_space(
+ spa_special_embedded_log_class(spa));
spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src);
@@ -1679,6 +1719,8 @@ spa_activate(spa_t *spa, spa_mode_t mode)
"embedded_log", msp, B_TRUE);
spa->spa_special_class = metaslab_class_create(spa, "special",
msp, B_FALSE);
+ spa->spa_special_embedded_log_class = metaslab_class_create(spa,
+ "special_embedded_log", msp, B_TRUE);
spa->spa_dedup_class = metaslab_class_create(spa, "dedup",
msp, B_FALSE);
@@ -1853,6 +1895,9 @@ spa_deactivate(spa_t *spa)
metaslab_class_destroy(spa->spa_special_class);
spa->spa_special_class = NULL;
+ metaslab_class_destroy(spa->spa_special_embedded_log_class);
+ spa->spa_special_embedded_log_class = NULL;
+
metaslab_class_destroy(spa->spa_dedup_class);
spa->spa_dedup_class = NULL;
@@ -2031,6 +2076,111 @@ spa_destroy_aux_threads(spa_t *spa)
}
}
+static void
+spa_sync_time_logger(spa_t *spa, uint64_t txg)
+{
+ uint64_t curtime;
+ dmu_tx_t *tx;
+
+ if (!spa_writeable(spa)) {
+ return;
+ }
+ curtime = gethrestime_sec();
+ if (curtime < spa->spa_last_noted_txg_time + spa_note_txg_time) {
+ return;
+ }
+
+ if (txg > spa->spa_last_noted_txg) {
+ spa->spa_last_noted_txg_time = curtime;
+ spa->spa_last_noted_txg = txg;
+
+ mutex_enter(&spa->spa_txg_log_time_lock);
+ dbrrd_add(&spa->spa_txg_log_time, curtime, txg);
+ mutex_exit(&spa->spa_txg_log_time_lock);
+ }
+
+ if (curtime < spa->spa_last_flush_txg_time + spa_flush_txg_time) {
+ return;
+ }
+ spa->spa_last_flush_txg_time = curtime;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+ &spa->spa_txg_log_time.dbr_minutes, tx));
+ VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+ &spa->spa_txg_log_time.dbr_days, tx));
+ VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+ &spa->spa_txg_log_time.dbr_months, tx));
+ dmu_tx_commit(tx);
+}
+
+static void
+spa_unload_sync_time_logger(spa_t *spa)
+{
+ uint64_t txg;
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
+
+ txg = dmu_tx_get_txg(tx);
+ spa->spa_last_noted_txg_time = 0;
+ spa->spa_last_flush_txg_time = 0;
+ spa_sync_time_logger(spa, txg);
+
+ dmu_tx_commit(tx);
+}
+
+static void
+spa_load_txg_log_time(spa_t *spa)
+{
+ int error;
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+ &spa->spa_txg_log_time.dbr_minutes);
+ if (error != 0 && error != ENOENT) {
+ spa_load_note(spa, "unable to load a txg time database with "
+ "minute resolution [error=%d]", error);
+ }
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+ &spa->spa_txg_log_time.dbr_days);
+ if (error != 0 && error != ENOENT) {
+ spa_load_note(spa, "unable to load a txg time database with "
+ "day resolution [error=%d]", error);
+ }
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+ &spa->spa_txg_log_time.dbr_months);
+ if (error != 0 && error != ENOENT) {
+ spa_load_note(spa, "unable to load a txg time database with "
+ "month resolution [error=%d]", error);
+ }
+}
+
+static boolean_t
+spa_should_sync_time_logger_on_unload(spa_t *spa)
+{
+
+ if (!spa_writeable(spa))
+ return (B_FALSE);
+
+ if (!spa->spa_sync_on)
+ return (B_FALSE);
+
+ if (spa_state(spa) != POOL_STATE_EXPORTED)
+ return (B_FALSE);
+
+ if (spa->spa_last_noted_txg == 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+
/*
* Opposite of spa_load().
*/
@@ -2052,6 +2202,9 @@ spa_unload(spa_t *spa)
* we delay the final TXGs beyond what spa_final_txg is set at.
*/
if (spa->spa_final_txg == UINT64_MAX) {
+ if (spa_should_sync_time_logger_on_unload(spa))
+ spa_unload_sync_time_logger(spa);
+
/*
* If the log space map feature is enabled and the pool is
* getting exported (but not destroyed), we want to spend some
@@ -2709,8 +2862,8 @@ spa_claim_notify(zio_t *zio)
return;
mutex_enter(&spa->spa_props_lock); /* any mutex will do */
- if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp))
- spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp);
+ if (spa->spa_claim_max_txg < BP_GET_BIRTH(zio->io_bp))
+ spa->spa_claim_max_txg = BP_GET_BIRTH(zio->io_bp);
mutex_exit(&spa->spa_props_lock);
}
@@ -3768,20 +3921,17 @@ out:
* ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool
*/
if (error == EREMOTEIO) {
- const char *hostname = "<unknown>";
- uint64_t hostid = 0;
-
if (mmp_label) {
if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
- hostname = fnvlist_lookup_string(mmp_label,
- ZPOOL_CONFIG_HOSTNAME);
+ const char *hostname = fnvlist_lookup_string(
+ mmp_label, ZPOOL_CONFIG_HOSTNAME);
fnvlist_add_string(spa->spa_load_info,
ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
}
if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
- hostid = fnvlist_lookup_uint64(mmp_label,
- ZPOOL_CONFIG_HOSTID);
+ uint64_t hostid = fnvlist_lookup_uint64(
+ mmp_label, ZPOOL_CONFIG_HOSTID);
fnvlist_add_uint64(spa->spa_load_info,
ZPOOL_CONFIG_MMP_HOSTID, hostid);
}
@@ -4711,6 +4861,9 @@ spa_ld_get_props(spa_t *spa)
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ /* Load time log */
+ spa_load_txg_log_time(spa);
+
/*
* Load the persistent error log. If we have an older pool, this will
* not be present.
@@ -5899,7 +6052,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag,
}
if (firstopen)
- zvol_create_minors_recursive(spa_name(spa));
+ zvol_create_minors(spa_name(spa));
*spapp = spa;
@@ -6877,7 +7030,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
mutex_exit(&spa_namespace_lock);
- zvol_create_minors_recursive(pool);
+ zvol_create_minors(pool);
spa_import_os(spa);
@@ -7134,6 +7287,9 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
spa_config_exit(spa, SCL_ALL, FTAG);
}
+ if (spa_should_sync_time_logger_on_unload(spa))
+ spa_unload_sync_time_logger(spa);
+
/*
* If the log space map feature is enabled and the pool is
* getting exported (but not destroyed), we want to spend some
@@ -9092,6 +9248,8 @@ spa_async_thread(void *arg)
old_space += metaslab_class_get_space(spa_dedup_class(spa));
old_space += metaslab_class_get_space(
spa_embedded_log_class(spa));
+ old_space += metaslab_class_get_space(
+ spa_special_embedded_log_class(spa));
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
@@ -9100,6 +9258,8 @@ spa_async_thread(void *arg)
new_space += metaslab_class_get_space(spa_dedup_class(spa));
new_space += metaslab_class_get_space(
spa_embedded_log_class(spa));
+ new_space += metaslab_class_get_space(
+ spa_special_embedded_log_class(spa));
mutex_exit(&spa_namespace_lock);
/*
@@ -10180,6 +10340,8 @@ spa_sync(spa_t *spa, uint64_t txg)
*/
brt_pending_apply(spa, txg);
+ spa_sync_time_logger(spa, txg);
+
/*
* Lock out configuration changes.
*/
@@ -10222,6 +10384,7 @@ spa_sync(spa_t *spa, uint64_t txg)
dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
spa->spa_sync_starttime = gethrtime();
+
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
@@ -10309,7 +10472,7 @@ spa_sync(spa_t *spa, uint64_t txg)
metaslab_class_evict_old(spa->spa_normal_class, txg);
metaslab_class_evict_old(spa->spa_log_class, txg);
- /* spa_embedded_log_class has only one metaslab per vdev. */
+ /* Embedded log classes have only one metaslab per vdev. */
metaslab_class_evict_old(spa->spa_special_class, txg);
metaslab_class_evict_old(spa->spa_dedup_class, txg);
@@ -11095,6 +11258,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
"was being condensed");
+ZFS_MODULE_PARAM(zfs_spa, spa_, note_txg_time, UINT, ZMOD_RW,
+ "How frequently TXG timestamps are stored internally (in seconds)");
+
+ZFS_MODULE_PARAM(zfs_spa, spa_, flush_txg_time, UINT, ZMOD_RW,
+ "How frequently the TXG timestamps database should be flushed "
+ "to disk (in seconds)");
+
#ifdef _KERNEL
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,