author     Matt Macy <mmacy@FreeBSD.org>  2020-09-18 22:55:05 +0000
committer  Matt Macy <mmacy@FreeBSD.org>  2020-09-18 22:55:05 +0000
commit     04bab0082226f7e72c41ad528298c43edc3fa316 (patch)
tree       67433d716d54ef1de3f4b31a01227792c48506c2 /module/zfs
parent     b0a96e5e2d3c9480ec89dd4c034c7fe4f97abfe1 (diff)
Update openzfs to 2.0.0-rc2-g4ce06f (tag: vendor/openzfs/2.0-rc2-g4ce06f)
Notes:
    svn path=/vendor-sys/openzfs/dist/; revision=365892
    svn path=/vendor-sys/openzfs/2.0-rc2-g4ce06f/; revision=365893; tag=vendor/openzfs/2.0-rc2-g4ce06f
Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/arc.c                30
-rw-r--r--  module/zfs/dbuf.c                2
-rw-r--r--  module/zfs/dmu_redact.c         33
-rw-r--r--  module/zfs/dnode.c              25
-rw-r--r--  module/zfs/dsl_scan.c            1
-rw-r--r--  module/zfs/dsl_synctask.c       16
-rw-r--r--  module/zfs/fm.c                 14
-rw-r--r--  module/zfs/metaslab.c            1
-rw-r--r--  module/zfs/mmp.c                22
-rw-r--r--  module/zfs/range_tree.c          1
-rw-r--r--  module/zfs/spa.c                42
-rw-r--r--  module/zfs/spa_config.c          8
-rw-r--r--  module/zfs/spa_history.c         5
-rw-r--r--  module/zfs/txg.c                12
-rw-r--r--  module/zfs/vdev.c              103
-rw-r--r--  module/zfs/vdev_indirect.c      16
-rw-r--r--  module/zfs/vdev_initialize.c     5
-rw-r--r--  module/zfs/vdev_label.c        114
-rw-r--r--  module/zfs/vdev_mirror.c         2
-rw-r--r--  module/zfs/vdev_raidz.c         35
-rw-r--r--  module/zfs/vdev_rebuild.c       14
-rw-r--r--  module/zfs/vdev_removal.c        9
-rw-r--r--  module/zfs/vdev_trim.c           7
-rw-r--r--  module/zfs/zfs_fm.c            361
-rw-r--r--  module/zfs/zfs_ioctl.c          23
-rw-r--r--  module/zfs/zio.c                42
-rw-r--r--  module/zfs/zthr.c               20
27 files changed, 714 insertions(+), 249 deletions(-)
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 904c325f37a1..7a499298f75c 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, Joyent, Inc.
- * Copyright (c) 2011, 2019, Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020, Delphix. All rights reserved.
* Copyright (c) 2014, Saso Kiselkov. All rights reserved.
* Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
@@ -895,6 +895,12 @@ static void l2arc_read_done(zio_t *);
static void l2arc_do_free_on_write(void);
/*
+ * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
+ * metadata and data are cached from ARC into L2ARC.
+ */
+int l2arc_mfuonly = 0;
+
+/*
* L2ARC TRIM
* l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
* the current write size (l2arc_write_max) we should TRIM if we
@@ -2188,7 +2194,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
ret = SET_ERROR(EIO);
spa_log_error(spa, zb);
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
- spa, NULL, zb, NULL, 0, 0);
+ spa, NULL, zb, NULL, 0);
}
return (ret);
@@ -5654,7 +5660,7 @@ arc_read_done(zio_t *zio)
spa_log_error(zio->io_spa, &acb->acb_zb);
(void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION,
- zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0);
+ zio->io_spa, NULL, &acb->acb_zb, zio, 0);
}
}
@@ -5931,7 +5937,7 @@ top:
spa_log_error(spa, zb);
(void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION,
- spa, NULL, zb, NULL, 0, 0);
+ spa, NULL, zb, NULL, 0);
}
}
if (rc != 0) {
@@ -8909,6 +8915,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
* Copy buffers for L2ARC writing.
*/
for (int try = 0; try < L2ARC_FEED_TYPES; try++) {
+ /*
+ * try == 1 and try == 3 are the passes that would cache MRU
+ * metadata and data, respectively; skip them when only MFU
+ * content should reach L2ARC.
+ */
+ if (l2arc_mfuonly) {
+ if (try == 1 || try == 3)
+ continue;
+ }
+
multilist_sublist_t *mls = l2arc_sublist_lock(try);
uint64_t passed_sz = 0;
@@ -9174,7 +9189,7 @@ l2arc_feed_thread(void *unused)
cookie = spl_fstrans_mark();
while (l2arc_thread_exit == 0) {
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait_sig(&l2arc_feed_thr_cv,
+ (void) cv_timedwait_idle(&l2arc_feed_thr_cv,
&l2arc_feed_thr_lock, next);
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
next = ddi_get_lbolt() + hz;
@@ -9291,8 +9306,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
ASSERT(!l2arc_vdev_present(vd));
- vdev_ashift_optimize(vd);
-
/*
* Create a new l2arc device entry.
*/
@@ -10562,6 +10575,9 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW,
"Min size in bytes to write rebuild log blocks in L2ARC");
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
+ "Cache only MFU data from ARC into L2ARC");
+
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes");
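
The l2arc_mfuonly hunks above gate two of the four L2ARC feed passes. Below is a minimal userland sketch of that skip logic; the mapping of pass indices to ARC lists (0/2 = MFU, 1/3 = MRU) is inferred from the new comment in l2arc_write_buffers(), not taken from l2arc_sublist_lock() itself.

#include <stdio.h>

/* Stand-ins for the four l2arc_sublist_lock() feed passes; the
 * even/odd MFU/MRU split is an assumption based on the comment above. */
static const char *feed_pass[] = {
	"MFU metadata", "MRU metadata", "MFU data", "MRU data"
};

static int l2arc_mfuonly = 1;	/* models the new module parameter */

int
main(void)
{
	for (int try = 0; try < 4; try++) {
		if (l2arc_mfuonly && (try == 1 || try == 3))
			continue;	/* skip the MRU passes */
		printf("feeding L2ARC from %s\n", feed_pass[try]);
	}
	return (0);
}
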
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 2de1f4e4c267..7d817320aae4 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -718,7 +718,7 @@ dbuf_evict_thread(void *unused)
while (!dbuf_evict_thread_exit) {
while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait_sig_hires(&dbuf_evict_cv,
+ (void) cv_timedwait_idle_hires(&dbuf_evict_cv,
&dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
}
diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c
index df10d8d6faae..c53fba75cc51 100644
--- a/module/zfs/dmu_redact.c
+++ b/module/zfs/dmu_redact.c
@@ -568,8 +568,7 @@ commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object,
uint64_t txg = dmu_tx_get_txg(tx);
if (!md->md_synctask_txg[txg & TXG_MASK]) {
dsl_sync_task_nowait(dmu_tx_pool(tx),
- redaction_list_update_sync, md, 5, ZFS_SPACE_CHECK_NONE,
- tx);
+ redaction_list_update_sync, md, tx);
md->md_synctask_txg[txg & TXG_MASK] = B_TRUE;
md->md_latest_synctask_txg = txg;
}
@@ -1007,10 +1006,14 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
objset_t *os;
struct redact_thread_arg *args = NULL;
redaction_list_t *new_rl = NULL;
+ char *newredactbook;
if ((err = dsl_pool_hold(snapname, FTAG, &dp)) != 0)
return (err);
+ newredactbook = kmem_zalloc(sizeof (char) * ZFS_MAX_DATASET_NAME_LEN,
+ KM_SLEEP);
+
if ((err = dsl_dataset_hold_flags(dp, snapname, DS_HOLD_FLAG_DECRYPT,
FTAG, &ds)) != 0) {
goto out;
@@ -1064,7 +1067,6 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
goto out;
boolean_t resuming = B_FALSE;
- char newredactbook[ZFS_MAX_DATASET_NAME_LEN];
zfs_bookmark_phys_t bookmark;
(void) strlcpy(newredactbook, snapname, ZFS_MAX_DATASET_NAME_LEN);
@@ -1074,6 +1076,10 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
"#%s", redactbook);
if (n >= ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook)) {
dsl_pool_rele(dp, FTAG);
+ kmem_free(newredactbook,
+ sizeof (char) * ZFS_MAX_DATASET_NAME_LEN);
+ if (args != NULL)
+ kmem_free(args, numsnaps * sizeof (*args));
return (SET_ERROR(ENAMETOOLONG));
}
err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark);
@@ -1146,16 +1152,23 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
(void) thread_create(NULL, 0, redact_traverse_thread, rta,
0, curproc, TS_RUN, minclsyspri);
}
- struct redact_merge_thread_arg rmta = { { {0} } };
- (void) bqueue_init(&rmta.q, zfs_redact_queue_ff,
+
+ struct redact_merge_thread_arg *rmta;
+ rmta = kmem_zalloc(sizeof (struct redact_merge_thread_arg), KM_SLEEP);
+
+ (void) bqueue_init(&rmta->q, zfs_redact_queue_ff,
zfs_redact_queue_length, offsetof(struct redact_record, ln));
- rmta.numsnaps = numsnaps;
- rmta.spa = os->os_spa;
- rmta.thr_args = args;
- (void) thread_create(NULL, 0, redact_merge_thread, &rmta, 0, curproc,
+ rmta->numsnaps = numsnaps;
+ rmta->spa = os->os_spa;
+ rmta->thr_args = args;
+ (void) thread_create(NULL, 0, redact_merge_thread, rmta, 0, curproc,
TS_RUN, minclsyspri);
- err = perform_redaction(os, new_rl, &rmta);
+ err = perform_redaction(os, new_rl, rmta);
+ kmem_free(rmta, sizeof (struct redact_merge_thread_arg));
+
out:
+ kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN);
+
if (new_rl != NULL) {
dsl_redaction_list_long_rele(new_rl, FTAG);
dsl_redaction_list_rele(new_rl, FTAG);
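
The dmu_redact.c changes, like the dnode.c and spa.c ones below, share a theme: sizable objects move off the kernel stack onto the heap, with the allocation released on every exit path. A hedged userland sketch of the pattern follows, with kmem_zalloc()/kmem_free() shimmed onto libc and the bookmark-name construction simplified for illustration.

#include <stdio.h>
#include <stdlib.h>

/* Userland stand-ins for the kernel allocator used in the patch. */
#define KM_SLEEP 0
#define ZFS_MAX_DATASET_NAME_LEN 256
static void *kmem_zalloc(size_t sz, int fl) { (void)fl; return calloc(1, sz); }
static void kmem_free(void *p, size_t sz) { (void)sz; free(p); }

/* The pattern from dmu_redact_snap() above: the ~256-byte name buffer
 * lives on the heap instead of the stack and is freed on every path. */
static int
build_bookmark_name(const char *snapname, const char *redactbook)
{
	int err = 0;
	char *newredactbook = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);

	if (snprintf(newredactbook, ZFS_MAX_DATASET_NAME_LEN, "%s#%s",
	    snapname, redactbook) >= ZFS_MAX_DATASET_NAME_LEN)
		err = -1;	/* stands in for SET_ERROR(ENAMETOOLONG) */
	else
		printf("bookmark: %s\n", newredactbook);

	kmem_free(newredactbook, ZFS_MAX_DATASET_NAME_LEN);
	return (err);
}

int
main(void)
{
	return (build_bookmark_name("pool/fs@snap", "rb") != 0);
}
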
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 00536f2774e7..30d20bfefa12 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1197,7 +1197,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
dnode_t *dn;
zrl_init(&dnh->dnh_zrlock);
- zrl_tryenter(&dnh->dnh_zrlock);
+ VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));
dn = dnode_create(os, dnp, NULL, object, dnh);
DNODE_VERIFY(dn);
@@ -1949,18 +1949,20 @@ static void
dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
dmu_tx_t *tx)
{
- dmu_buf_impl_t db_search;
+ dmu_buf_impl_t *db_search;
dmu_buf_impl_t *db;
avl_index_t where;
+ db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+
mutex_enter(&dn->dn_dbufs_mtx);
- db_search.db_level = 1;
- db_search.db_blkid = start_blkid + 1;
- db_search.db_state = DB_SEARCH;
+ db_search->db_level = 1;
+ db_search->db_blkid = start_blkid + 1;
+ db_search->db_state = DB_SEARCH;
for (;;) {
- db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ db = avl_find(&dn->dn_dbufs, db_search, &where);
if (db == NULL)
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
@@ -1972,7 +1974,7 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
/*
* Setup the next blkid we want to search for.
*/
- db_search.db_blkid = db->db_blkid + 1;
+ db_search->db_blkid = db->db_blkid + 1;
ASSERT3U(db->db_blkid, >=, start_blkid);
/*
@@ -1992,10 +1994,10 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
/*
* Walk all the in-core level-1 dbufs and verify they have been dirtied.
*/
- db_search.db_level = 1;
- db_search.db_blkid = start_blkid + 1;
- db_search.db_state = DB_SEARCH;
- db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ db_search->db_level = 1;
+ db_search->db_blkid = start_blkid + 1;
+ db_search->db_state = DB_SEARCH;
+ db = avl_find(&dn->dn_dbufs, db_search, &where);
if (db == NULL)
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
@@ -2005,6 +2007,7 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
ASSERT(db->db_dirtycnt > 0);
}
#endif
+ kmem_free(db_search, sizeof (dmu_buf_impl_t));
mutex_exit(&dn->dn_dbufs_mtx);
}
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 712af664e90f..0ebda2f77074 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -23,6 +23,7 @@
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2016 Gary Mills
* Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
* Copyright 2019 Joyent, Inc.
*/
diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c
index 2d6ca8549eb9..148e8fff2437 100644
--- a/module/zfs/dsl_synctask.c
+++ b/module/zfs/dsl_synctask.c
@@ -170,15 +170,13 @@ dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc,
static void
dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx,
- boolean_t early)
+ dmu_tx_t *tx, boolean_t early)
{
dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
dst->dst_pool = dp;
dst->dst_txg = dmu_tx_get_txg(tx);
- dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT;
- dst->dst_space_check = space_check;
+ dst->dst_space_check = ZFS_SPACE_CHECK_NONE;
dst->dst_checkfunc = dsl_null_checkfunc;
dst->dst_syncfunc = syncfunc;
dst->dst_arg = arg;
@@ -192,18 +190,16 @@ dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
void
dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+ dmu_tx_t *tx)
{
- dsl_sync_task_nowait_common(dp, syncfunc, arg,
- blocks_modified, space_check, tx, B_FALSE);
+ dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_FALSE);
}
void
dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+ dmu_tx_t *tx)
{
- dsl_sync_task_nowait_common(dp, syncfunc, arg,
- blocks_modified, space_check, tx, B_TRUE);
+ dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_TRUE);
}
/*
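
dsl_sync_task_nowait() and dsl_early_sync_task_nowait() lose their blocks_modified and space_check parameters; the nowait path now hard-codes ZFS_SPACE_CHECK_NONE internally. A hedged sketch of the new calling convention, with the DSL types shimmed so the example stands alone:

#include <stdio.h>

/* Userland shims; these are stand-ins, not the real DSL types. */
typedef struct dsl_pool dsl_pool_t;
typedef struct dmu_tx dmu_tx_t;
typedef void dsl_syncfunc_t(void *arg, dmu_tx_t *tx);

/* New, reduced signature: the block estimate and space check are gone;
 * a real pool would queue the task to run in syncing context. */
static void
dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
    dmu_tx_t *tx)
{
	(void)dp;
	syncfunc(arg, tx);
}

static void
my_update_sync(void *arg, dmu_tx_t *tx)
{
	(void)tx;
	printf("sync task ran with arg %s\n", (const char *)arg);
}

int
main(void)
{
	dsl_sync_task_nowait(NULL, my_update_sync, "hello", NULL);
	return (0);
}
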
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index c00e08b8d02a..a5003f85d621 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -104,13 +104,15 @@ struct erpt_kstat {
kstat_named_t erpt_set_failed; /* num erpt set failures */
kstat_named_t fmri_set_failed; /* num fmri set failures */
kstat_named_t payload_set_failed; /* num payload set failures */
+ kstat_named_t erpt_duplicates; /* num duplicate erpts */
};
static struct erpt_kstat erpt_kstat_data = {
{ "erpt-dropped", KSTAT_DATA_UINT64 },
{ "erpt-set-failed", KSTAT_DATA_UINT64 },
{ "fmri-set-failed", KSTAT_DATA_UINT64 },
- { "payload-set-failed", KSTAT_DATA_UINT64 }
+ { "payload-set-failed", KSTAT_DATA_UINT64 },
+ { "erpt-duplicates", KSTAT_DATA_UINT64 }
};
kstat_t *fm_ksp;
@@ -568,6 +570,12 @@ out:
return (error);
}
+void
+zfs_zevent_track_duplicate(void)
+{
+ atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64);
+}
+
static int
zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze)
{
@@ -1633,6 +1641,8 @@ fm_init(void)
list_create(&zevent_list, sizeof (zevent_t),
offsetof(zevent_t, ev_node));
cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL);
+
+ zfs_ereport_init();
}
void
@@ -1640,6 +1650,8 @@ fm_fini(void)
{
int count;
+ zfs_ereport_fini();
+
zfs_zevent_drain_all(&count);
mutex_enter(&zevent_lock);
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index ccc247d1557a..133005b227e5 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
*/
diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c
index 4170d7e03ebd..99852521b6d1 100644
--- a/module/zfs/mmp.c
+++ b/module/zfs/mmp.c
@@ -198,14 +198,6 @@ mmp_init(spa_t *spa)
cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
mmp->mmp_kstat_id = 1;
-
- /*
- * mmp_write_done() calculates mmp_delay based on prior mmp_delay and
- * the elapsed time since the last write. For the first mmp write,
- * there is no "last write", so we start with fake non-zero values.
- */
- mmp->mmp_last_write = gethrtime();
- mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval));
}
void
@@ -557,6 +549,18 @@ mmp_thread(void *arg)
mmp_thread_enter(mmp, &cpr);
+ /*
+ * There have been no MMP writes yet. Setting mmp_last_write here gives
+ * us one mmp_fail_ns period, which is consistent with the activity
+ * check duration, to try to land an MMP write before MMP suspends the
+ * pool (if so configured).
+ */
+
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_last_write = gethrtime();
+ mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval));
+ mutex_exit(&mmp->mmp_io_lock);
+
while (!mmp->mmp_thread_exiting) {
hrtime_t next_time = gethrtime() +
MSEC2NSEC(MMP_DEFAULT_INTERVAL);
@@ -671,7 +675,7 @@ mmp_thread(void *arg)
}
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv,
+ (void) cv_timedwait_idle_hires(&mmp->mmp_thread_cv,
&mmp->mmp_thread_lock, next_time, USEC2NSEC(100),
CALLOUT_FLAG_ABSOLUTE);
CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c
index 2c0e4b860a04..2ce0139c9137 100644
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@@ -24,6 +24,7 @@
*/
/*
* Copyright (c) 2013, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index aac469f44b59..532f04b91ca1 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -1000,13 +1000,25 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
/*
* The write issue taskq can be extremely CPU
* intensive. Run it at slightly less important
- * priority than the other taskqs. Under Linux this
- * means incrementing the priority value on platforms
- * like illumos it should be decremented.
+ * priority than the other taskqs.
+ *
+ * Under Linux and FreeBSD this means incrementing
+ * the priority value as opposed to platforms like
+ * illumos where it should be decremented.
+ *
+ * On FreeBSD, if priorities divided by four (RQ_PPQ)
+ * are equal then a difference between them is
+ * insignificant.
*/
- if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
+ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
+#if defined(__linux__)
pri++;
-
+#elif defined(__FreeBSD__)
+ pri += 4;
+#else
+#error "unknown OS"
+#endif
+ }
tq = taskq_create_proc(name, value, pri, 50,
INT_MAX, spa->spa_proc, flags);
}
@@ -2485,11 +2497,12 @@ spa_livelist_delete_cb(void *arg, zthr_t *z)
VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
VERIFY0(zap_count(mos, ll_obj, &count));
if (count > 0) {
- dsl_deadlist_t ll = { 0 };
+ dsl_deadlist_t *ll;
dsl_deadlist_entry_t *dle;
bplist_t to_free;
- dsl_deadlist_open(&ll, mos, ll_obj);
- dle = dsl_deadlist_first(&ll);
+ ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
+ dsl_deadlist_open(ll, mos, ll_obj);
+ dle = dsl_deadlist_first(ll);
ASSERT3P(dle, !=, NULL);
bplist_create(&to_free);
int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
@@ -2497,7 +2510,7 @@ spa_livelist_delete_cb(void *arg, zthr_t *z)
if (err == 0) {
sublist_delete_arg_t sync_arg = {
.spa = spa,
- .ll = &ll,
+ .ll = ll,
.key = dle->dle_mintxg,
.to_free = &to_free
};
@@ -2512,7 +2525,8 @@ spa_livelist_delete_cb(void *arg, zthr_t *z)
}
bplist_clear(&to_free);
bplist_destroy(&to_free);
- dsl_deadlist_close(&ll);
+ dsl_deadlist_close(ll);
+ kmem_free(ll, sizeof (dsl_deadlist_t));
} else {
livelist_delete_arg_t sync_arg = {
.spa = spa,
@@ -2688,8 +2702,7 @@ spa_livelist_condense_cb(void *arg, zthr_t *t)
lca->first_size = first_size;
lca->next_size = next_size;
dsl_sync_task_nowait(spa_get_dsl(spa),
- spa_livelist_condense_sync, lca, 0,
- ZFS_SPACE_CHECK_NONE, tx);
+ spa_livelist_condense_sync, lca, tx);
dmu_tx_commit(tx);
return;
}
@@ -2869,7 +2882,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
}
if (error != EBADF) {
(void) zfs_ereport_post(ereport, spa,
- NULL, NULL, NULL, 0, 0);
+ NULL, NULL, NULL, 0);
}
}
spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
@@ -5749,7 +5762,6 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
- vdev_ashift_optimize(vd);
vdev_metaslab_set_size(vd);
vdev_expand(vd, txg);
}
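
The taskq-priority hunk above notes that on FreeBSD a difference smaller than RQ_PPQ is insignificant. A small sketch of why: FreeBSD groups priorities into run queues of four, so only a bump of RQ_PPQ moves a thread into a different queue. The value 4 mirrors the patch; the base priority below is hypothetical.

#include <stdio.h>

#define RQ_PPQ 4	/* FreeBSD: priorities per run queue */

int
main(void)
{
	int pri = 80;	/* hypothetical base taskq priority */

	/* pri + 1 maps to the same run queue, so the write-issue taskq
	 * bumps by RQ_PPQ on FreeBSD and by 1 on Linux. */
	printf("base queue %d, +1 queue %d, +4 queue %d\n",
	    pri / RQ_PPQ, (pri + 1) / RQ_PPQ, (pri + RQ_PPQ) / RQ_PPQ);
	return (0);
}
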
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index b98b7badbae1..dacba127dcfa 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc.
*/
@@ -316,7 +316,7 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
if (target->spa_ccw_fail_time == 0) {
(void) zfs_ereport_post(
FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
- target, NULL, NULL, NULL, 0, 0);
+ target, NULL, NULL, NULL, 0);
}
target->spa_ccw_fail_time = gethrtime();
spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
@@ -577,10 +577,8 @@ spa_config_update(spa_t *spa, int what)
(tvd->vdev_islog && tvd->vdev_removing))
continue;
- if (tvd->vdev_ms_array == 0) {
- vdev_ashift_optimize(tvd);
+ if (tvd->vdev_ms_array == 0)
vdev_metaslab_set_size(tvd);
- }
vdev_expand(tvd, txg);
}
}
diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c
index f47adb94d55b..2ab58815400a 100644
--- a/module/zfs/spa_history.c
+++ b/module/zfs/spa_history.c
@@ -397,8 +397,7 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED()));
/* Kick this off asynchronously; errors are ignored. */
- dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
- nvarg, 0, ZFS_SPACE_CHECK_NONE, tx);
+ dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvarg, tx);
dmu_tx_commit(tx);
/* spa_history_log_sync will free nvl */
@@ -532,7 +531,7 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
spa_history_log_sync(nvl, tx);
} else {
dsl_sync_task_nowait(spa_get_dsl(spa),
- spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx);
+ spa_history_log_sync, nvl, tx);
}
/* spa_history_log_sync() will free nvl */
}
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index a5f2b041737b..65375b579da6 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -242,16 +242,11 @@ txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
{
CALLB_CPR_SAFE_BEGIN(cpr);
- /*
- * cv_wait_sig() is used instead of cv_wait() in order to prevent
- * this process from incorrectly contributing to the system load
- * average when idle.
- */
if (time) {
- (void) cv_timedwait_sig(cv, &tx->tx_sync_lock,
+ (void) cv_timedwait_idle(cv, &tx->tx_sync_lock,
ddi_get_lbolt() + time);
} else {
- cv_wait_sig(cv, &tx->tx_sync_lock);
+ cv_wait_idle(cv, &tx->tx_sync_lock);
}
CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
@@ -760,7 +755,8 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
if (should_quiesce == B_TRUE) {
cv_wait_io(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
} else {
- cv_wait_sig(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+ cv_wait_idle(&tx->tx_quiesce_done_cv,
+ &tx->tx_sync_lock);
}
}
mutex_exit(&tx->tx_sync_lock);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 95a2f5947db1..a94101485c94 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1481,7 +1481,7 @@ vdev_probe_done(zio_t *zio)
ASSERT(zio->io_error != 0);
vdev_dbgmsg(vd, "failed probe");
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
- spa, vd, NULL, NULL, 0, 0);
+ spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO);
}
@@ -1673,6 +1673,38 @@ vdev_set_deflate_ratio(vdev_t *vd)
}
/*
+ * Maximize performance by inflating the configured ashift for top level
+ * vdevs to be as close to the physical ashift as possible while maintaining
+ * administrator defined limits and ensuring it doesn't go below the
+ * logical ashift.
+ */
+static void
+vdev_ashift_optimize(vdev_t *vd)
+{
+ ASSERT(vd == vd->vdev_top);
+
+ if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+ vd->vdev_ashift = MIN(
+ MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
+ MAX(zfs_vdev_min_auto_ashift,
+ vd->vdev_physical_ashift));
+ } else {
+ /*
+ * If the logical and physical ashifts are the same, then
+ * we ensure that the top-level vdev's ashift is not smaller
+ * than our minimum ashift value. For the unusual case
+ * where logical ashift > physical ashift, we can't cap
+ * the calculated ashift based on max ashift as that
+ * would cause failures.
+ * We still check if we need to increase it to match
+ * the min ashift.
+ */
+ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
+ vd->vdev_ashift);
+ }
+}
+
+/*
* Prepare a virtual device for access.
*/
int
@@ -1830,16 +1862,17 @@ vdev_open(vdev_t *vd)
return (SET_ERROR(EINVAL));
}
+ /*
+ * We can always set the logical/physical ashift members since
+ * their values are only used to calculate the vdev_ashift when
+ * the device is first added to the config. These values should
+ * not be used for anything else since they may change whenever
+ * the device is reopened and we don't store them in the label.
+ */
vd->vdev_physical_ashift =
MAX(physical_ashift, vd->vdev_physical_ashift);
- vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
- vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
-
- if (vd->vdev_logical_ashift > ASHIFT_MAX) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_ASHIFT_TOO_BIG);
- return (SET_ERROR(EDOM));
- }
+ vd->vdev_logical_ashift = MAX(logical_ashift,
+ vd->vdev_logical_ashift);
if (vd->vdev_asize == 0) {
/*
@@ -1848,6 +1881,24 @@ vdev_open(vdev_t *vd)
*/
vd->vdev_asize = asize;
vd->vdev_max_asize = max_asize;
+
+ /*
+ * If the vdev_ashift was not overridden at creation time,
+ * then set it to the logical ashift and optimize the ashift.
+ */
+ if (vd->vdev_ashift == 0) {
+ vd->vdev_ashift = vd->vdev_logical_ashift;
+
+ if (vd->vdev_logical_ashift > ASHIFT_MAX) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_ASHIFT_TOO_BIG);
+ return (SET_ERROR(EDOM));
+ }
+
+ if (vd->vdev_top == vd) {
+ vdev_ashift_optimize(vd);
+ }
+ }
if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
vd->vdev_ashift > ASHIFT_MAX)) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -1862,11 +1913,10 @@ vdev_open(vdev_t *vd)
vd->vdev_ops->vdev_op_leaf) {
(void) zfs_ereport_post(
FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
- spa, vd, NULL, NULL, 0, 0);
+ spa, vd, NULL, NULL, 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (SET_ERROR(EDOM));
-
}
vd->vdev_max_asize = max_asize;
}
@@ -2445,35 +2495,6 @@ vdev_metaslab_set_size(vdev_t *vd)
ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
}
-/*
- * Maximize performance by inflating the configured ashift for top level
- * vdevs to be as close to the physical ashift as possible while maintaining
- * administrator defined limits and ensuring it doesn't go below the
- * logical ashift.
- */
-void
-vdev_ashift_optimize(vdev_t *vd)
-{
- if (vd == vd->vdev_top) {
- if (vd->vdev_ashift < vd->vdev_physical_ashift) {
- vd->vdev_ashift = MIN(
- MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
- MAX(zfs_vdev_min_auto_ashift,
- vd->vdev_physical_ashift));
- } else {
- /*
- * Unusual case where logical ashift > physical ashift
- * so we can't cap the calculated ashift based on max
- * ashift as that would cause failures.
- * We still check if we need to increase it to match
- * the min ashift.
- */
- vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
- vd->vdev_ashift);
- }
- }
-}
-
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
@@ -4759,7 +4780,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
}
(void) zfs_ereport_post(class, spa, vd, NULL, NULL,
- save_state, 0);
+ save_state);
}
/* Erase any notion of persistent removed state */
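
vdev_ashift_optimize() is now static, asserted to run only on top-level vdevs, and called once from vdev_open() when no ashift was supplied at creation. A standalone restatement of its clamp math follows; the tunable defaults here (9 and 16) are this sketch's assumptions, not values from the diff.

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Assumed defaults for the auto-ashift tunables. */
static unsigned long zfs_vdev_min_auto_ashift = 9;	/* 512 B */
static unsigned long zfs_vdev_max_auto_ashift = 16;	/* 64 KiB */

/* Restates vdev_ashift_optimize() from the hunk above. */
static unsigned long
ashift_optimize(unsigned long ashift, unsigned long physical_ashift)
{
	if (ashift < physical_ashift) {
		return (MIN(MAX(zfs_vdev_max_auto_ashift, ashift),
		    MAX(zfs_vdev_min_auto_ashift, physical_ashift)));
	}
	/* logical >= physical: only enforce the minimum. */
	return (MAX(zfs_vdev_min_auto_ashift, ashift));
}

int
main(void)
{
	/* 512e disk: logical 512 (ashift 9), physical 4K (ashift 12). */
	printf("512e disk -> ashift %lu\n", ashift_optimize(9, 12));
	/* Native 4Kn disk: logical == physical == ashift 12. */
	printf("4Kn disk  -> ashift %lu\n", ashift_optimize(12, 12));
	return (0);
}
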
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 6bc2d917d59c..12ee393bd5db 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -16,7 +16,7 @@
/*
* Copyright (c) 2014, 2017 by Delphix. All rights reserved.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
- * Copyright (c) 2014, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2020 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -576,8 +576,7 @@ spa_condense_indirect_commit_entry(spa_t *spa,
*/
if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
dsl_sync_task_nowait(dmu_tx_pool(tx),
- spa_condense_indirect_commit_sync, sci,
- 0, ZFS_SPACE_CHECK_NONE, tx);
+ spa_condense_indirect_commit_sync, sci, tx);
}
vdev_indirect_mapping_entry_t *vime =
@@ -1474,13 +1473,14 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
vdev_t *vd = ic->ic_vdev;
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
- (void) zfs_ereport_post_checksum(zio->io_spa, vd,
+ int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
NULL, zio, is->is_target_offset, is->is_size,
NULL, NULL, NULL);
+ if (ret != EALREADY) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
}
}
}
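
This hunk, and the matching ones in vdev_raidz.c, zio.c, and zfs_fm.c, all adopt one idiom: post the ereport first and bump the per-vdev error counters only when the post did not return EALREADY, so duplicate-suppressed reports no longer inflate the stats. A toy userland model of the caller side:

#include <errno.h>
#include <stdio.h>

/* Stand-in: returns EALREADY for a repeated (vdev guid, offset) pair,
 * imitating the duplicate detection added in zfs_fm.c below. */
static int
post_checksum_ereport(unsigned long long guid, unsigned long long off)
{
	static unsigned long long last_guid, last_off;
	int dup = (guid == last_guid && off == last_off);

	last_guid = guid;
	last_off = off;
	return (dup ? EALREADY : 0);
}

int
main(void)
{
	unsigned long long errors = 0;

	for (int i = 0; i < 3; i++) {
		/* The same error reported three times bumps the
		 * counter only once. */
		if (post_checksum_ereport(0xabcdULL, 4096) != EALREADY)
			errors++;
	}
	printf("vs_checksum_errors = %llu\n", errors);	/* prints 1 */
	return (0);
}
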
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index ab711441d9ca..7ff7fffcc80e 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -126,7 +126,7 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
- guid, 2, ZFS_SPACE_CHECK_NONE, tx);
+ guid, tx);
switch (new_state) {
case VDEV_INITIALIZE_ACTIVE:
@@ -216,8 +216,7 @@ vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
/* This is the first write of this txg. */
dsl_sync_task_nowait(spa_get_dsl(spa),
- vdev_initialize_zap_update_sync, guid, 2,
- ZFS_SPACE_CHECK_RESERVED, tx);
+ vdev_initialize_zap_update_sync, guid, tx);
}
/*
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 8c7468255565..7fab7d0d7950 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -149,6 +149,8 @@
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
+#include <sys/byteorder.h>
+#include <sys/zfs_bootenv.h>
/*
* Basic routines to read and write from a vdev label.
@@ -1233,13 +1235,9 @@ vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags)
* bootloader should have rewritten them all to be the same on boot,
* and any changes we made since boot have been the same across all
* labels.
- *
- * While grub supports writing to all four labels, other bootloaders
- * don't, so we only use the first two labels to store boot
- * information.
*/
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
- for (int l = 0; l < VDEV_LABELS / 2; l++) {
+ for (int l = 0; l < VDEV_LABELS; l++) {
vdev_label_read(zio, vd, l,
abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE),
offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE,
@@ -1249,14 +1247,15 @@ vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags)
}
int
-vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *command)
+vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv)
{
+ nvlist_t *config;
spa_t *spa = rvd->vdev_spa;
abd_t *abd = NULL;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
- ASSERT(command);
+ ASSERT(bootenv);
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
zio_t *zio = zio_root(spa, NULL, &abd, flags);
@@ -1264,39 +1263,81 @@ vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *command)
int err = zio_wait(zio);
if (abd != NULL) {
+ char *buf;
vdev_boot_envblock_t *vbe = abd_to_buf(abd);
- if (vbe->vbe_version != VB_RAW) {
- abd_free(abd);
- return (SET_ERROR(ENOTSUP));
+
+ vbe->vbe_version = ntohll(vbe->vbe_version);
+ switch (vbe->vbe_version) {
+ case VB_RAW:
+ /*
+ * If we have textual data in vbe_bootenv, create an nvlist
+ * with key "envmap".
+ */
+ fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW);
+ vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
+ fnvlist_add_string(bootenv, GRUB_ENVMAP,
+ vbe->vbe_bootenv);
+ break;
+
+ case VB_NVLIST:
+ err = nvlist_unpack(vbe->vbe_bootenv,
+ sizeof (vbe->vbe_bootenv), &config, 0);
+ if (err == 0) {
+ fnvlist_merge(bootenv, config);
+ nvlist_free(config);
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ /* Check for FreeBSD zfs bootonce command string */
+ buf = abd_to_buf(abd);
+ if (*buf == '\0') {
+ fnvlist_add_uint64(bootenv, BOOTENV_VERSION,
+ VB_NVLIST);
+ break;
+ }
+ fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf);
}
- vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
- fnvlist_add_string(command, "envmap", vbe->vbe_bootenv);
- /* abd was allocated in vdev_label_read_bootenv_impl() */
+
+ /*
+ * abd was allocated in vdev_label_read_bootenv_impl()
+ */
abd_free(abd);
- /* If we managed to read any successfully, return success. */
+ /*
+ * If we managed to read any successfully,
+ * return success.
+ */
return (0);
}
return (err);
}
int
-vdev_label_write_bootenv(vdev_t *vd, char *envmap)
+vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env)
{
zio_t *zio;
spa_t *spa = vd->vdev_spa;
vdev_boot_envblock_t *bootenv;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
- int error = ENXIO;
+ int error;
+ size_t nvsize;
+ char *nvbuf;
+
+ error = nvlist_size(env, &nvsize, NV_ENCODE_XDR);
+ if (error != 0)
+ return (SET_ERROR(error));
- if (strlen(envmap) >= sizeof (bootenv->vbe_bootenv)) {
+ if (nvsize >= sizeof (bootenv->vbe_bootenv)) {
return (SET_ERROR(E2BIG));
}
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ error = ENXIO;
for (int c = 0; c < vd->vdev_children; c++) {
- int child_err = vdev_label_write_bootenv(vd->vdev_child[c],
- envmap);
+ int child_err;
+
+ child_err = vdev_label_write_bootenv(vd->vdev_child[c], env);
/*
* As long as any of the disks managed to write all of their
* labels successfully, return success.
@@ -1312,16 +1353,41 @@ vdev_label_write_bootenv(vdev_t *vd, char *envmap)
ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE);
abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
abd_zero(abd, VDEV_PAD_SIZE);
+
bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE);
+ nvbuf = bootenv->vbe_bootenv;
+ nvsize = sizeof (bootenv->vbe_bootenv);
+
+ bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION);
+ switch (bootenv->vbe_version) {
+ case VB_RAW:
+ if (nvlist_lookup_string(env, GRUB_ENVMAP, &nvbuf) == 0) {
+ (void) strlcpy(bootenv->vbe_bootenv, nvbuf, nvsize);
+ }
+ error = 0;
+ break;
- char *buf = bootenv->vbe_bootenv;
- (void) strlcpy(buf, envmap, sizeof (bootenv->vbe_bootenv));
- bootenv->vbe_version = VB_RAW;
- abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE);
+ case VB_NVLIST:
+ error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ if (error == 0) {
+ bootenv->vbe_version = htonll(bootenv->vbe_version);
+ abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE);
+ } else {
+ abd_free(abd);
+ return (SET_ERROR(error));
+ }
retry:
zio = zio_root(spa, NULL, NULL, flags);
- for (int l = 0; l < VDEV_LABELS / 2; l++) {
+ for (int l = 0; l < VDEV_LABELS; l++) {
vdev_label_write(zio, vd, l, abd,
offsetof(vdev_label_t, vl_be),
VDEV_PAD_SIZE, NULL, NULL, flags);
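
vdev_label_write_bootenv() now takes a packed nvlist instead of a raw GRUB envmap string, and both the read and write paths touch all four labels instead of two. Below is a hedged userland sketch of building the VB_RAW flavor of that nvlist; the macros VB_RAW, BOOTENV_VERSION, and GRUB_ENVMAP are assumed to come from the sys/zfs_bootenv.h header included above.

#include <libnvpair.h>		/* userland nvlist API, as used by libzfs */
#include <sys/zfs_bootenv.h>	/* assumed home of VB_RAW and the key names */

int
make_raw_bootenv(nvlist_t **envp, const char *grubenv_text)
{
	nvlist_t *env;
	int err;

	if ((err = nvlist_alloc(&env, NV_UNIQUE_NAME, 0)) != 0)
		return (err);
	if ((err = nvlist_add_uint64(env, BOOTENV_VERSION, VB_RAW)) != 0 ||
	    (err = nvlist_add_string(env, GRUB_ENVMAP, grubenv_text)) != 0) {
		nvlist_free(env);
		return (err);
	}
	*envp = env;	/* caller passes this to the set_bootenv ioctl */
	return (0);
}
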
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 5e1060f127c9..71b5adbbd06a 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -391,7 +391,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
*physical_ashift = MAX(*physical_ashift,
- vd->vdev_physical_ashift);
+ cvd->vdev_physical_ashift);
}
if (numerrors == vd->vdev_children) {
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 4320078b6f7c..47312e02f70a 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2016 Gvozden Nešković. All rights reserved.
*/
@@ -1790,16 +1790,17 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
zio_bad_cksum_t zbc;
raidz_map_t *rm = zio->io_vsd;
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;
- (void) zfs_ereport_post_checksum(zio->io_spa, vd,
+ int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
&zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
rc->rc_abd, bad_data, &zbc);
+ if (ret != EALREADY) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
}
}
@@ -2279,21 +2280,21 @@ vdev_raidz_io_done(zio_t *zio)
vdev_t *cvd;
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
- if (rc->rc_error == 0) {
- zio_bad_cksum_t zbc;
- zbc.zbc_has_cksum = 0;
- zbc.zbc_injected =
- rm->rm_ecksuminjected;
+ if (rc->rc_error != 0)
+ continue;
+ zio_bad_cksum_t zbc;
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected = rm->rm_ecksuminjected;
+
+ int ret = zfs_ereport_start_checksum(
+ zio->io_spa, cvd, &zio->io_bookmark, zio,
+ rc->rc_offset, rc->rc_size,
+ (void *)(uintptr_t)c, &zbc);
+ if (ret != EALREADY) {
mutex_enter(&cvd->vdev_stat_lock);
cvd->vdev_stat.vs_checksum_errors++;
mutex_exit(&cvd->vdev_stat_lock);
-
- zfs_ereport_start_checksum(
- zio->io_spa, cvd,
- &zio->io_bookmark, zio,
- rc->rc_offset, rc->rc_size,
- (void *)(uintptr_t)c, &zbc);
}
}
}
diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c
index 85ed8afe1cf4..3362d608c037 100644
--- a/module/zfs/vdev_rebuild.c
+++ b/module/zfs/vdev_rebuild.c
@@ -267,7 +267,7 @@ vdev_rebuild_initiate(vdev_t *vd)
vd->vdev_rebuilding = B_TRUE;
dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync,
- (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, tx);
dmu_tx_commit(tx);
vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START);
@@ -553,8 +553,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
vr->vr_scan_offset[txg & TXG_MASK] = start;
dsl_sync_task_nowait(spa_get_dsl(spa),
vdev_rebuild_update_sync,
- (void *)(uintptr_t)vd->vdev_id, 2,
- ZFS_SPACE_CHECK_RESERVED, tx);
+ (void *)(uintptr_t)vd->vdev_id, tx);
}
/* When exiting write out our progress. */
@@ -875,16 +874,14 @@ vdev_rebuild_thread(void *arg)
* by a pool checkpoint. See the dsl_scan_done() comments.
*/
dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync,
- (void *)(uintptr_t)vd->vdev_id, 0,
- ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, tx);
} else if (vd->vdev_rebuild_cancel_wanted) {
/*
* The rebuild operation was canceled. This will occur when
* a device participating in the rebuild is detached.
*/
dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync,
- (void *)(uintptr_t)vd->vdev_id, 0,
- ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, tx);
} else if (vd->vdev_rebuild_reset_wanted) {
/*
* Reset the running rebuild without canceling and restarting
@@ -892,8 +889,7 @@ vdev_rebuild_thread(void *arg)
* participate in the rebuild.
*/
dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync,
- (void *)(uintptr_t)vd->vdev_id, 0,
- ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, tx);
} else {
/*
* The rebuild operation should be suspended. This may occur
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 56e420871f61..fdeca7ab3418 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -1167,8 +1167,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
/* After this, we can not use svr. */
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
- 0, ZFS_SPACE_CHECK_NONE, tx);
+ dsl_sync_task_nowait(spa->spa_dsl_pool,
+ vdev_remove_complete_sync, svr, tx);
dmu_tx_commit(tx);
}
@@ -1317,7 +1317,7 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
- svr, 0, ZFS_SPACE_CHECK_NONE, tx);
+ svr, tx);
}
svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
@@ -2143,8 +2143,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
vdev_config_dirty(vd);
dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
dsl_sync_task_nowait(spa->spa_dsl_pool,
- vdev_remove_initiate_sync,
- (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
+ vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx);
dmu_tx_commit(tx);
return (0);
diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c
index 3f8c34806020..02b42ddd5a6c 100644
--- a/module/zfs/vdev_trim.c
+++ b/module/zfs/vdev_trim.c
@@ -317,7 +317,7 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync,
- guid, 2, ZFS_SPACE_CHECK_NONE, tx);
+ guid, tx);
switch (new_state) {
case VDEV_TRIM_ACTIVE:
@@ -481,7 +481,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
if (ta->trim_type == TRIM_TYPE_MANUAL) {
while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) &&
vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) {
- cv_timedwait_sig(&vd->vdev_trim_io_cv,
+ cv_timedwait_idle(&vd->vdev_trim_io_cv,
&vd->vdev_trim_io_lock, ddi_get_lbolt() +
MSEC_TO_TICK(10));
}
@@ -510,8 +510,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
/* This is the first write of this txg. */
dsl_sync_task_nowait(spa_get_dsl(spa),
- vdev_trim_zap_update_sync, guid, 2,
- ZFS_SPACE_CHECK_RESERVED, tx);
+ vdev_trim_zap_update_sync, guid, tx);
}
/*
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index ad13ccedfc06..a8341f50ba09 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012,2020 by Delphix. All rights reserved.
*/
#include <sys/spa.h>
@@ -101,7 +101,251 @@
* good and bad versions of the buffer (if available), and we annotate the
* ereport with information about the differences.
*/
+
#ifdef _KERNEL
+/*
+ * Duplicate ereport Detection
+ *
+ * Some ereports are retained momentarily for detecting duplicates. These
+ * are kept in a recent_events_node_t in both a time-ordered list and an AVL
+ * tree of recent unique ereports.
+ *
+ * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
+ * task is used to purge stale entries.
+ */
+static list_t recent_events_list;
+static avl_tree_t recent_events_tree;
+static kmutex_t recent_events_lock;
+static taskqid_t recent_events_cleaner_tqid;
+
+/*
+ * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
+ *
+ * This setting can be changed dynamically and setting it to zero
+ * disables duplicate detection.
+ */
+unsigned int zfs_zevent_retain_max = 2000;
+
+/*
+ * The lifespan for a recent ereport entry. The default of 15 minutes is
+ * intended to outlive the zfs diagnosis engine's threshold of 10 errors
+ * over a period of 10 minutes.
+ */
+unsigned int zfs_zevent_retain_expire_secs = 900;
+
+typedef enum zfs_subclass {
+ ZSC_IO,
+ ZSC_DATA,
+ ZSC_CHECKSUM
+} zfs_subclass_t;
+
+typedef struct {
+ /* common criteria */
+ uint64_t re_pool_guid;
+ uint64_t re_vdev_guid;
+ int re_io_error;
+ uint64_t re_io_size;
+ uint64_t re_io_offset;
+ zfs_subclass_t re_subclass;
+ zio_priority_t re_io_priority;
+
+ /* logical zio criteria (optional) */
+ zbookmark_phys_t re_io_bookmark;
+
+ /* internal state */
+ avl_node_t re_tree_link;
+ list_node_t re_list_link;
+ uint64_t re_timestamp;
+} recent_events_node_t;
+
+static int
+recent_events_compare(const void *a, const void *b)
+{
+ const recent_events_node_t *node1 = a;
+ const recent_events_node_t *node2 = b;
+ int cmp;
+
+ /*
+ * The comparison order here is somewhat arbitrary.
+ * What's important is that if every criterion matches, then it
+ * is a duplicate (i.e. compare returns 0)
+ */
+ if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0)
+ return (cmp);
+
+ const zbookmark_phys_t *zb1 = &node1->re_io_bookmark;
+ const zbookmark_phys_t *zb2 = &node2->re_io_bookmark;
+
+ if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0)
+ return (cmp);
+
+ return (0);
+}
+
+static void zfs_ereport_schedule_cleaner(void);
+
+/*
+ * background task to clean stale recent event nodes.
+ */
+/*ARGSUSED*/
+static void
+zfs_ereport_cleaner(void *arg)
+{
+ recent_events_node_t *entry;
+ uint64_t now = gethrtime();
+
+ /*
+ * purge expired entries
+ */
+ mutex_enter(&recent_events_lock);
+ while ((entry = list_tail(&recent_events_list)) != NULL) {
+ uint64_t age = NSEC2SEC(now - entry->re_timestamp);
+ if (age <= zfs_zevent_retain_expire_secs)
+ break;
+
+ /* remove expired node */
+ avl_remove(&recent_events_tree, entry);
+ list_remove(&recent_events_list, entry);
+ kmem_free(entry, sizeof (*entry));
+ }
+
+ /* Restart the cleaner if more entries remain */
+ recent_events_cleaner_tqid = 0;
+ if (!list_is_empty(&recent_events_list))
+ zfs_ereport_schedule_cleaner();
+
+ mutex_exit(&recent_events_lock);
+}
+
+static void
+zfs_ereport_schedule_cleaner(void)
+{
+ ASSERT(MUTEX_HELD(&recent_events_lock));
+
+ uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);
+
+ recent_events_cleaner_tqid = taskq_dispatch_delay(
+ system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP,
+ ddi_get_lbolt() + NSEC_TO_TICK(timeout));
+}
+
+/*
+ * Check if an ereport would be a duplicate of one recently posted.
+ *
+ * An ereport is considered a duplicate if the set of criteria in
+ * recent_events_node_t all match.
+ *
+ * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
+ * are candidates for duplicate checking.
+ */
+static boolean_t
+zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
+ const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
+{
+ recent_events_node_t search = {0}, *entry;
+
+ if (vd == NULL || zio == NULL)
+ return (B_FALSE);
+
+ if (zfs_zevent_retain_max == 0)
+ return (B_FALSE);
+
+ if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
+ search.re_subclass = ZSC_IO;
+ else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
+ search.re_subclass = ZSC_DATA;
+ else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
+ search.re_subclass = ZSC_CHECKSUM;
+ else
+ return (B_FALSE);
+
+ search.re_pool_guid = spa_guid(spa);
+ search.re_vdev_guid = vd->vdev_guid;
+ search.re_io_error = zio->io_error;
+ search.re_io_priority = zio->io_priority;
+ /* if size is supplied use it over what's in zio */
+ if (size) {
+ search.re_io_size = size;
+ search.re_io_offset = offset;
+ } else {
+ search.re_io_size = zio->io_size;
+ search.re_io_offset = zio->io_offset;
+ }
+
+ /* grab optional logical zio criteria */
+ if (zb != NULL) {
+ search.re_io_bookmark.zb_objset = zb->zb_objset;
+ search.re_io_bookmark.zb_object = zb->zb_object;
+ search.re_io_bookmark.zb_level = zb->zb_level;
+ search.re_io_bookmark.zb_blkid = zb->zb_blkid;
+ }
+
+ uint64_t now = gethrtime();
+
+ mutex_enter(&recent_events_lock);
+
+ /* check if we have seen this one recently */
+ entry = avl_find(&recent_events_tree, &search, NULL);
+ if (entry != NULL) {
+ uint64_t age = NSEC2SEC(now - entry->re_timestamp);
+
+ /*
+ * There is still an active cleaner (since we're here).
+ * Reset the last seen time for this duplicate entry
+ * so that its lifespan gets extended.
+ */
+ list_remove(&recent_events_list, entry);
+ list_insert_head(&recent_events_list, entry);
+ entry->re_timestamp = now;
+
+ zfs_zevent_track_duplicate();
+ mutex_exit(&recent_events_lock);
+
+ return (age <= zfs_zevent_retain_expire_secs);
+ }
+
+ if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
+ /* recycle oldest node */
+ entry = list_tail(&recent_events_list);
+ ASSERT(entry != NULL);
+ list_remove(&recent_events_list, entry);
+ avl_remove(&recent_events_tree, entry);
+ } else {
+ entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
+ }
+
+ /* record this as a recent ereport */
+ *entry = search;
+ avl_add(&recent_events_tree, entry);
+ list_insert_head(&recent_events_list, entry);
+ entry->re_timestamp = now;
+
+ /* Start a cleaner if not already scheduled */
+ if (recent_events_cleaner_tqid == 0)
+ zfs_ereport_schedule_cleaner();
+
+ mutex_exit(&recent_events_lock);
+ return (B_FALSE);
+}
+
void
zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
{
@@ -153,9 +397,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
uint64_t ena;
char class[64];
- if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
- return (B_FALSE);
-
if ((ereport = fm_nvlist_create(NULL)) == NULL)
return (B_FALSE);
@@ -336,6 +577,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT64, zio->io_timestamp, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
DATA_TYPE_UINT64, zio->io_delta, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
+ DATA_TYPE_UINT32, zio->io_priority, NULL);
/*
* If the 'size' parameter is non-zero, it indicates this is a
@@ -788,24 +1031,34 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
}
/*
- * Return 0 if event was posted, EINVAL if there was a problem posting it or
- * EBUSY if the event was rate limited.
+ * Post an ereport for the given subclass
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
*/
int
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
- const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
- uint64_t size)
+ const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
{
int rc = 0;
#ifdef _KERNEL
nvlist_t *ereport = NULL;
nvlist_t *detector = NULL;
+ if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
+ return (EINVAL);
+
+ if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
+ return (SET_ERROR(EALREADY));
+
if (zfs_is_ratelimiting_event(subclass, vd))
return (SET_ERROR(EBUSY));
if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
- zb, zio, stateoroffset, size))
+ zb, zio, state, 0))
return (SET_ERROR(EINVAL)); /* couldn't post event */
if (ereport == NULL)
@@ -817,7 +1070,16 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
return (rc);
}
-void
+/*
+ * Prepare a checksum ereport
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
+ */
+int
zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length, void *arg,
zio_bad_cksum_t *info)
@@ -825,8 +1087,15 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
zio_cksum_report_t *report;
#ifdef _KERNEL
+ if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
+ offset, length))
+ return (SET_ERROR(EALREADY));
+
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
- return;
+ return (SET_ERROR(EBUSY));
#endif
report = kmem_zalloc(sizeof (*report), KM_SLEEP);
@@ -851,7 +1120,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
if (report->zcr_ereport == NULL) {
zfs_ereport_free_checksum(report);
- return;
+ return (0);
}
#endif
@@ -859,6 +1128,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
report->zcr_next = zio->io_logical->io_cksum_report;
zio->io_logical->io_cksum_report = report;
mutex_exit(&spa->spa_errlist_lock);
+ return (0);
}
void
@@ -901,7 +1171,15 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
kmem_free(rpt, sizeof (*rpt));
}
-
+/*
+ * Post a checksum ereport
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
+ */
int
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length,
@@ -913,8 +1191,15 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
nvlist_t *detector = NULL;
zfs_ecksum_info_t *info;
+ if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
+ offset, length))
+ return (SET_ERROR(EALREADY));
+
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
- return (EBUSY);
+ return (SET_ERROR(EBUSY));
if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
@@ -1073,11 +1358,57 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
#endif
}
-#if defined(_KERNEL)
+#ifdef _KERNEL
+void
+zfs_ereport_init(void)
+{
+ mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&recent_events_list, sizeof (recent_events_node_t),
+ offsetof(recent_events_node_t, re_list_link));
+ avl_create(&recent_events_tree, recent_events_compare,
+ sizeof (recent_events_node_t), offsetof(recent_events_node_t,
+ re_tree_link));
+}
+
+/*
+ * This 'early' fini needs to run before zfs_fini(), which on Linux
+ * waits for the system_delay_taskq to drain.
+ */
+void
+zfs_ereport_taskq_fini(void)
+{
+ mutex_enter(&recent_events_lock);
+ if (recent_events_cleaner_tqid != 0) {
+ taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
+ recent_events_cleaner_tqid = 0;
+ }
+ mutex_exit(&recent_events_lock);
+}
+
+void
+zfs_ereport_fini(void)
+{
+ recent_events_node_t *entry;
+
+ while ((entry = list_head(&recent_events_list)) != NULL) {
+ avl_remove(&recent_events_tree, entry);
+ list_remove(&recent_events_list, entry);
+ kmem_free(entry, sizeof (*entry));
+ }
+ avl_destroy(&recent_events_tree);
+ list_destroy(&recent_events_list);
+ mutex_destroy(&recent_events_lock);
+}
+
EXPORT_SYMBOL(zfs_ereport_post);
EXPORT_SYMBOL(zfs_ereport_is_valid);
EXPORT_SYMBOL(zfs_ereport_post_checksum);
EXPORT_SYMBOL(zfs_post_remove);
EXPORT_SYMBOL(zfs_post_autoreplace);
EXPORT_SYMBOL(zfs_post_state_change);
+
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
+ "Maximum recent zevents records to retain for duplicate checking");
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
+ "Expiration time for recent zevents records");
#endif /* _KERNEL */
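
The retention policy above keeps at most zfs_zevent_retain_max unique ereports, recycling the oldest entry when full and expiring entries after zfs_zevent_retain_expire_secs. A toy model of the capacity and recycling behavior follows; timestamp expiry and the AVL index are elided, and a linear scan stands in for avl_find().

#include <stdio.h>
#include <string.h>

#define RETAIN_MAX 3	/* tiny stand-in for zfs_zevent_retain_max */

static unsigned long long table[RETAIN_MAX];	/* oldest entry first */
static int n;

static int
is_duplicate(unsigned long long key)
{
	for (int i = 0; i < n; i++)
		if (table[i] == key)
			return (1);
	if (n == RETAIN_MAX) {	/* recycle the oldest slot */
		memmove(table, table + 1,
		    (RETAIN_MAX - 1) * sizeof (table[0]));
		n--;
	}
	table[n++] = key;
	return (0);
}

int
main(void)
{
	unsigned long long keys[] = { 1, 2, 1, 3, 4, 1 };

	/* The final key 1 posts again: its entry was recycled to make
	 * room for key 4 once the table filled. */
	for (int i = 0; i < 6; i++)
		printf("key %llu -> %s\n", keys[i],
		    is_duplicate(keys[i]) ? "EALREADY" : "posted");
	return (0);
}
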
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 495ff4707d77..eff66b32fcb1 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3511,30 +3511,29 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
/*
* This ioctl is used to set the bootenv configuration on the current
* pool. This configuration is stored in the second padding area of the label,
- * and it is used by the GRUB bootloader used on Linux to store the contents
- * of the grubenv file. The file is stored as raw ASCII, and is protected by
- * an embedded checksum. By default, GRUB will check if the boot filesystem
- * supports storing the environment data in a special location, and if so,
- * will invoke filesystem specific logic to retrieve it. This can be overridden
- * by a variable, should the user so desire.
+ * and it is used by the bootloader(s) to store bootloader- and/or
+ * system-specific data.
+ * The data is stored as an nvlist data stream and is protected by
+ * an embedded checksum.
+ * The version can have two possible values:
+ * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING.
+ * VB_NVLIST: nvlist with arbitrary <key, value> pairs.
*/
-/* ARGSUSED */
static const zfs_ioc_key_t zfs_keys_set_bootenv[] = {
- {"envmap", DATA_TYPE_STRING, 0},
+ {"version", DATA_TYPE_UINT64, 0},
+ {"<keys>", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST},
};
static int
zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
{
- char *envmap;
int error;
spa_t *spa;
- envmap = fnvlist_lookup_string(innvl, "envmap");
if ((error = spa_open(name, &spa, FTAG)) != 0)
return (error);
spa_vdev_state_enter(spa, SCL_ALL);
- error = vdev_label_write_bootenv(spa->spa_root_vdev, envmap);
+ error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl);
(void) spa_vdev_state_exit(spa, NULL, 0);
spa_close(spa, FTAG);
return (error);
@@ -3544,7 +3543,6 @@ static const zfs_ioc_key_t zfs_keys_get_bootenv[] = {
/* no nvl keys */
};
-/* ARGSUSED */
static int
zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
{
@@ -7615,6 +7613,7 @@ zfs_kmod_fini(void)
kmem_free(zs, sizeof (zfsdev_state_t));
}
+ zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */
zfs_fini();
spa_fini();
zvol_fini();
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index f956a9ef7621..8a8fbccd7d63 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Klara Inc.
@@ -547,7 +547,7 @@ error:
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(spa, &zio->io_bookmark);
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
- spa, NULL, &zio->io_bookmark, zio, 0, 0);
+ spa, NULL, &zio->io_bookmark, zio, 0);
}
} else {
zio->io_error = ret;
@@ -2004,7 +2004,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth)
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
pio->io_offset, pio->io_size, pio->io_error);
(void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
- pio->io_spa, vd, zb, pio, 0, 0);
+ pio->io_spa, vd, zb, pio, 0);
if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
taskq_empty_ent(&pio->io_tqent)) {
@@ -2331,7 +2331,7 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and has been suspended.\n", spa_name(spa));
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
- NULL, NULL, 0, 0);
+ NULL, NULL, 0);
mutex_enter(&spa->spa_suspend_lock);
@@ -4217,13 +4217,15 @@ zio_checksum_verify(zio_t *zio)
zio->io_error = error;
if (error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- mutex_enter(&zio->io_vd->vdev_stat_lock);
- zio->io_vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&zio->io_vd->vdev_stat_lock);
-
- zfs_ereport_start_checksum(zio->io_spa,
+ int ret = zfs_ereport_start_checksum(zio->io_spa,
zio->io_vd, &zio->io_bookmark, zio,
zio->io_offset, zio->io_size, NULL, &info);
+
+ if (ret != EALREADY) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+ }
}
}
@@ -4543,7 +4545,7 @@ zio_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
zio->io_spa, zio->io_vd, &zio->io_bookmark,
- zio, 0, 0);
+ zio, 0);
}
}
}
@@ -4557,16 +4559,16 @@ zio_done(zio_t *zio)
*/
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
!vdev_is_dead(zio->io_vd)) {
- mutex_enter(&zio->io_vd->vdev_stat_lock);
- if (zio->io_type == ZIO_TYPE_READ) {
- zio->io_vd->vdev_stat.vs_read_errors++;
- } else if (zio->io_type == ZIO_TYPE_WRITE) {
- zio->io_vd->vdev_stat.vs_write_errors++;
+ int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
+ if (ret != EALREADY) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ if (zio->io_type == ZIO_TYPE_READ)
+ zio->io_vd->vdev_stat.vs_read_errors++;
+ else if (zio->io_type == ZIO_TYPE_WRITE)
+ zio->io_vd->vdev_stat.vs_write_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
}
- mutex_exit(&zio->io_vd->vdev_stat_lock);
-
- (void) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
- zio->io_vd, &zio->io_bookmark, zio, 0, 0);
}
if ((zio->io_error == EIO || !(zio->io_flags &
@@ -4578,7 +4580,7 @@ zio_done(zio_t *zio)
*/
spa_log_error(zio->io_spa, &zio->io_bookmark);
(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
- zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0);
+ zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
}
}
diff --git a/module/zfs/zthr.c b/module/zfs/zthr.c
index fdc4b863382c..5ac2e30467e3 100644
--- a/module/zfs/zthr.c
+++ b/module/zfs/zthr.c
@@ -56,7 +56,7 @@
*
* == ZTHR creation
*
- * Every zthr needs three inputs to start running:
+ * Every zthr needs four inputs to start running:
*
* 1] A user-defined checker function (checkfunc) that decides whether
* the zthr should start working or go to sleep. The function should
@@ -72,6 +72,9 @@
* 3] A void args pointer that will be passed to checkfunc and func
* implicitly by the infrastructure.
*
+ * 4] A name for the thread. This string must be valid for the lifetime
+ * of the zthr.
+ *
* The reason why the above API needs two different functions,
* instead of one that both checks and does the work, has to do with
* the zthr's internal state lock (zthr_state_lock) and the allowed
@@ -221,6 +224,7 @@ struct zthr {
zthr_checkfunc_t *zthr_checkfunc;
zthr_func_t *zthr_func;
void *zthr_arg;
+ const char *zthr_name;
};
static void
@@ -237,15 +241,10 @@ zthr_procedure(void *arg)
t->zthr_func(t->zthr_arg, t);
mutex_enter(&t->zthr_state_lock);
} else {
- /*
- * cv_wait_sig() is used instead of cv_wait() in
- * order to prevent this process from incorrectly
- * contributing to the system load average when idle.
- */
if (t->zthr_sleep_timeout == 0) {
- cv_wait_sig(&t->zthr_cv, &t->zthr_state_lock);
+ cv_wait_idle(&t->zthr_cv, &t->zthr_state_lock);
} else {
- (void) cv_timedwait_sig_hires(&t->zthr_cv,
+ (void) cv_timedwait_idle_hires(&t->zthr_cv,
&t->zthr_state_lock, t->zthr_sleep_timeout,
MSEC2NSEC(1), 0);
}
@@ -296,6 +295,7 @@ zthr_create_timer(const char *zthr_name, zthr_checkfunc_t *checkfunc,
t->zthr_func = func;
t->zthr_arg = arg;
t->zthr_sleep_timeout = max_sleep;
+ t->zthr_name = zthr_name;
t->zthr_thread = thread_create_named(zthr_name, NULL, 0,
zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri);
@@ -422,8 +422,8 @@ zthr_resume(zthr_t *t)
* no-op.
*/
if (t->zthr_thread == NULL) {
- t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
- 0, &p0, TS_RUN, minclsyspri);
+ t->zthr_thread = thread_create_named(t->zthr_name, NULL, 0,
+ zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri);
}
mutex_exit(&t->zthr_state_lock);
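
With the name now stored in the zthr, both initial creation and zthr_resume() can name the kernel thread. A hedged kernel-context sketch of the four creation inputs the updated comment lists; the worker and predicate helpers and the spa field are hypothetical, and the name literal satisfies the new lifetime requirement by being static.

/* Hypothetical example; signatures assumed from zthr.h at this revision. */
static boolean_t
mywork_checkfunc(void *arg, zthr_t *t)
{
	spa_t *spa = arg;

	(void) t;
	return (spa_mywork_pending(spa));	/* hypothetical predicate */
}

static void
mywork_func(void *arg, zthr_t *t)
{
	spa_t *spa = arg;

	/* Check for cancellation between units of work. */
	while (!zthr_iscancelled(t)) {
		if (spa_mywork_one_step(spa) == 0)	/* hypothetical */
			break;
	}
}

/* 1] checker, 2] worker, 3] arg, 4] name (a string literal, so it
 * outlives the zthr as the updated comment requires). */
spa->spa_mywork_zthr = zthr_create("z_mywork",
    mywork_checkfunc, mywork_func, spa);
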