author     Matt Macy <mmacy@FreeBSD.org>  2020-09-18 22:55:05 +0000
committer  Matt Macy <mmacy@FreeBSD.org>  2020-09-18 22:55:05 +0000
commit     04bab0082226f7e72c41ad528298c43edc3fa316 (patch)
tree       67433d716d54ef1de3f4b31a01227792c48506c2 /module/zfs
parent     b0a96e5e2d3c9480ec89dd4c034c7fe4f97abfe1 (diff)
Update openzfs to 2.0.0-rc2-g4ce06f (tag: vendor/openzfs/2.0-rc2-g4ce06f)
Notes:
    svn path=/vendor-sys/openzfs/dist/; revision=365892
    svn path=/vendor-sys/openzfs/2.0-rc2-g4ce06f/; revision=365893; tag=vendor/openzfs/2.0-rc2-g4ce06f
Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/arc.c                30
-rw-r--r--  module/zfs/dbuf.c                2
-rw-r--r--  module/zfs/dmu_redact.c         33
-rw-r--r--  module/zfs/dnode.c              25
-rw-r--r--  module/zfs/dsl_scan.c            1
-rw-r--r--  module/zfs/dsl_synctask.c       16
-rw-r--r--  module/zfs/fm.c                 14
-rw-r--r--  module/zfs/metaslab.c            1
-rw-r--r--  module/zfs/mmp.c                22
-rw-r--r--  module/zfs/range_tree.c          1
-rw-r--r--  module/zfs/spa.c                42
-rw-r--r--  module/zfs/spa_config.c          8
-rw-r--r--  module/zfs/spa_history.c         5
-rw-r--r--  module/zfs/txg.c                12
-rw-r--r--  module/zfs/vdev.c              103
-rw-r--r--  module/zfs/vdev_indirect.c      16
-rw-r--r--  module/zfs/vdev_initialize.c     5
-rw-r--r--  module/zfs/vdev_label.c        114
-rw-r--r--  module/zfs/vdev_mirror.c         2
-rw-r--r--  module/zfs/vdev_raidz.c         35
-rw-r--r--  module/zfs/vdev_rebuild.c       14
-rw-r--r--  module/zfs/vdev_removal.c        9
-rw-r--r--  module/zfs/vdev_trim.c           7
-rw-r--r--  module/zfs/zfs_fm.c            361
-rw-r--r--  module/zfs/zfs_ioctl.c          23
-rw-r--r--  module/zfs/zio.c                42
-rw-r--r--  module/zfs/zthr.c               20
27 files changed, 714 insertions(+), 249 deletions(-)
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 904c325f37a1..7a499298f75c 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, Joyent, Inc.
- * Copyright (c) 2011, 2019, Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020, Delphix. All rights reserved.
* Copyright (c) 2014, Saso Kiselkov. All rights reserved.
* Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
@@ -895,6 +895,12 @@ static void l2arc_read_done(zio_t *);
static void l2arc_do_free_on_write(void);
/*
+ * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
+ * metadata and data are cached from ARC into L2ARC.
+ */
+int l2arc_mfuonly = 0;
+
+/*
* L2ARC TRIM
* l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
* the current write size (l2arc_write_max) we should TRIM if we
@@ -2188,7 +2194,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
ret = SET_ERROR(EIO);
spa_log_error(spa, zb);
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
- spa, NULL, zb, NULL, 0, 0);
+ spa, NULL, zb, NULL, 0);
}
return (ret);
@@ -5654,7 +5660,7 @@ arc_read_done(zio_t *zio)
spa_log_error(zio->io_spa, &acb->acb_zb);
(void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION,
- zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0);
+ zio->io_spa, NULL, &acb->acb_zb, zio, 0);
}
}
@@ -5931,7 +5937,7 @@ top:
spa_log_error(spa, zb);
(void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION,
- spa, NULL, zb, NULL, 0, 0);
+ spa, NULL, zb, NULL, 0);
}
}
if (rc != 0) {
@@ -8909,6 +8915,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
* Copy buffers for L2ARC writing.
*/
for (int try = 0; try < L2ARC_FEED_TYPES; try++) {
+ /*
+ * try == 1 and try == 3 are the passes that would cache MRU
+ * metadata and data, respectively; skip them when only MFU
+ * content should reach L2ARC.
+ */
+ if (l2arc_mfuonly) {
+ if (try == 1 || try == 3)
+ continue;
+ }
+
multilist_sublist_t *mls = l2arc_sublist_lock(try);
uint64_t passed_sz = 0;
@@ -9174,7 +9189,7 @@ l2arc_feed_thread(void *unused)
cookie = spl_fstrans_mark();
while (l2arc_thread_exit == 0) {
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait_sig(&l2arc_feed_thr_cv,
+ (void) cv_timedwait_idle(&l2arc_feed_thr_cv,
&l2arc_feed_thr_lock, next);
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
next = ddi_get_lbolt() + hz;
@@ -9291,8 +9306,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
ASSERT(!l2arc_vdev_present(vd));
- vdev_ashift_optimize(vd);
-
/*
* Create a new l2arc device entry.
*/
@@ -10562,6 +10575,9 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW,
"Min size in bytes to write rebuild log blocks in L2ARC");
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
+ "Cache only MFU data from ARC into L2ARC");
+
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes");
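
The l2arc_mfuonly hunks above gate two of the four L2ARC feed passes. Below is a minimal userland sketch of that skip logic; the mapping of pass indices to ARC lists (0/2 = MFU, 1/3 = MRU) is inferred from the new comment in l2arc_write_buffers(), not taken from l2arc_sublist_lock() itself.

#include <stdio.h>

/* Stand-ins for the four l2arc_sublist_lock() feed passes; the
 * even/odd MFU/MRU split is an assumption based on the comment above. */
static const char *feed_pass[] = {
	"MFU metadata", "MRU metadata", "MFU data", "MRU data"
};

static int l2arc_mfuonly = 1;	/* models the new module parameter */

int
main(void)
{
	for (int try = 0; try < 4; try++) {
		if (l2arc_mfuonly && (try == 1 || try == 3))
			continue;	/* skip the MRU passes */
		printf("feeding L2ARC from %s\n", feed_pass[try]);
	}
	return (0);
}
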
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 2de1f4e4c267..7d817320aae4 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -718,7 +718,7 @@ dbuf_evict_thread(void *unused)
while (!dbuf_evict_thread_exit) {
while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait_sig_hires(&dbuf_evict_cv,
+ (void) cv_timedwait_idle_hires(&dbuf_evict_cv,
&dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
}
diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c
index df10d8d6faae..c53fba75cc51 100644
--- a/module/zfs/dmu_redact.c
+++ b/module/zfs/dmu_redact.c
@@ -568,8 +568,7 @@ commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object,
uint64_t txg = dmu_tx_get_txg(tx);
if (!md->md_synctask_txg[txg & TXG_MASK]) {
dsl_sync_task_nowait(dmu_tx_pool(tx),
- redaction_list_update_sync, md, 5, ZFS_SPACE_CHECK_NONE,
- tx);
+ redaction_list_update_sync, md, tx);
md->md_synctask_txg[txg & TXG_MASK] = B_TRUE;
md->md_latest_synctask_txg = txg;
}
@@ -1007,10 +1006,14 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
objset_t *os;
struct redact_thread_arg *args = NULL;
redaction_list_t *new_rl = NULL;
+ char *newredactbook;
if ((err = dsl_pool_hold(snapname, FTAG, &dp)) != 0)
return (err);
+ newredactbook = kmem_zalloc(sizeof (char) * ZFS_MAX_DATASET_NAME_LEN,
+ KM_SLEEP);
+
if ((err = dsl_dataset_hold_flags(dp, snapname, DS_HOLD_FLAG_DECRYPT,
FTAG, &ds)) != 0) {
goto out;
@@ -1064,7 +1067,6 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
goto out;
boolean_t resuming = B_FALSE;
- char newredactbook[ZFS_MAX_DATASET_NAME_LEN];
zfs_bookmark_phys_t bookmark;
(void) strlcpy(newredactbook, snapname, ZFS_MAX_DATASET_NAME_LEN);
@@ -1074,6 +1076,10 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
"#%s", redactbook);
if (n >= ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook)) {
dsl_pool_rele(dp, FTAG);
+ kmem_free(newredactbook,
+ sizeof (char) * ZFS_MAX_DATASET_NAME_LEN);
+ if (args != NULL)
+ kmem_free(args, numsnaps * sizeof (*args));
return (SET_ERROR(ENAMETOOLONG));
}
err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark);
@@ -1146,16 +1152,23 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
(void) thread_create(NULL, 0, redact_traverse_thread, rta,
0, curproc, TS_RUN, minclsyspri);
}
- struct redact_merge_thread_arg rmta = { { {0} } };
- (void) bqueue_init(&rmta.q, zfs_redact_queue_ff,
+
+ struct redact_merge_thread_arg *rmta;
+ rmta = kmem_zalloc(sizeof (struct redact_merge_thread_arg), KM_SLEEP);
+
+ (void) bqueue_init(&rmta->q, zfs_redact_queue_ff,
zfs_redact_queue_length, offsetof(struct redact_record, ln));
- rmta.numsnaps = numsnaps;
- rmta.spa = os->os_spa;
- rmta.thr_args = args;
- (void) thread_create(NULL, 0, redact_merge_thread, &rmta, 0, curproc,
+ rmta->numsnaps = numsnaps;
+ rmta->spa = os->os_spa;
+ rmta->thr_args = args;
+ (void) thread_create(NULL, 0, redact_merge_thread, rmta, 0, curproc,
TS_RUN, minclsyspri);
- err = perform_redaction(os, new_rl, &rmta);
+ err = perform_redaction(os, new_rl, rmta);
+ kmem_free(rmta, sizeof (struct redact_merge_thread_arg));
+
out:
+ kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN);
+
if (new_rl != NULL) {
dsl_redaction_list_long_rele(new_rl, FTAG);
dsl_redaction_list_rele(new_rl, FTAG);
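
The dmu_redact.c changes, like the dnode.c and spa.c ones below, share a theme: sizable objects move off the kernel stack onto the heap, with the allocation released on every exit path. A hedged userland sketch of the pattern follows, with kmem_zalloc()/kmem_free() shimmed onto libc and the bookmark-name construction simplified for illustration.

#include <stdio.h>
#include <stdlib.h>

/* Userland stand-ins for the kernel allocator used in the patch. */
#define KM_SLEEP 0
#define ZFS_MAX_DATASET_NAME_LEN 256
static void *kmem_zalloc(size_t sz, int fl) { (void)fl; return calloc(1, sz); }
static void kmem_free(void *p, size_t sz) { (void)sz; free(p); }

/* The pattern from dmu_redact_snap() above: the ~256-byte name buffer
 * lives on the heap instead of the stack and is freed on every path. */
static int
build_bookmark_name(const char *snapname, const char *redactbook)
{
	int err = 0;
	char *newredactbook = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);

	if (snprintf(newredactbook, ZFS_MAX_DATASET_NAME_LEN, "%s#%s",
	    snapname, redactbook) >= ZFS_MAX_DATASET_NAME_LEN)
		err = -1;	/* stands in for SET_ERROR(ENAMETOOLONG) */
	else
		printf("bookmark: %s\n", newredactbook);

	kmem_free(newredactbook, ZFS_MAX_DATASET_NAME_LEN);
	return (err);
}

int
main(void)
{
	return (build_bookmark_name("pool/fs@snap", "rb") != 0);
}
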
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 00536f2774e7..30d20bfefa12 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1197,7 +1197,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
dnode_t *dn;
zrl_init(&dnh->dnh_zrlock);
- zrl_tryenter(&dnh->dnh_zrlock);
+ VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));
dn = dnode_create(os, dnp, NULL, object, dnh);
DNODE_VERIFY(dn);
@@ -1949,18 +1949,20 @@ static void
dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
dmu_tx_t *tx)
{
- dmu_buf_impl_t db_search;
+ dmu_buf_impl_t *db_search;
dmu_buf_impl_t *db;
avl_index_t where;
+ db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+
mutex_enter(&dn->dn_dbufs_mtx);
- db_search.db_level = 1;
- db_search.db_blkid = start_blkid + 1;
- db_search.db_state = DB_SEARCH;
+ db_search->db_level = 1;
+ db_search->db_blkid = start_blkid + 1;
+ db_search->db_state = DB_SEARCH;
for (;;) {
- db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ db = avl_find(&dn->dn_dbufs, db_search, &where);
if (db == NULL)
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
@@ -1972,7 +1974,7 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
/*
* Setup the next blkid we want to search for.
*/
- db_search.db_blkid = db->db_blkid + 1;
+ db_search->db_blkid = db->db_blkid + 1;
ASSERT3U(db->db_blkid, >=, start_blkid);
/*
@@ -1992,10 +1994,10 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
/*
* Walk all the in-core level-1 dbufs and verify they have been dirtied.
*/
- db_search.db_level = 1;
- db_search.db_blkid = start_blkid + 1;
- db_search.db_state = DB_SEARCH;
- db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ db_search->db_level = 1;
+ db_search->db_blkid = start_blkid + 1;
+ db_search->db_state = DB_SEARCH;
+ db = avl_find(&dn->dn_dbufs, db_search, &where);
if (db == NULL)
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
@@ -2005,6 +2007,7 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
ASSERT(db->db_dirtycnt > 0);
}
#endif
+ kmem_free(db_search, sizeof (dmu_buf_impl_t));
mutex_exit(&dn->dn_dbufs_mtx);
}
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 712af664e90f..0ebda2f77074 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -23,6 +23,7 @@
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2016 Gary Mills
* Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
* Copyright 2019 Joyent, Inc.
*/
diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c
index 2d6ca8549eb9..148e8fff2437 100644
--- a/module/zfs/dsl_synctask.c
+++ b/module/zfs/dsl_synctask.c
@@ -170,15 +170,13 @@ dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc,
static void
dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx,
- boolean_t early)
+ dmu_tx_t *tx, boolean_t early)
{
dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
dst->dst_pool = dp;
dst->dst_txg = dmu_tx_get_txg(tx);
- dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT;
- dst->dst_space_check = space_check;
+ dst->dst_space_check = ZFS_SPACE_CHECK_NONE;
dst->dst_checkfunc = dsl_null_checkfunc;
dst->dst_syncfunc = syncfunc;
dst->dst_arg = arg;
@@ -192,18 +190,16 @@ dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
void
dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+ dmu_tx_t *tx)
{
- dsl_sync_task_nowait_common(dp, syncfunc, arg,
- blocks_modified, space_check, tx, B_FALSE);
+ dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_FALSE);
}
void
dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+ dmu_tx_t *tx)
{
- dsl_sync_task_nowait_common(dp, syncfunc, arg,
- blocks_modified, space_check, tx, B_TRUE);
+ dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_TRUE);
}
/*
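
dsl_sync_task_nowait() and dsl_early_sync_task_nowait() lose their blocks_modified and space_check parameters; the nowait path now hard-codes ZFS_SPACE_CHECK_NONE internally. A hedged sketch of the new calling convention, with the DSL types shimmed so the example stands alone:

#include <stdio.h>

/* Userland shims; these are stand-ins, not the real DSL types. */
typedef struct dsl_pool dsl_pool_t;
typedef struct dmu_tx dmu_tx_t;
typedef void dsl_syncfunc_t(void *arg, dmu_tx_t *tx);

/* New, reduced signature: the block estimate and space check are gone;
 * a real pool would queue the task to run in syncing context. */
static void
dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
    dmu_tx_t *tx)
{
	(void)dp;
	syncfunc(arg, tx);
}

static void
my_update_sync(void *arg, dmu_tx_t *tx)
{
	(void)tx;
	printf("sync task ran with arg %s\n", (const char *)arg);
}

int
main(void)
{
	dsl_sync_task_nowait(NULL, my_update_sync, "hello", NULL);
	return (0);
}
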
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index c00e08b8d02a..a5003f85d621 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -104,13 +104,15 @@ struct erpt_kstat {
kstat_named_t erpt_set_failed; /* num erpt set failures */
kstat_named_t fmri_set_failed; /* num fmri set failures */
kstat_named_t payload_set_failed; /* num payload set failures */
+ kstat_named_t erpt_duplicates; /* num duplicate erpts */
};
static struct erpt_kstat erpt_kstat_data = {
{ "erpt-dropped", KSTAT_DATA_UINT64 },
{ "erpt-set-failed", KSTAT_DATA_UINT64 },
{ "fmri-set-failed", KSTAT_DATA_UINT64 },
- { "payload-set-failed", KSTAT_DATA_UINT64 }
+ { "payload-set-failed", KSTAT_DATA_UINT64 },
+ { "erpt-duplicates", KSTAT_DATA_UINT64 }
};
kstat_t *fm_ksp;
@@ -568,6 +570,12 @@ out:
return (error);
}
+void
+zfs_zevent_track_duplicate(void)
+{
+ atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64);
+}
+
static int
zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze)
{
@@ -1633,6 +1641,8 @@ fm_init(void)
list_create(&zevent_list, sizeof (zevent_t),
offsetof(zevent_t, ev_node));
cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL);
+
+ zfs_ereport_init();
}
void
@@ -1640,6 +1650,8 @@ fm_fini(void)
{
int count;
+ zfs_ereport_fini();
+
zfs_zevent_drain_all(&count);
mutex_enter(&zevent_lock);
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index ccc247d1557a..133005b227e5 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
*/
diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c
index 4170d7e03ebd..99852521b6d1 100644
--- a/module/zfs/mmp.c
+++ b/module/zfs/mmp.c
@@ -198,14 +198,6 @@ mmp_init(spa_t *spa)
cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
mmp->mmp_kstat_id = 1;
-
- /*
- * mmp_write_done() calculates mmp_delay based on prior mmp_delay and
- * the elapsed time since the last write. For the first mmp write,
- * there is no "last write", so we start with fake non-zero values.
- */
- mmp->mmp_last_write = gethrtime();
- mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval));
}
void
@@ -557,6 +549,18 @@ mmp_thread(void *arg)
mmp_thread_enter(mmp, &cpr);
+ /*
+ * There have been no MMP writes yet. Setting mmp_last_write here gives
+ * us one mmp_fail_ns period, which is consistent with the activity
+ * check duration, to try to land an MMP write before MMP suspends the
+ * pool (if so configured).
+ */
+
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_last_write = gethrtime();
+ mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval));
+ mutex_exit(&mmp->mmp_io_lock);
+
while (!mmp->mmp_thread_exiting) {
hrtime_t next_time = gethrtime() +
MSEC2NSEC(MMP_DEFAULT_INTERVAL);
@@ -671,7 +675,7 @@ mmp_thread(void *arg)
}
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv,
+ (void) cv_timedwait_idle_hires(&mmp->mmp_thread_cv,
&mmp->mmp_thread_lock, next_time, USEC2NSEC(100),
CALLOUT_FLAG_ABSOLUTE);
CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c
index 2c0e4b860a04..2ce0139c9137 100644
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@@ -24,6 +24,7 @@
*/
/*
* Copyright (c) 2013, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index aac469f44b59..532f04b91ca1 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -1000,13 +1000,25 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
/*
* The write issue taskq can be extremely CPU
* intensive. Run it at slightly less important
- * priority than the other taskqs. Under Linux this
- * means incrementing the priority value on platforms
- * like illumos it should be decremented.
+ * priority than the other taskqs.
+ *
+ * Under Linux and FreeBSD this means incrementing
+ * the priority value as opposed to platforms like
+ * illumos where it should be decremented.
+ *
+ * On FreeBSD, if priorities divided by four (RQ_PPQ)
+ * are equal then a difference between them is
+ * insignificant.
*/
- if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
+ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
+#if defined(__linux__)
pri++;
-
+#elif defined(__FreeBSD__)
+ pri += 4;
+#else
+#error "unknown OS"
+#endif
+ }
tq = taskq_create_proc(name, value, pri, 50,
INT_MAX, spa->spa_proc, flags);
}
@@ -2485,11 +2497,12 @@ spa_livelist_delete_cb(void *arg, zthr_t *z)
VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
VERIFY0(zap_count(mos, ll_obj, &count));
if (count > 0) {
- dsl_deadlist_t ll = { 0 };
+ dsl_deadlist_t *ll;
dsl_deadlist_entry_t *dle;
bplist_t to_free;
- dsl_deadlist_open(&ll, mos, ll_obj);
- dle = dsl_deadlist_first(&ll);
+ ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
+ dsl_deadlist_open(ll, mos, ll_obj);
+ dle = dsl_deadlist_first(ll);
ASSERT3P(dle, !=, NULL);
bplist_create(&to_free);
int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
@@ -2497,7 +2510,7 @@ spa_livelist_delete_cb(void *arg, zthr_t *z)
if (err == 0) {
sublist_delete_arg_t sync_arg = {
.spa = spa,
- .ll = &ll,
+ .ll = ll,
.key = dle->dle_mintxg,
.to_free = &to_free
};
@@ -2512,7 +2525,8 @@ spa_livelist_delete_cb(void *arg, zthr_t *z)
}
bplist_clear(&to_free);
bplist_destroy(&to_free);
- dsl_deadlist_close(&ll);
+ dsl_deadlist_close(ll);
+ kmem_free(ll, sizeof (dsl_deadlist_t));
} else {
livelist_delete_arg_t sync_arg = {
.spa = spa,
@@ -2688,8 +2702,7 @@ spa_livelist_condense_cb(void *arg, zthr_t *t)
lca->first_size = first_size;
lca->next_size = next_size;
dsl_sync_task_nowait(spa_get_dsl(spa),
- spa_livelist_condense_sync, lca, 0,
- ZFS_SPACE_CHECK_NONE, tx);
+ spa_livelist_condense_sync, lca, tx);
dmu_tx_commit(tx);
return;
}
@@ -2869,7 +2882,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
}
if (error != EBADF) {
(void) zfs_ereport_post(ereport, spa,
- NULL, NULL, NULL, 0, 0);
+ NULL, NULL, NULL, 0);
}
}
spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
@@ -5749,7 +5762,6 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
- vdev_ashift_optimize(vd);
vdev_metaslab_set_size(vd);
vdev_expand(vd, txg);
}
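
The taskq-priority hunk above notes that on FreeBSD a difference smaller than RQ_PPQ is insignificant. A small sketch of why: FreeBSD groups priorities into run queues of four, so only a bump of RQ_PPQ moves a thread into a different queue. The value 4 mirrors the patch; the base priority below is hypothetical.

#include <stdio.h>

#define RQ_PPQ 4	/* FreeBSD: priorities per run queue */

int
main(void)
{
	int pri = 80;	/* hypothetical base taskq priority */

	/* pri + 1 maps to the same run queue, so the write-issue taskq
	 * bumps by RQ_PPQ on FreeBSD and by 1 on Linux. */
	printf("base queue %d, +1 queue %d, +4 queue %d\n",
	    pri / RQ_PPQ, (pri + 1) / RQ_PPQ, (pri + RQ_PPQ) / RQ_PPQ);
	return (0);
}
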
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index b98b7badbae1..dacba127dcfa 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc.
*/
@@ -316,7 +316,7 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
if (target->spa_ccw_fail_time == 0) {
(void) zfs_ereport_post(
FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
- target, NULL, NULL, NULL, 0, 0);
+ target, NULL, NULL, NULL, 0);
}
target->spa_ccw_fail_time = gethrtime();
spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
@@ -577,10 +577,8 @@ spa_config_update(spa_t *spa, int what)
(tvd->vdev_islog && tvd->vdev_removing))
continue;
- if (tvd->vdev_ms_array == 0) {
- vdev_ashift_optimize(tvd);
+ if (tvd->vdev_ms_array == 0)
vdev_metaslab_set_size(tvd);
- }
vdev_expand(tvd, txg);
}
}
diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c
index f47adb94d55b..2ab58815400a 100644
--- a/module/zfs/spa_history.c
+++ b/module/zfs/spa_history.c
@@ -397,8 +397,7 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED()));
/* Kick this off asynchronously; errors are ignored. */
- dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
- nvarg, 0, ZFS_SPACE_CHECK_NONE, tx);
+ dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvarg, tx);
dmu_tx_commit(tx);
/* spa_history_log_sync will free nvl */
@@ -532,7 +531,7 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
spa_history_log_sync(nvl, tx);
} else {
dsl_sync_task_nowait(spa_get_dsl(spa),
- spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx);
+ spa_history_log_sync, nvl, tx);
}
/* spa_history_log_sync() will free nvl */
}
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index a5f2b041737b..65375b579da6 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -242,16 +242,11 @@ txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
{
CALLB_CPR_SAFE_BEGIN(cpr);
- /*
- * cv_wait_sig() is used instead of cv_wait() in order to prevent
- * this process from incorrectly contributing to the system load
- * average when idle.
- */
if (time) {
- (void) cv_timedwait_sig(cv, &tx->tx_sync_lock,
+ (void) cv_timedwait_idle(cv, &tx->tx_sync_lock,
ddi_get_lbolt() + time);
} else {
- cv_wait_sig(cv, &tx->tx_sync_lock);
+ cv_wait_idle(cv, &tx->tx_sync_lock);
}
CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
@@ -760,7 +755,8 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
if (should_quiesce == B_TRUE) {
cv_wait_io(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
} else {
- cv_wait_sig(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+ cv_wait_idle(&tx->tx_quiesce_done_cv,
+ &tx->tx_sync_lock);
}
}
mutex_exit(&tx->tx_sync_lock);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 95a2f5947db1..a94101485c94 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1481,7 +1481,7 @@ vdev_probe_done(zio_t *zio)
ASSERT(zio->io_error != 0);
vdev_dbgmsg(vd, "failed probe");
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
- spa, vd, NULL, NULL, 0, 0);
+ spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO);
}
@@ -1673,6 +1673,38 @@ vdev_set_deflate_ratio(vdev_t *vd)
}
/*
+ * Maximize performance by inflating the configured ashift for top level
+ * vdevs to be as close to the physical ashift as possible while maintaining
+ * administrator defined limits and ensuring it doesn't go below the
+ * logical ashift.
+ */
+static void
+vdev_ashift_optimize(vdev_t *vd)
+{
+ ASSERT(vd == vd->vdev_top);
+
+ if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+ vd->vdev_ashift = MIN(
+ MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
+ MAX(zfs_vdev_min_auto_ashift,
+ vd->vdev_physical_ashift));
+ } else {
+ /*
+ * If the logical and physical ashifts are the same, then
+ * we ensure that the top-level vdev's ashift is not smaller
+ * than our minimum ashift value. For the unusual case
+ * where logical ashift > physical ashift, we can't cap
+ * the calculated ashift based on max ashift as that
+ * would cause failures.
+ * We still check if we need to increase it to match
+ * the min ashift.
+ */
+ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
+ vd->vdev_ashift);
+ }
+}
+
+/*
* Prepare a virtual device for access.
*/
int
@@ -1830,16 +1862,17 @@ vdev_open(vdev_t *vd)
return (SET_ERROR(EINVAL));
}
+ /*
+ * We can always set the logical/physical ashift members since
+ * their values are only used to calculate the vdev_ashift when
+ * the device is first added to the config. These values should
+ * not be used for anything else since they may change whenever
+ * the device is reopened and we don't store them in the label.
+ */
vd->vdev_physical_ashift =
MAX(physical_ashift, vd->vdev_physical_ashift);
- vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
- vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
-
- if (vd->vdev_logical_ashift > ASHIFT_MAX) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_ASHIFT_TOO_BIG);
- return (SET_ERROR(EDOM));
- }
+ vd->vdev_logical_ashift = MAX(logical_ashift,
+ vd->vdev_logical_ashift);
if (vd->vdev_asize == 0) {
/*
@@ -1848,6 +1881,24 @@ vdev_open(vdev_t *vd)
*/
vd->vdev_asize = asize;
vd->vdev_max_asize = max_asize;
+
+ /*
+ * If the vdev_ashift was not overridden at creation time,
+ * then set it to the logical ashift and optimize the ashift.
+ */
+ if (vd->vdev_ashift == 0) {
+ vd->vdev_ashift = vd->vdev_logical_ashift;
+
+ if (vd->vdev_logical_ashift > ASHIFT_MAX) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_ASHIFT_TOO_BIG);
+ return (SET_ERROR(EDOM));
+ }
+
+ if (vd->vdev_top == vd) {
+ vdev_ashift_optimize(vd);
+ }
+ }
if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
vd->vdev_ashift > ASHIFT_MAX)) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -1862,11 +1913,10 @@ vdev_open(vdev_t *vd)
vd->vdev_ops->vdev_op_leaf) {
(void) zfs_ereport_post(
FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
- spa, vd, NULL, NULL, 0, 0);
+ spa, vd, NULL, NULL, 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (SET_ERROR(EDOM));
-
}
vd->vdev_max_asize = max_asize;
}
@@ -2445,35 +2495,6 @@ vdev_metaslab_set_size(vdev_t *vd)
ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
}
-/*
- * Maximize performance by inflating the configured ashift for top level
- * vdevs to be as close to the physical ashift as possible while maintaining
- * administrator defined limits and ensuring it doesn't go below the
- * logical ashift.
- */
-void
-vdev_ashift_optimize(vdev_t *vd)
-{
- if (vd == vd->vdev_top) {
- if (vd->vdev_ashift < vd->vdev_physical_ashift) {
- vd->vdev_ashift = MIN(
- MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
- MAX(zfs_vdev_min_auto_ashift,
- vd->vdev_physical_ashift));
- } else {
- /*
- * Unusual case where logical ashift > physical ashift
- * so we can't cap the calculated ashift based on max
- * ashift as that would cause failures.
- * We still check if we need to increase it to match
- * the min ashift.
- */
- vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
- vd->vdev_ashift);
- }
- }
-}
-
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
@@ -4759,7 +4780,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
}
(void) zfs_ereport_post(class, spa, vd, NULL, NULL,
- save_state, 0);
+ save_state);
}
/* Erase any notion of persistent removed state */
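
vdev_ashift_optimize() is now static, asserted to run only on top-level vdevs, and called once from vdev_open() when no ashift was supplied at creation. A standalone restatement of its clamp math follows; the tunable defaults here (9 and 16) are this sketch's assumptions, not values from the diff.

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Assumed defaults for the auto-ashift tunables. */
static unsigned long zfs_vdev_min_auto_ashift = 9;	/* 512 B */
static unsigned long zfs_vdev_max_auto_ashift = 16;	/* 64 KiB */

/* Restates vdev_ashift_optimize() from the hunk above. */
static unsigned long
ashift_optimize(unsigned long ashift, unsigned long physical_ashift)
{
	if (ashift < physical_ashift) {
		return (MIN(MAX(zfs_vdev_max_auto_ashift, ashift),
		    MAX(zfs_vdev_min_auto_ashift, physical_ashift)));
	}
	/* logical >= physical: only enforce the minimum. */
	return (MAX(zfs_vdev_min_auto_ashift, ashift));
}

int
main(void)
{
	/* 512e disk: logical 512 (ashift 9), physical 4K (ashift 12). */
	printf("512e disk -> ashift %lu\n", ashift_optimize(9, 12));
	/* Native 4Kn disk: logical == physical == ashift 12. */
	printf("4Kn disk  -> ashift %lu\n", ashift_optimize(12, 12));
	return (0);
}
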
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 6bc2d917d59c..12ee393bd5db 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -16,7 +16,7 @@
/*
* Copyright (c) 2014, 2017 by Delphix. All rights reserved.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
- * Copyright (c) 2014, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2020 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -576,8 +576,7 @@ spa_condense_indirect_commit_entry(spa_t *spa,
*/
if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
dsl_sync_task_nowait(dmu_tx_pool(tx),
- spa_condense_indirect_commit_sync, sci,
- 0, ZFS_SPACE_CHECK_NONE, tx);
+ spa_condense_indirect_commit_sync, sci, tx);
}
vdev_indirect_mapping_entry_t *vime =
@@ -1474,13 +1473,14 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
vdev_t *vd = ic->ic_vdev;
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
- (void) zfs_ereport_post_checksum(zio->io_spa, vd,
+ int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
NULL, zio, is->is_target_offset, is->is_size,
NULL, NULL, NULL);
+ if (ret != EALREADY) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
}
}
}
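
This hunk, and the matching ones in vdev_raidz.c, zio.c, and zfs_fm.c, all adopt one idiom: post the ereport first and bump the per-vdev error counters only when the post did not return EALREADY, so duplicate-suppressed reports no longer inflate the stats. A toy userland model of the caller side:

#include <errno.h>
#include <stdio.h>

/* Stand-in: returns EALREADY for a repeated (vdev guid, offset) pair,
 * imitating the duplicate detection added in zfs_fm.c below. */
static int
post_checksum_ereport(unsigned long long guid, unsigned long long off)
{
	static unsigned long long last_guid, last_off;
	int dup = (guid == last_guid && off == last_off);

	last_guid = guid;
	last_off = off;
	return (dup ? EALREADY : 0);
}

int
main(void)
{
	unsigned long long errors = 0;

	for (int i = 0; i < 3; i++) {
		/* The same error reported three times bumps the
		 * counter only once. */
		if (post_checksum_ereport(0xabcdULL, 4096) != EALREADY)
			errors++;
	}
	printf("vs_checksum_errors = %llu\n", errors);	/* prints 1 */
	return (0);
}
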
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index ab711441d9ca..7ff7fffcc80e 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -126,7 +126,7 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
- guid, 2, ZFS_SPACE_CHECK_NONE, tx);
+ guid, tx);
switch (new_state) {
case VDEV_INITIALIZE_ACTIVE:
@@ -216,8 +216,7 @@ vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
/* This is the first write of this txg. */
dsl_sync_task_nowait(spa_get_dsl(spa),
- vdev_initialize_zap_update_sync, guid, 2,
- ZFS_SPACE_CHECK_RESERVED, tx);
+ vdev_initialize_zap_update_sync, guid, tx);
}
/*
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 8c7468255565..7fab7d0d7950 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -149,6 +149,8 @@
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
+#include <sys/byteorder.h>
+#include <sys/zfs_bootenv.h>
/*
* Basic routines to read and write from a vdev label.
@@ -1233,13 +1235,9 @@ vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags)
* bootloader should have rewritten them all to be the same on boot,
* and any changes we made since boot have been the same across all
* labels.
- *
- * While grub supports writing to all four labels, other bootloaders
- * don't, so we only use the first two labels to store boot
- * information.
*/
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
- for (int l = 0; l < VDEV_LABELS / 2; l++) {
+ for (int l = 0; l < VDEV_LABELS; l++) {
vdev_label_read(zio, vd, l,
abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE),
offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE,
@@ -1249,14 +1247,15 @@ vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags)
}
int
-vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *command)
+vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv)
{
+ nvlist_t *config;
spa_t *spa = rvd->vdev_spa;
abd_t *abd = NULL;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
- ASSERT(command);
+ ASSERT(bootenv);
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
zio_t *zio = zio_root(spa, NULL, &abd, flags);
@@ -1264,39 +1263,81 @@ vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *command)
int err = zio_wait(zio);
if (abd != NULL) {
+ char *buf;
vdev_boot_envblock_t *vbe = abd_to_buf(abd);
- if (vbe->vbe_version != VB_RAW) {
- abd_free(abd);
- return (SET_ERROR(ENOTSUP));
+
+ vbe->vbe_version = ntohll(vbe->vbe_version);
+ switch (vbe->vbe_version) {
+ case VB_RAW:
+ /*
+ * If we have textual data in vbe_bootenv, create an nvlist
+ * with key "envmap".
+ */
+ fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW);
+ vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
+ fnvlist_add_string(bootenv, GRUB_ENVMAP,
+ vbe->vbe_bootenv);
+ break;
+
+ case VB_NVLIST:
+ err = nvlist_unpack(vbe->vbe_bootenv,
+ sizeof (vbe->vbe_bootenv), &config, 0);
+ if (err == 0) {
+ fnvlist_merge(bootenv, config);
+ nvlist_free(config);
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ /* Check for FreeBSD zfs bootonce command string */
+ buf = abd_to_buf(abd);
+ if (*buf == '\0') {
+ fnvlist_add_uint64(bootenv, BOOTENV_VERSION,
+ VB_NVLIST);
+ break;
+ }
+ fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf);
}
- vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
- fnvlist_add_string(command, "envmap", vbe->vbe_bootenv);
- /* abd was allocated in vdev_label_read_bootenv_impl() */
+
+ /*
+ * abd was allocated in vdev_label_read_bootenv_impl()
+ */
abd_free(abd);
- /* If we managed to read any successfully, return success. */
+ /*
+ * If we managed to read any successfully,
+ * return success.
+ */
return (0);
}
return (err);
}
int
-vdev_label_write_bootenv(vdev_t *vd, char *envmap)
+vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env)
{
zio_t *zio;
spa_t *spa = vd->vdev_spa;
vdev_boot_envblock_t *bootenv;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
- int error = ENXIO;
+ int error;
+ size_t nvsize;
+ char *nvbuf;
+
+ error = nvlist_size(env, &nvsize, NV_ENCODE_XDR);
+ if (error != 0)
+ return (SET_ERROR(error));
- if (strlen(envmap) >= sizeof (bootenv->vbe_bootenv)) {
+ if (nvsize >= sizeof (bootenv->vbe_bootenv)) {
return (SET_ERROR(E2BIG));
}
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ error = ENXIO;
for (int c = 0; c < vd->vdev_children; c++) {
- int child_err = vdev_label_write_bootenv(vd->vdev_child[c],
- envmap);
+ int child_err;
+
+ child_err = vdev_label_write_bootenv(vd->vdev_child[c], env);
/*
* As long as any of the disks managed to write all of their
* labels successfully, return success.
@@ -1312,16 +1353,41 @@ vdev_label_write_bootenv(vdev_t *vd, char *envmap)
ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE);
abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
abd_zero(abd, VDEV_PAD_SIZE);
+
bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE);
+ nvbuf = bootenv->vbe_bootenv;
+ nvsize = sizeof (bootenv->vbe_bootenv);
+
+ bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION);
+ switch (bootenv->vbe_version) {
+ case VB_RAW:
+ if (nvlist_lookup_string(env, GRUB_ENVMAP, &nvbuf) == 0) {
+ (void) strlcpy(bootenv->vbe_bootenv, nvbuf, nvsize);
+ }
+ error = 0;
+ break;
- char *buf = bootenv->vbe_bootenv;
- (void) strlcpy(buf, envmap, sizeof (bootenv->vbe_bootenv));
- bootenv->vbe_version = VB_RAW;
- abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE);
+ case VB_NVLIST:
+ error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ if (error == 0) {
+ bootenv->vbe_version = htonll(bootenv->vbe_version);
+ abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE);
+ } else {
+ abd_free(abd);
+ return (SET_ERROR(error));
+ }
retry:
zio = zio_root(spa, NULL, NULL, flags);
- for (int l = 0; l < VDEV_LABELS / 2; l++) {
+ for (int l = 0; l < VDEV_LABELS; l++) {
vdev_label_write(zio, vd, l, abd,
offsetof(vdev_label_t, vl_be),
VDEV_PAD_SIZE, NULL, NULL, flags);
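
vdev_label_write_bootenv() now takes a packed nvlist instead of a raw GRUB envmap string, and both the read and write paths touch all four labels instead of two. Below is a hedged userland sketch of building the VB_RAW flavor of that nvlist; the macros VB_RAW, BOOTENV_VERSION, and GRUB_ENVMAP are assumed to come from the sys/zfs_bootenv.h header included above.

#include <libnvpair.h>		/* userland nvlist API, as used by libzfs */
#include <sys/zfs_bootenv.h>	/* assumed home of VB_RAW and the key names */

int
make_raw_bootenv(nvlist_t **envp, const char *grubenv_text)
{
	nvlist_t *env;
	int err;

	if ((err = nvlist_alloc(&env, NV_UNIQUE_NAME, 0)) != 0)
		return (err);
	if ((err = nvlist_add_uint64(env, BOOTENV_VERSION, VB_RAW)) != 0 ||
	    (err = nvlist_add_string(env, GRUB_ENVMAP, grubenv_text)) != 0) {
		nvlist_free(env);
		return (err);
	}
	*envp = env;	/* caller passes this to the set_bootenv ioctl */
	return (0);
}
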
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 5e1060f127c9..71b5adbbd06a 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -391,7 +391,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
*physical_ashift = MAX(*physical_ashift,
- vd->vdev_physical_ashift);
+ cvd->vdev_physical_ashift);
}
if (numerrors == vd->vdev_children) {
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 4320078b6f7c..47312e02f70a 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2016 Gvozden Nešković. All rights reserved.
*/
@@ -1790,16 +1790,17 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
zio_bad_cksum_t zbc;
raidz_map_t *rm = zio->io_vsd;
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;
- (void) zfs_ereport_post_checksum(zio->io_spa, vd,
+ int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
&zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
rc->rc_abd, bad_data, &zbc);
+ if (ret != EALREADY) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
}
}
@@ -2279,21 +2280,21 @@ vdev_raidz_io_done(zio_t *zio)
vdev_t *cvd;
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
- if (rc->rc_error == 0) {
- zio_bad_cksum_t zbc;
- zbc.zbc_has_cksum = 0;
- zbc.zbc_injected =
- rm->rm_ecksuminjected;
+ if (rc->rc_error != 0)
+ continue;
+ zio_bad_cksum_t zbc;
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected = rm->rm_ecksuminjected;
+
+ int ret = zfs_ereport_start_checksum(
+ zio->io_spa, cvd, &zio->io_bookmark, zio,
+ rc->rc_offset, rc->rc_size,
+ (void *)(uintptr_t)c, &zbc);
+ if (ret != EALREADY) {
mutex_enter(&cvd->vdev_stat_lock);
cvd->vdev_stat.vs_checksum_errors++;
mutex_exit(&cvd->vdev_stat_lock);
-
- zfs_ereport_start_checksum(
- zio->io_spa, cvd,
- &zio->io_bookmark, zio,
- rc->rc_offset, rc->rc_size,
- (void *)(uintptr_t)c, &zbc);
}
}
}
diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c
index 85ed8afe1cf4..3362d608c037 100644
--- a/module/zfs/vdev_rebuild.c
+++ b/module/zfs/vdev_rebuild.c
@@ -267,7 +267,7 @@ vdev_rebuild_initiate(vdev_t *vd)
vd->vdev_rebuilding = B_TRUE;
dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync,
- (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, tx);
dmu_tx_commit(tx);
vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START);
@@ -553,8 +553,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
vr->vr_scan_offset[txg & TXG_MASK] = start;
dsl_sync_task_nowait(spa_get_dsl(spa),
vdev_rebuild_update_sync,
- (void *)(uintptr_t)vd->vdev_id, 2,
- ZFS_SPACE_CHECK_RESERVED, tx);
+ (void *)(uintptr_t)vd->vdev_id, tx);
}
/* When exiting write out our progress. */
@@ -875,16 +874,14 @@ vdev_rebuild_thread(void *arg)
* by a pool checkpoint. See the dsl_scan_done() comments.
*/
dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync,
- (void *)(uintptr_t)vd->vdev_id, 0,
- ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, tx);
} else if (vd->vdev_rebuild_cancel_wanted) {
/*
* The rebuild operation was canceled. This will occur when
* a device participating in the rebuild is detached.
*/
dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync,
- (void *)(uintptr_t)vd->vdev_id, 0,
- ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, tx);
} else if (vd->vdev_rebuild_reset_wanted) {
/*
* Reset the running rebuild without canceling and restarting
@@ -892,8 +889,7 @@ vdev_rebuild_thread(void *arg)
* participate in the rebuild.
*/
dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync,
- (void *)(uintptr_t)vd->vdev_id, 0,
- ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, tx);
} else {
/*
* The rebuild operation should be suspended. This may occur
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 56e420871f61..fdeca7ab3418 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -1167,8 +1167,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
/* After this, we can not use svr. */
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
- 0, ZFS_SPACE_CHECK_NONE, tx);
+ dsl_sync_task_nowait(spa->spa_dsl_pool,
+ vdev_remove_complete_sync, svr, tx);
dmu_tx_commit(tx);
}
@@ -1317,7 +1317,7 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
- svr, 0, ZFS_SPACE_CHECK_NONE, tx);
+ svr, tx);
}
svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
@@ -2143,8 +2143,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
vdev_config_dirty(vd);
dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
dsl_sync_task_nowait(spa->spa_dsl_pool,
- vdev_remove_initiate_sync,
- (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
+ vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx);
dmu_tx_commit(tx);
return (0);
diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c
index 3f8c34806020..02b42ddd5a6c 100644
--- a/module/zfs/vdev_trim.c
+++ b/module/zfs/vdev_trim.c
@@ -317,7 +317,7 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync,
- guid, 2, ZFS_SPACE_CHECK_NONE, tx);
+ guid, tx);
switch (new_state) {
case VDEV_TRIM_ACTIVE:
@@ -481,7 +481,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
if (ta->trim_type == TRIM_TYPE_MANUAL) {
while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) &&
vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) {
- cv_timedwait_sig(&vd->vdev_trim_io_cv,
+ cv_timedwait_idle(&vd->vdev_trim_io_cv,
&vd->vdev_trim_io_lock, ddi_get_lbolt() +
MSEC_TO_TICK(10));
}
@@ -510,8 +510,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
/* This is the first write of this txg. */
dsl_sync_task_nowait(spa_get_dsl(spa),
- vdev_trim_zap_update_sync, guid, 2,
- ZFS_SPACE_CHECK_RESERVED, tx);
+ vdev_trim_zap_update_sync, guid, tx);
}
/*
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index ad13ccedfc06..a8341f50ba09 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012,2020 by Delphix. All rights reserved.
*/
#include <sys/spa.h>
@@ -101,7 +101,251 @@
* good and bad versions of the buffer (if available), and we annotate the
* ereport with information about the differences.
*/
+
#ifdef _KERNEL
+/*
+ * Duplicate ereport Detection
+ *
+ * Some ereports are retained momentarily for detecting duplicates. These
+ * are kept in a recent_events_node_t in both a time-ordered list and an AVL
+ * tree of recent unique ereports.
+ *
+ * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
+ * task is used to purge stale entries.
+ */
+static list_t recent_events_list;
+static avl_tree_t recent_events_tree;
+static kmutex_t recent_events_lock;
+static taskqid_t recent_events_cleaner_tqid;
+
+/*
+ * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
+ *
+ * This setting can be changed dynamically and setting it to zero
+ * disables duplicate detection.
+ */
+unsigned int zfs_zevent_retain_max = 2000;
+
+/*
+ * The lifespan for a recent ereport entry. The default of 15 minutes is
+ * intended to outlive the zfs diagnosis engine's threshold of 10 errors
+ * over a period of 10 minutes.
+ */
+unsigned int zfs_zevent_retain_expire_secs = 900;
+
+typedef enum zfs_subclass {
+ ZSC_IO,
+ ZSC_DATA,
+ ZSC_CHECKSUM
+} zfs_subclass_t;
+
+typedef struct {
+ /* common criteria */
+ uint64_t re_pool_guid;
+ uint64_t re_vdev_guid;
+ int re_io_error;
+ uint64_t re_io_size;
+ uint64_t re_io_offset;
+ zfs_subclass_t re_subclass;
+ zio_priority_t re_io_priority;
+
+ /* logical zio criteria (optional) */
+ zbookmark_phys_t re_io_bookmark;
+
+ /* internal state */
+ avl_node_t re_tree_link;
+ list_node_t re_list_link;
+ uint64_t re_timestamp;
+} recent_events_node_t;
+
+static int
+recent_events_compare(const void *a, const void *b)
+{
+ const recent_events_node_t *node1 = a;
+ const recent_events_node_t *node2 = b;
+ int cmp;
+
+ /*
+ * The comparison order here is somewhat arbitrary.
+ * What's important is that if every criterion matches, then it
+ * is a duplicate (i.e. compare returns 0)
+ */
+ if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0)
+ return (cmp);
+
+ const zbookmark_phys_t *zb1 = &node1->re_io_bookmark;
+ const zbookmark_phys_t *zb2 = &node2->re_io_bookmark;
+
+ if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0)
+ return (cmp);
+
+ return (0);
+}
+
+static void zfs_ereport_schedule_cleaner(void);
+
+/*
+ * background task to clean stale recent event nodes.
+ */
+/*ARGSUSED*/
+static void
+zfs_ereport_cleaner(void *arg)
+{
+ recent_events_node_t *entry;
+ uint64_t now = gethrtime();
+
+ /*
+ * purge expired entries
+ */
+ mutex_enter(&recent_events_lock);
+ while ((entry = list_tail(&recent_events_list)) != NULL) {
+ uint64_t age = NSEC2SEC(now - entry->re_timestamp);
+ if (age <= zfs_zevent_retain_expire_secs)
+ break;
+
+ /* remove expired node */
+ avl_remove(&recent_events_tree, entry);
+ list_remove(&recent_events_list, entry);
+ kmem_free(entry, sizeof (*entry));
+ }
+
+ /* Restart the cleaner if more entries remain */
+ recent_events_cleaner_tqid = 0;
+ if (!list_is_empty(&recent_events_list))
+ zfs_ereport_schedule_cleaner();
+
+ mutex_exit(&recent_events_lock);
+}
+
+static void
+zfs_ereport_schedule_cleaner(void)
+{
+ ASSERT(MUTEX_HELD(&recent_events_lock));
+
+ uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);
+
+ recent_events_cleaner_tqid = taskq_dispatch_delay(
+ system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP,
+ ddi_get_lbolt() + NSEC_TO_TICK(timeout));
+}
+
+/*
+ * Check if an ereport would be a duplicate of one recently posted.
+ *
+ * An ereport is considered a duplicate if the set of criteria in
+ * recent_events_node_t all match.
+ *
+ * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
+ * are candidates for duplicate checking.
+ */
+static boolean_t
+zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
+ const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
+{
+ recent_events_node_t search = {0}, *entry;
+
+ if (vd == NULL || zio == NULL)
+ return (B_FALSE);
+
+ if (zfs_zevent_retain_max == 0)
+ return (B_FALSE);
+
+ if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
+ search.re_subclass = ZSC_IO;
+ else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
+ search.re_subclass = ZSC_DATA;
+ else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
+ search.re_subclass = ZSC_CHECKSUM;
+ else
+ return (B_FALSE);
+
+ search.re_pool_guid = spa_guid(spa);
+ search.re_vdev_guid = vd->vdev_guid;
+ search.re_io_error = zio->io_error;
+ search.re_io_priority = zio->io_priority;
+ /* if size is supplied use it over what's in zio */
+ if (size) {
+ search.re_io_size = size;
+ search.re_io_offset = offset;
+ } else {
+ search.re_io_size = zio->io_size;
+ search.re_io_offset = zio->io_offset;
+ }
+
+ /* grab optional logical zio criteria */
+ if (zb != NULL) {
+ search.re_io_bookmark.zb_objset = zb->zb_objset;
+ search.re_io_bookmark.zb_object = zb->zb_object;
+ search.re_io_bookmark.zb_level = zb->zb_level;
+ search.re_io_bookmark.zb_blkid = zb->zb_blkid;
+ }
+
+ uint64_t now = gethrtime();
+
+ mutex_enter(&recent_events_lock);
+
+ /* check if we have seen this one recently */
+ entry = avl_find(&recent_events_tree, &search, NULL);
+ if (entry != NULL) {
+ uint64_t age = NSEC2SEC(now - entry->re_timestamp);
+
+ /*
+ * There is still an active cleaner (since we're here).
+ * Reset the last seen time for this duplicate entry
+ * so that its lifespan gets extended.
+ */
+ list_remove(&recent_events_list, entry);
+ list_insert_head(&recent_events_list, entry);
+ entry->re_timestamp = now;
+
+ zfs_zevent_track_duplicate();
+ mutex_exit(&recent_events_lock);
+
+ return (age <= zfs_zevent_retain_expire_secs);
+ }
+
+ if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
+ /* recycle oldest node */
+ entry = list_tail(&recent_events_list);
+ ASSERT(entry != NULL);
+ list_remove(&recent_events_list, entry);
+ avl_remove(&recent_events_tree, entry);
+ } else {
+ entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
+ }
+
+ /* record this as a recent ereport */
+ *entry = search;
+ avl_add(&recent_events_tree, entry);
+ list_insert_head(&recent_events_list, entry);
+ entry->re_timestamp = now;
+
+ /* Start a cleaner if not already scheduled */
+ if (recent_events_cleaner_tqid == 0)
+ zfs_ereport_schedule_cleaner();
+
+ mutex_exit(&recent_events_lock);
+ return (B_FALSE);
+}
+
void
zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
{
@@ -153,9 +397,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
uint64_t ena;
char class[64];
- if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
- return (B_FALSE);
-
if ((ereport = fm_nvlist_create(NULL)) == NULL)
return (B_FALSE);
@@ -336,6 +577,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT64, zio->io_timestamp, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
DATA_TYPE_UINT64, zio->io_delta, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
+ DATA_TYPE_UINT32, zio->io_priority, NULL);
/*
* If the 'size' parameter is non-zero, it indicates this is a
@@ -788,24 +1031,34 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
}
/*
- * Return 0 if event was posted, EINVAL if there was a problem posting it or
- * EBUSY if the event was rate limited.
+ * Post an ereport for the given subclass
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
*/
int
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
- const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
- uint64_t size)
+ const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
{
int rc = 0;
#ifdef _KERNEL
nvlist_t *ereport = NULL;
nvlist_t *detector = NULL;
+ if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
+ return (EINVAL);
+
+ if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
+ return (SET_ERROR(EALREADY));
+
if (zfs_is_ratelimiting_event(subclass, vd))
return (SET_ERROR(EBUSY));
if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
- zb, zio, stateoroffset, size))
+ zb, zio, state, 0))
return (SET_ERROR(EINVAL)); /* couldn't post event */
if (ereport == NULL)
@@ -817,7 +1070,16 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
return (rc);
}
-void
+/*
+ * Prepare a checksum ereport
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
+ */
+int
zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length, void *arg,
zio_bad_cksum_t *info)
@@ -825,8 +1087,15 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
zio_cksum_report_t *report;
#ifdef _KERNEL
+ if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
+ offset, length))
+ return (SET_ERROR(EALREADY));
+
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
- return;
+ return (SET_ERROR(EBUSY));
#endif
report = kmem_zalloc(sizeof (*report), KM_SLEEP);
@@ -851,7 +1120,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
if (report->zcr_ereport == NULL) {
zfs_ereport_free_checksum(report);
- return;
+ return (0);
}
#endif
@@ -859,6 +1128,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
report->zcr_next = zio->io_logical->io_cksum_report;
zio->io_logical->io_cksum_report = report;
mutex_exit(&spa->spa_errlist_lock);
+ return (0);
}
void
@@ -901,7 +1171,15 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
kmem_free(rpt, sizeof (*rpt));
}
-
+/*
+ * Post a checksum ereport
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
+ */
int
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length,
@@ -913,8 +1191,15 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
nvlist_t *detector = NULL;
zfs_ecksum_info_t *info;
+ if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
+ offset, length))
+ return (SET_ERROR(EALREADY));
+
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
- return (EBUSY);
+ return (SET_ERROR(EBUSY));
if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
@@ -1073,11 +1358,57 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
#endif
}
-#if defined(_KERNEL)
+#ifdef _KERNEL
+void
+zfs_ereport_init(void)
+{
+ mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&recent_events_list, sizeof (recent_events_node_t),
+ offsetof(recent_events_node_t, re_list_link));
+ avl_create(&recent_events_tree, recent_events_compare,
+ sizeof (recent_events_node_t), offsetof(recent_events_node_t,
+ re_tree_link));
+}
+
+/*
+ * This 'early' fini needs to run before zfs_fini(), which on Linux
+ * waits for the system_delay_taskq to drain.
+ */
+void
+zfs_ereport_taskq_fini(void)
+{
+ mutex_enter(&recent_events_lock);
+ if (recent_events_cleaner_tqid != 0) {
+ taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
+ recent_events_cleaner_tqid = 0;
+ }
+ mutex_exit(&recent_events_lock);
+}
+
+void
+zfs_ereport_fini(void)
+{
+ recent_events_node_t *entry;
+
+ while ((entry = list_head(&recent_events_list)) != NULL) {
+ avl_remove(&recent_events_tree, entry);
+ list_remove(&recent_events_list, entry);
+ kmem_free(entry, sizeof (*entry));
+ }
+ avl_destroy(&recent_events_tree);
+ list_destroy(&recent_events_list);
+ mutex_destroy(&recent_events_lock);
+}
+
EXPORT_SYMBOL(zfs_ereport_post);
EXPORT_SYMBOL(zfs_ereport_is_valid);
EXPORT_SYMBOL(zfs_ereport_post_checksum);
EXPORT_SYMBOL(zfs_post_remove);
EXPORT_SYMBOL(zfs_post_autoreplace);
EXPORT_SYMBOL(zfs_post_state_change);
+
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
+ "Maximum recent zevents records to retain for duplicate checking");
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
+ "Expiration time for recent zevents records");
#endif /* _KERNEL */
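
The retention policy above keeps at most zfs_zevent_retain_max unique ereports, recycling the oldest entry when full and expiring entries after zfs_zevent_retain_expire_secs. A toy model of the capacity and recycling behavior follows; timestamp expiry and the AVL index are elided, and a linear scan stands in for avl_find().

#include <stdio.h>
#include <string.h>

#define RETAIN_MAX 3	/* tiny stand-in for zfs_zevent_retain_max */

static unsigned long long table[RETAIN_MAX];	/* oldest entry first */
static int n;

static int
is_duplicate(unsigned long long key)
{
	for (int i = 0; i < n; i++)
		if (table[i] == key)
			return (1);
	if (n == RETAIN_MAX) {	/* recycle the oldest slot */
		memmove(table, table + 1,
		    (RETAIN_MAX - 1) * sizeof (table[0]));
		n--;
	}
	table[n++] = key;
	return (0);
}

int
main(void)
{
	unsigned long long keys[] = { 1, 2, 1, 3, 4, 1 };

	/* The final key 1 posts again: its entry was recycled to make
	 * room for key 4 once the table filled. */
	for (int i = 0; i < 6; i++)
		printf("key %llu -> %s\n", keys[i],
		    is_duplicate(keys[i]) ? "EALREADY" : "posted");
	return (0);
}
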
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 495ff4707d77..eff66b32fcb1 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3511,30 +3511,29 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
/*
* This ioctl is used to set the bootenv configuration on the current
* pool. This configuration is stored in the second padding area of the label,
- * and it is used by the GRUB bootloader used on Linux to store the contents
- * of the grubenv file. The file is stored as raw ASCII, and is protected by
- * an embedded checksum. By default, GRUB will check if the boot filesystem
- * supports storing the environment data in a special location, and if so,
- * will invoke filesystem specific logic to retrieve it. This can be overridden
- * by a variable, should the user so desire.
+ * and it is used by the bootloader(s) to store bootloader- and/or
+ * system-specific data.
+ * The data is stored as an nvlist data stream and is protected by
+ * an embedded checksum.
+ * The version can have two possible values:
+ * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING.
+ * VB_NVLIST: nvlist with arbitrary <key, value> pairs.
*/
-/* ARGSUSED */
static const zfs_ioc_key_t zfs_keys_set_bootenv[] = {
- {"envmap", DATA_TYPE_STRING, 0},
+ {"version", DATA_TYPE_UINT64, 0},
+ {"<keys>", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST},
};
static int
zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
{
- char *envmap;
int error;
spa_t *spa;
- envmap = fnvlist_lookup_string(innvl, "envmap");
if ((error = spa_open(name, &spa, FTAG)) != 0)
return (error);
spa_vdev_state_enter(spa, SCL_ALL);
- error = vdev_label_write_bootenv(spa->spa_root_vdev, envmap);
+ error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl);
(void) spa_vdev_state_exit(spa, NULL, 0);
spa_close(spa, FTAG);
return (error);
@@ -3544,7 +3543,6 @@ static const zfs_ioc_key_t zfs_keys_get_bootenv[] = {
/* no nvl keys */
};
-/* ARGSUSED */
static int
zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
{
@@ -7615,6 +7613,7 @@ zfs_kmod_fini(void)
kmem_free(zs, sizeof (zfsdev_state_t));
}
+ zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */
zfs_fini();
spa_fini();
zvol_fini();
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index f956a9ef7621..8a8fbccd7d63 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Klara Inc.
@@ -547,7 +547,7 @@ error:
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(spa, &zio->io_bookmark);
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
- spa, NULL, &zio->io_bookmark, zio, 0, 0);
+ spa, NULL, &zio->io_bookmark, zio, 0);
}
} else {
zio->io_error = ret;
@@ -2004,7 +2004,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth)
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
pio->io_offset, pio->io_size, pio->io_error);
(void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
- pio->io_spa, vd, zb, pio, 0, 0);
+ pio->io_spa, vd, zb, pio, 0);
if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
taskq_empty_ent(&pio->io_tqent)) {
@@ -2331,7 +2331,7 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and has been suspended.\n", spa_name(spa));
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
- NULL, NULL, 0, 0);
+ NULL, NULL, 0);
mutex_enter(&spa->spa_suspend_lock);
@@ -4217,13 +4217,15 @@ zio_checksum_verify(zio_t *zio)
zio->io_error = error;
if (error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- mutex_enter(&zio->io_vd->vdev_stat_lock);
- zio->io_vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&zio->io_vd->vdev_stat_lock);
-
- zfs_ereport_start_checksum(zio->io_spa,
+ int ret = zfs_ereport_start_checksum(zio->io_spa,
zio->io_vd, &zio->io_bookmark, zio,
zio->io_offset, zio->io_size, NULL, &info);
+
+ if (ret != EALREADY) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+ }
}
}
@@ -4543,7 +4545,7 @@ zio_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
zio->io_spa, zio->io_vd, &zio->io_bookmark,
- zio, 0, 0);
+ zio, 0);
}
}
}
@@ -4557,16 +4559,16 @@ zio_done(zio_t *zio)
*/
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
!vdev_is_dead(zio->io_vd)) {
- mutex_enter(&zio->io_vd->vdev_stat_lock);
- if (zio->io_type == ZIO_TYPE_READ) {
- zio->io_vd->vdev_stat.vs_read_errors++;
- } else if (zio->io_type == ZIO_TYPE_WRITE) {
- zio->io_vd->vdev_stat.vs_write_errors++;
+ int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
+ if (ret != EALREADY) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ if (zio->io_type == ZIO_TYPE_READ)
+ zio->io_vd->vdev_stat.vs_read_errors++;
+ else if (zio->io_type == ZIO_TYPE_WRITE)
+ zio->io_vd->vdev_stat.vs_write_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
}
- mutex_exit(&zio->io_vd->vdev_stat_lock);
-
- (void) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
- zio->io_vd, &zio->io_bookmark, zio, 0, 0);
}
if ((zio->io_error == EIO || !(zio->io_flags &
@@ -4578,7 +4580,7 @@ zio_done(zio_t *zio)
*/
spa_log_error(zio->io_spa, &zio->io_bookmark);
(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
- zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0);
+ zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
}
}
diff --git a/module/zfs/zthr.c b/module/zfs/zthr.c
index fdc4b863382c..5ac2e30467e3 100644
--- a/module/zfs/zthr.c
+++ b/module/zfs/zthr.c
@@ -56,7 +56,7 @@
*
* == ZTHR creation
*
- * Every zthr needs three inputs to start running:
+ * Every zthr needs four inputs to start running:
*
* 1] A user-defined checker function (checkfunc) that decides whether
* the zthr should start working or go to sleep. The function should
@@ -72,6 +72,9 @@
* 3] A void args pointer that will be passed to checkfunc and func
* implicitly by the infrastructure.
*
+ * 4] A name for the thread. This string must be valid for the lifetime
+ * of the zthr.
+ *
* The reason why the above API needs two different functions,
* instead of one that both checks and does the work, has to do with
* the zthr's internal state lock (zthr_state_lock) and the allowed
@@ -221,6 +224,7 @@ struct zthr {
zthr_checkfunc_t *zthr_checkfunc;
zthr_func_t *zthr_func;
void *zthr_arg;
+ const char *zthr_name;
};
static void
@@ -237,15 +241,10 @@ zthr_procedure(void *arg)
t->zthr_func(t->zthr_arg, t);
mutex_enter(&t->zthr_state_lock);
} else {
- /*
- * cv_wait_sig() is used instead of cv_wait() in
- * order to prevent this process from incorrectly
- * contributing to the system load average when idle.
- */
if (t->zthr_sleep_timeout == 0) {
- cv_wait_sig(&t->zthr_cv, &t->zthr_state_lock);
+ cv_wait_idle(&t->zthr_cv, &t->zthr_state_lock);
} else {
- (void) cv_timedwait_sig_hires(&t->zthr_cv,
+ (void) cv_timedwait_idle_hires(&t->zthr_cv,
&t->zthr_state_lock, t->zthr_sleep_timeout,
MSEC2NSEC(1), 0);
}
@@ -296,6 +295,7 @@ zthr_create_timer(const char *zthr_name, zthr_checkfunc_t *checkfunc,
t->zthr_func = func;
t->zthr_arg = arg;
t->zthr_sleep_timeout = max_sleep;
+ t->zthr_name = zthr_name;
t->zthr_thread = thread_create_named(zthr_name, NULL, 0,
zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri);
@@ -422,8 +422,8 @@ zthr_resume(zthr_t *t)
* no-op.
*/
if (t->zthr_thread == NULL) {
- t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
- 0, &p0, TS_RUN, minclsyspri);
+ t->zthr_thread = thread_create_named(t->zthr_name, NULL, 0,
+ zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri);
}
mutex_exit(&t->zthr_state_lock);
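
With the name now stored in the zthr, both initial creation and zthr_resume() can name the kernel thread. A hedged kernel-context sketch of the four creation inputs the updated comment lists; the worker and predicate helpers and the spa field are hypothetical, and the name literal satisfies the new lifetime requirement by being static.

/* Hypothetical example; signatures assumed from zthr.h at this revision. */
static boolean_t
mywork_checkfunc(void *arg, zthr_t *t)
{
	spa_t *spa = arg;

	(void) t;
	return (spa_mywork_pending(spa));	/* hypothetical predicate */
}

static void
mywork_func(void *arg, zthr_t *t)
{
	spa_t *spa = arg;

	/* Check for cancellation between units of work. */
	while (!zthr_iscancelled(t)) {
		if (spa_mywork_one_step(spa) == 0)	/* hypothetical */
			break;
	}
}

/* 1] checker, 2] worker, 3] arg, 4] name (a string literal, so it
 * outlives the zthr as the updated comment requires). */
spa->spa_mywork_zthr = zthr_create("z_mywork",
    mywork_checkfunc, mywork_func, spa);
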