aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAmeer Hamza <ahamza@ixsystems.com>2022-09-26 23:32:42 +0000
committerBrian Behlendorf <behlendorf1@llnl.gov>2023-03-27 18:32:09 +0000
commitbd9a9a4e1ad34d9d1c63746a662c34fffa89204f (patch)
treeab8a919d71b90002cb5d4fcb2f79107094dda36f
parent5219a2691e3b3de5b69bc3c97b8f58c272d6fe04 (diff)
downloadsrc-bd9a9a4e1ad34d9d1c63746a662c34fffa89204f.tar.gz
src-bd9a9a4e1ad34d9d1c63746a662c34fffa89204f.zip
zed: mark disks as REMOVED when they are removed
ZED does not take any action for disk removal events if there is no spare VDEV available. Added zpool_vdev_remove_wanted() in libzfs and vdev_remove_wanted() in vdev.c to remove the VDEV through ZED on removal event. This means that if you are running zed and remove a disk, it will be properly marked as REMOVED. Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
-rw-r--r--cmd/zed/agents/zfs_agents.c54
-rw-r--r--cmd/zed/agents/zfs_retire.c36
-rw-r--r--cmd/ztest/ztest.c2
-rw-r--r--config/kernel-blkdev.m455
-rw-r--r--include/libzfs.h1
-rw-r--r--include/os/linux/kernel/linux/blkdev_compat.h26
-rw-r--r--include/os/linux/spl/sys/Makefile.am1
-rw-r--r--include/os/linux/spl/sys/misc.h29
-rw-r--r--include/sys/spa.h2
-rw-r--r--include/sys/vdev.h3
-rw-r--r--include/sys/vdev_impl.h3
-rw-r--r--include/sys/zfs_context.h1
-rw-r--r--lib/libzfs/libzfs.abi6
-rw-r--r--lib/libzfs/libzfs_pool.c37
-rw-r--r--module/os/linux/spl/spl-generic.c33
-rw-r--r--module/os/linux/zfs/vdev_disk.c24
-rw-r--r--module/zfs/spa.c19
-rw-r--r--module/zfs/spa_config.c14
-rw-r--r--module/zfs/spa_misc.c4
-rw-r--r--module/zfs/vdev.c59
-rw-r--r--module/zfs/zfs_ioctl.c6
-rw-r--r--module/zfs/zio.c2
-rw-r--r--tests/zfs-tests/include/libtest.shlib2
-rwxr-xr-xtests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh27
24 files changed, 395 insertions, 51 deletions
diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c
index 35dd818ff80d..c8774010d5eb 100644
--- a/cmd/zed/agents/zfs_agents.c
+++ b/cmd/zed/agents/zfs_agents.c
@@ -80,6 +80,7 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
char *path = NULL;
uint_t c, children;
nvlist_t **child;
+ uint64_t vdev_guid;
/*
* First iterate over any children.
@@ -100,7 +101,7 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
&child, &children) == 0) {
for (c = 0; c < children; c++) {
if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
- gsp->gs_vdev_type = DEVICE_TYPE_L2ARC;
+ gsp->gs_vdev_type = DEVICE_TYPE_SPARE;
return (B_TRUE);
}
}
@@ -109,7 +110,7 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
&child, &children) == 0) {
for (c = 0; c < children; c++) {
if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
- gsp->gs_vdev_type = DEVICE_TYPE_SPARE;
+ gsp->gs_vdev_type = DEVICE_TYPE_L2ARC;
return (B_TRUE);
}
}
@@ -126,6 +127,21 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
&gsp->gs_vdev_expandtime);
return (B_TRUE);
}
+ /*
+ * Otherwise, on a vdev guid match, grab the devid and expansion
+ * time. The devid might be missing on removal since its not part
+ * of blkid cache and L2ARC VDEV does not contain pool guid in its
+ * blkid, so this is a special case for L2ARC VDEV.
+ */
+ else if (gsp->gs_vdev_guid != 0 && gsp->gs_devid == NULL &&
+ nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &vdev_guid) == 0 &&
+ gsp->gs_vdev_guid == vdev_guid) {
+ (void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID,
+ &gsp->gs_devid);
+ (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
+ &gsp->gs_vdev_expandtime);
+ return (B_TRUE);
+ }
return (B_FALSE);
}
@@ -148,7 +164,7 @@ zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
/*
* if a match was found then grab the pool guid
*/
- if (gsp->gs_vdev_guid) {
+ if (gsp->gs_vdev_guid && gsp->gs_devid) {
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
&gsp->gs_pool_guid);
}
@@ -195,11 +211,13 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
uint64_t pool_guid = 0, vdev_guid = 0;
guid_search_t search = { 0 };
device_type_t devtype = DEVICE_TYPE_PRIMARY;
+ char *devid = NULL;
class = "resource.fs.zfs.removed";
subclass = "";
(void) nvlist_add_string(payload, FM_CLASS, class);
+ (void) nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid);
(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
(void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);
@@ -209,20 +227,24 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
(void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2);
/*
+ * If devid is missing but vdev_guid is available, find devid
+ * and pool_guid from vdev_guid.
* For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or
* ZFS_EV_POOL_GUID may be missing so find them.
*/
- if (pool_guid == 0 || vdev_guid == 0) {
- if ((nvlist_lookup_string(nvl, DEV_IDENTIFIER,
- &search.gs_devid) == 0) &&
- (zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search)
- == 1)) {
- if (pool_guid == 0)
- pool_guid = search.gs_pool_guid;
- if (vdev_guid == 0)
- vdev_guid = search.gs_vdev_guid;
- devtype = search.gs_vdev_type;
- }
+ if (devid == NULL || pool_guid == 0 || vdev_guid == 0) {
+ if (devid == NULL)
+ search.gs_vdev_guid = vdev_guid;
+ else
+ search.gs_devid = devid;
+ zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
+ if (devid == NULL)
+ devid = search.gs_devid;
+ if (pool_guid == 0)
+ pool_guid = search.gs_pool_guid;
+ if (vdev_guid == 0)
+ vdev_guid = search.gs_vdev_guid;
+ devtype = search.gs_vdev_type;
}
/*
@@ -235,7 +257,9 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
search.gs_vdev_expandtime + 10 > tv.tv_sec) {
zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' "
"for recently expanded device '%s'", EC_DEV_REMOVE,
- search.gs_devid);
+ devid);
+ fnvlist_free(payload);
+ free(event);
goto out;
}
diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index f4063bea7378..3b71a63e0a77 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -323,6 +323,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
boolean_t is_disk;
vdev_aux_t aux;
uint64_t state = 0;
+ int l2arc;
+ vdev_stat_t *vs;
+ unsigned int c;
fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class);
@@ -351,13 +354,32 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
- /* Can't replace l2arc with a spare: offline the device */
- if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
- &devtype) == 0 && strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) {
- fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname);
- zpool_vdev_offline(zhp, devname, B_TRUE);
- } else if (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
- replace_with_spare(hdl, zhp, vdev) == B_FALSE) {
+ nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &c);
+
+ /*
+ * If state removed is requested for already removed vdev,
+ * its a loopback event from spa_async_remove(). Just
+ * ignore it.
+ */
+ if (vs->vs_state == VDEV_STATE_REMOVED &&
+ state == VDEV_STATE_REMOVED)
+ return;
+
+ l2arc = (nvlist_lookup_string(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, &devtype) == 0 &&
+ strcmp(devtype, VDEV_TYPE_L2CACHE) == 0);
+
+ /* Remove the vdev since device is unplugged */
+ if (l2arc || (strcmp(class, "resource.fs.zfs.removed") == 0)) {
+ int status = zpool_vdev_remove_wanted(zhp, devname);
+ fmd_hdl_debug(hdl, "zpool_vdev_remove_wanted '%s'"
+ ", ret:%d", devname, status);
+ }
+
+ /* Replace the vdev with a spare if its not a l2arc */
+ if (!l2arc && (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
+ replace_with_spare(hdl, zhp, vdev) == B_FALSE)) {
/* Could not handle with spare */
fmd_hdl_debug(hdl, "no spare for '%s'", devname);
}
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index fb4297478cf1..b7dc3fcc5e51 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -1184,7 +1184,7 @@ ztest_kill(ztest_shared_t *zs)
* See comment above spa_write_cachefile().
*/
mutex_enter(&spa_namespace_lock);
- spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE);
+ spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE);
mutex_exit(&spa_namespace_lock);
(void) kill(getpid(), SIGKILL);
diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4
index 462d6c6efa8e..28e5364581ea 100644
--- a/config/kernel-blkdev.m4
+++ b/config/kernel-blkdev.m4
@@ -104,6 +104,57 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [
])
dnl #
+dnl # bdev_kobj() is introduced from 5.12
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ], [
+ ZFS_LINUX_TEST_SRC([bdev_kobj], [
+ #include <linux/fs.h>
+ #include <linux/blkdev.h>
+ #include <linux/kobject.h>
+ ], [
+ struct block_device *bdev = NULL;
+ struct kobject *disk_kobj;
+ disk_kobj = bdev_kobj(bdev);
+ ])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ], [
+ AC_MSG_CHECKING([whether bdev_kobj() exists])
+ ZFS_LINUX_TEST_RESULT([bdev_kobj], [
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_BDEV_KOBJ, 1,
+ [bdev_kobj() exists])
+ ], [
+ AC_MSG_RESULT(no)
+ ])
+])
+
+dnl #
+dnl # part_to_dev() was removed in 5.12
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV], [
+ ZFS_LINUX_TEST_SRC([part_to_dev], [
+ #include <linux/fs.h>
+ #include <linux/blkdev.h>
+ ], [
+ struct hd_struct *p = NULL;
+ struct device *pdev;
+ pdev = part_to_dev(p);
+ ])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV], [
+ AC_MSG_CHECKING([whether part_to_dev() exists])
+ ZFS_LINUX_TEST_RESULT([part_to_dev], [
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_PART_TO_DEV, 1,
+ [part_to_dev() exists])
+ ], [
+ AC_MSG_RESULT(no)
+ ])
+])
+
+dnl #
dnl # 5.10 API, check_disk_change() is removed, in favor of
dnl # bdev_check_media_change(), which doesn't force revalidation
dnl #
@@ -405,6 +456,8 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE
ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME
ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE
+ ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ
+ ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV
])
AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
@@ -421,4 +474,6 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
ZFS_AC_KERNEL_BLKDEV_BDEVNAME
ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS
ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE
+ ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ
+ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV
])
diff --git a/include/libzfs.h b/include/libzfs.h
index 98942b41982c..182f3f63e48e 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -307,6 +307,7 @@ extern int zpool_vdev_remove_cancel(zpool_handle_t *);
extern int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *);
extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *,
splitflags_t);
+_LIBZFS_H int zpool_vdev_remove_wanted(zpool_handle_t *, const char *);
extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t);
extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t);
diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h
index bac5c2279d29..b1daa5ed9ce9 100644
--- a/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/include/os/linux/kernel/linux/blkdev_compat.h
@@ -257,6 +257,32 @@ bio_set_bi_error(struct bio *bio, int error)
#endif /* HAVE_1ARG_BIO_END_IO_T */
/*
+ * 5.15 MACRO,
+ * GD_DEAD
+ *
+ * 2.6.36 - 5.14 MACRO,
+ * GENHD_FL_UP
+ *
+ * Check the disk status and return B_TRUE if alive
+ * otherwise B_FALSE
+ */
+static inline boolean_t
+zfs_check_disk_status(struct block_device *bdev)
+{
+#if defined(GENHD_FL_UP)
+ return (!!(bdev->bd_disk->flags & GENHD_FL_UP));
+#elif defined(GD_DEAD)
+ return (!test_bit(GD_DEAD, &bdev->bd_disk->state));
+#else
+/*
+ * This is encountered if neither GENHD_FL_UP nor GD_DEAD is available in
+ * the kernel - likely due to an MACRO change that needs to be chased down.
+ */
+#error "Unsupported kernel: no usable disk status check"
+#endif
+}
+
+/*
* 4.1 API,
* 3.10.0 CentOS 7.x API,
* blkdev_reread_part()
diff --git a/include/os/linux/spl/sys/Makefile.am b/include/os/linux/spl/sys/Makefile.am
index 48c27f970fc9..450baffc395e 100644
--- a/include/os/linux/spl/sys/Makefile.am
+++ b/include/os/linux/spl/sys/Makefile.am
@@ -20,6 +20,7 @@ KERNEL_H = \
kmem.h \
kstat.h \
list.h \
+ misc.h \
mod_os.h \
mutex.h \
param.h \
diff --git a/include/os/linux/spl/sys/misc.h b/include/os/linux/spl/sys/misc.h
new file mode 100644
index 000000000000..299fe9c1ab07
--- /dev/null
+++ b/include/os/linux/spl/sys/misc.h
@@ -0,0 +1,29 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#ifndef _OS_LINUX_SPL_MISC_H
+#define _OS_LINUX_SPL_MISC_H
+
+#include <linux/kobject.h>
+
+extern void spl_signal_kobj_evt(struct block_device *bdev);
+
+#endif
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 67724a68f0e8..fedadab459b7 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -837,7 +837,7 @@ extern kmutex_t spa_namespace_lock;
#define SPA_CONFIG_UPDATE_POOL 0
#define SPA_CONFIG_UPDATE_VDEVS 1
-extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t);
+extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t);
extern void spa_config_load(void);
extern nvlist_t *spa_all_configs(uint64_t *);
extern void spa_config_set(spa_t *spa, nvlist_t *config);
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index f235bfc8cc19..de08bbf16413 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -147,6 +147,7 @@ extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
vdev_state_t *);
extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
+extern int vdev_remove_wanted(spa_t *spa, uint64_t guid);
extern void vdev_clear(spa_t *spa, vdev_t *vd);
extern boolean_t vdev_is_dead(vdev_t *vd);
@@ -189,6 +190,8 @@ typedef enum vdev_config_flag {
VDEV_CONFIG_MISSING = 1 << 4
} vdev_config_flag_t;
+extern void vdev_post_kobj_evt(vdev_t *vd);
+extern void vdev_clear_kobj_evt(vdev_t *vd);
extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
boolean_t getstats, vdev_config_flag_t flags);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index da846d8504fe..9d4a8062b2d9 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -69,6 +69,7 @@ extern uint32_t zfs_vdev_async_write_max_active;
* Virtual device operations
*/
typedef int vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd);
+typedef void vdev_kobj_post_evt_func_t(vdev_t *vd);
typedef void vdev_fini_func_t(vdev_t *vd);
typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
uint64_t *ashift, uint64_t *pshift);
@@ -123,6 +124,7 @@ typedef const struct vdev_ops {
vdev_config_generate_func_t *vdev_op_config_generate;
vdev_nparity_func_t *vdev_op_nparity;
vdev_ndisks_func_t *vdev_op_ndisks;
+ vdev_kobj_post_evt_func_t *vdev_op_kobj_evt_post;
char vdev_op_type[16];
boolean_t vdev_op_leaf;
} vdev_ops_t;
@@ -435,6 +437,7 @@ struct vdev {
boolean_t vdev_isl2cache; /* was a l2cache device */
boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */
boolean_t vdev_resilver_deferred; /* resilver deferred */
+ boolean_t vdev_kobj_flag; /* kobj event record */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
vdev_cache_t vdev_cache; /* physical block cache */
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index a6ff94317195..235a73d5d782 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -51,6 +51,7 @@ extern "C" {
#include <sys/kmem.h>
#include <sys/kmem_cache.h>
#include <sys/vmem.h>
+#include <sys/misc.h>
#include <sys/taskq.h>
#include <sys/param.h>
#include <sys/disp.h>
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 14e03ee28ffe..605826f70e6f 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -378,6 +378,7 @@
<elf-symbol name='zpool_vdev_path_to_guid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_vdev_remove' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_vdev_remove_cancel' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+ <elf-symbol name='zpool_vdev_remove_wanted' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_vdev_split' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_wait' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_wait_status' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -5858,6 +5859,11 @@
<parameter type-id='c19b74c3' name='istmp'/>
<return type-id='95e97e5e'/>
</function-decl>
+ <function-decl name='zpool_vdev_remove_wanted' mangled-name='zpool_vdev_remove_wanted' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_vdev_remove_wanted'>
+ <parameter type-id='4c81de99' name='zhp'/>
+ <parameter type-id='80f4b756' name='path'/>
+ <return type-id='95e97e5e'/>
+ </function-decl>
<function-decl name='zpool_vdev_fault' mangled-name='zpool_vdev_fault' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_vdev_fault'>
<parameter type-id='4c81de99' name='zhp'/>
<parameter type-id='9c313c2d' name='guid'/>
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index e43ebb15c608..c8659c5fe2e3 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -3159,6 +3159,43 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
}
/*
+ * Remove the specified vdev asynchronously from the configuration, so
+ * that it may come ONLINE if reinserted. This is called from zed on
+ * Udev remove event.
+ * Note: We also have a similar function zpool_vdev_remove() that
+ * removes the vdev from the pool.
+ */
+int
+zpool_vdev_remove_wanted(zpool_handle_t *zhp, const char *path)
+{
+ zfs_cmd_t zc = {"\0"};
+ char errbuf[1024];
+ nvlist_t *tgt;
+ boolean_t avail_spare, l2cache;
+ libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+ (void) snprintf(errbuf, sizeof (errbuf),
+ dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
+
+ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+ if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
+ NULL)) == NULL)
+ return (zfs_error(hdl, EZFS_NODEVICE, errbuf));
+
+ zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
+
+ if (avail_spare)
+ return (zfs_error(hdl, EZFS_ISSPARE, errbuf));
+
+ zc.zc_cookie = VDEV_STATE_REMOVED;
+
+ if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+ return (0);
+
+ return (zpool_standard_error(hdl, errno, errbuf));
+}
+
+/*
* Mark the given vdev faulted.
*/
int
diff --git a/module/os/linux/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c
index 5ea4fc635165..508fb9d4c7f7 100644
--- a/module/os/linux/spl/spl-generic.c
+++ b/module/os/linux/spl/spl-generic.c
@@ -48,6 +48,7 @@
#include <linux/mod_compat.h>
#include <sys/cred.h>
#include <sys/vnode.h>
+#include <sys/misc.h>
char spl_gitrev[64] = ZFS_META_GITREV;
@@ -540,6 +541,38 @@ ddi_copyin(const void *from, void *to, size_t len, int flags)
}
EXPORT_SYMBOL(ddi_copyin);
+/*
+ * Post a uevent to userspace whenever a new vdev adds to the pool. It is
+ * necessary to sync blkid information with udev, which zed daemon uses
+ * during device hotplug to identify the vdev.
+ */
+void
+spl_signal_kobj_evt(struct block_device *bdev)
+{
+#if defined(HAVE_BDEV_KOBJ) || defined(HAVE_PART_TO_DEV)
+#ifdef HAVE_BDEV_KOBJ
+ struct kobject *disk_kobj = bdev_kobj(bdev);
+#else
+ struct kobject *disk_kobj = &part_to_dev(bdev->bd_part)->kobj;
+#endif
+ if (disk_kobj) {
+ int ret = kobject_uevent(disk_kobj, KOBJ_CHANGE);
+ if (ret) {
+ pr_warn("ZFS: Sending event '%d' to kobject: '%s'"
+ " (%p): failed(ret:%d)\n", KOBJ_CHANGE,
+ kobject_name(disk_kobj), disk_kobj, ret);
+ }
+ }
+#else
+/*
+ * This is encountered if neither bdev_kobj() nor part_to_dev() is available
+ * in the kernel - likely due to an API change that needs to be chased down.
+ */
+#error "Unsupported kernel: unable to get struct kobj from bdev"
+#endif
+}
+EXPORT_SYMBOL(spl_signal_kobj_evt);
+
int
ddi_copyout(const void *from, void *to, size_t len, int flags)
{
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 2f84792d89be..60b111c59f23 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -179,6 +179,18 @@ vdev_disk_error(zio_t *zio)
zio->io_flags);
}
+static void
+vdev_disk_kobj_evt_post(vdev_t *v)
+{
+ vdev_disk_t *vd = v->vdev_tsd;
+ if (vd && vd->vd_bdev) {
+ spl_signal_kobj_evt(vd->vd_bdev);
+ } else {
+ vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
+ v->vdev_path);
+ }
+}
+
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
@@ -290,6 +302,13 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
zfs_vdev_holder);
if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
+ /*
+ * There is no point of waiting since device is removed
+ * explicitly
+ */
+ if (v->vdev_removed)
+ break;
+
schedule_timeout(MSEC_TO_TICK(10));
} else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) {
timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
@@ -899,7 +918,7 @@ vdev_disk_io_done(zio_t *zio)
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
- if (zfs_check_media_change(vd->vd_bdev)) {
+ if (!zfs_check_disk_status(vd->vd_bdev)) {
invalidate_bdev(vd->vd_bdev);
v->vdev_remove_wanted = B_TRUE;
spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
@@ -955,7 +974,8 @@ vdev_ops_t vdev_disk_ops = {
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
- .vdev_op_leaf = B_TRUE /* leaf vdev */
+ .vdev_op_leaf = B_TRUE, /* leaf vdev */
+ .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};
/*
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index c9759f35a6fb..2625479f19d5 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -909,7 +909,16 @@ spa_change_guid(spa_t *spa)
spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
if (error == 0) {
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ /*
+ * Clear the kobj flag from all the vdevs to allow
+ * vdev_cache_process_kobj_evt() to post events to all the
+ * vdevs since GUID is updated.
+ */
+ vdev_clear_kobj_evt(spa->spa_root_vdev);
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
+ vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]);
+
+ spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
}
@@ -5192,7 +5201,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
*/
spa_unload(spa);
spa_deactivate(spa);
- spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
spa_remove(spa);
if (locked)
mutex_exit(&spa_namespace_lock);
@@ -6012,7 +6021,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa_spawn_aux_threads(spa);
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
/*
* Don't count references from objsets that are already closed
@@ -6073,7 +6082,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
if (props != NULL)
spa_configfile_set(spa, props, B_FALSE);
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
zfs_dbgmsg("spa_import: verbatim import of %s", pool);
mutex_exit(&spa_namespace_lock);
@@ -6465,7 +6474,7 @@ export_spa:
if (new_state != POOL_STATE_UNINITIALIZED) {
if (!hardforce)
- spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
spa_remove(spa);
} else {
/*
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index ad82932ce567..432f8b8f3696 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -238,7 +238,8 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
* would be required.
*/
void
-spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
+spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent,
+ boolean_t postblkidevent)
{
spa_config_dirent_t *dp, *tdp;
nvlist_t *nvl;
@@ -344,6 +345,16 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
if (postsysevent)
spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC);
+
+ /*
+ * Post udev event to sync blkid information if the pool is created
+ * or a new vdev is added to the pool.
+ */
+ if ((target->spa_root_vdev) && postblkidevent) {
+ vdev_post_kobj_evt(target->spa_root_vdev);
+ for (int i = 0; i < target->spa_l2cache.sav_count; i++)
+ vdev_post_kobj_evt(target->spa_l2cache.sav_vdevs[i]);
+ }
}
/*
@@ -598,6 +609,7 @@ spa_config_update(spa_t *spa, int what)
*/
if (!spa->spa_is_root) {
spa_write_cachefile(spa, B_FALSE,
+ what != SPA_CONFIG_UPDATE_POOL,
what != SPA_CONFIG_UPDATE_POOL);
}
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 1c93e7487dda..4461b985fd3b 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1291,7 +1291,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
* If the config changed, update the config cache.
*/
if (config_changed)
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
}
/*
@@ -1386,7 +1386,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
*/
if (config_changed) {
mutex_enter(&spa_namespace_lock);
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
mutex_exit(&spa_namespace_lock);
}
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 00773f89cf6e..4b9d7e7c0506 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1938,6 +1938,14 @@ vdev_open(vdev_t *vd)
error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
&logical_ashift, &physical_ashift);
+
+ /* Keep the device in removed state if unplugged */
+ if (error == ENOENT && vd->vdev_removed) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED,
+ VDEV_AUX_NONE);
+ return (error);
+ }
+
/*
* Physical volume size should never be larger than its max size, unless
* the disk has shrunk while we were reading it or the device is buggy
@@ -3156,6 +3164,34 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
mutex_exit(&vd->vdev_dtl_lock);
}
+/*
+ * Iterate over all the vdevs except spare, and post kobj events
+ */
+void
+vdev_post_kobj_evt(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_kobj_evt_post &&
+ vd->vdev_kobj_flag == B_FALSE) {
+ vd->vdev_kobj_flag = B_TRUE;
+ vd->vdev_ops->vdev_op_kobj_evt_post(vd);
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_post_kobj_evt(vd->vdev_child[c]);
+}
+
+/*
+ * Iterate over all the vdevs except spare, and clear kobj events
+ */
+void
+vdev_clear_kobj_evt(vdev_t *vd)
+{
+ vd->vdev_kobj_flag = B_FALSE;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_clear_kobj_evt(vd->vdev_child[c]);
+}
+
int
vdev_dtl_load(vdev_t *vd)
{
@@ -3936,6 +3972,29 @@ vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
return (spa_vdev_state_exit(spa, vd, 0));
}
+int
+vdev_remove_wanted(spa_t *spa, uint64_t guid)
+{
+ vdev_t *vd;
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
+
+ /*
+ * If the vdev is already removed, then don't do anything.
+ */
+ if (vd->vdev_removed)
+ return (spa_vdev_state_exit(spa, NULL, 0));
+
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(spa, SPA_ASYNC_REMOVE);
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+
/*
* Online the given vdev.
*
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 4601ef52788a..a4b391cbea12 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -1921,6 +1921,10 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
break;
+ case VDEV_STATE_REMOVED:
+ error = vdev_remove_wanted(spa, zc->zc_guid);
+ break;
+
default:
error = SET_ERROR(EINVAL);
}
@@ -2928,7 +2932,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc)
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(zc->zc_name)) != NULL) {
spa_configfile_set(spa, props, B_FALSE);
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
}
mutex_exit(&spa_namespace_lock);
if (spa != NULL) {
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index c1fd2de2e586..700f8791045f 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3928,7 +3928,7 @@ zio_vdev_io_done(zio_t *zio)
ops->vdev_op_io_done(zio);
- if (unexpected_error)
+ if (unexpected_error && vd->vdev_remove_wanted == B_FALSE)
VERIFY(vdev_probe(vd, zio) == NULL);
return (zio);
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 079272811f2f..89c6382dedc3 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -2258,7 +2258,7 @@ function check_slog_state # pool disk state{online,offline,unavail}
#
# Return 0 is pool/disk matches expected state, 1 otherwise
#
-function check_vdev_state # pool disk state{online,offline,unavail}
+function check_vdev_state # pool disk state{online,offline,unavail,removed}
{
typeset pool=$1
typeset disk=${2#*$DEV_DSKDIR/}
diff --git a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh
index d7189f298384..78eed0f4ce89 100755
--- a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh
@@ -24,29 +24,28 @@
#
# DESCRIPTION:
-# Testing Fault Management Agent ZED Logic - Physically removed device is
-# made unavail and onlined when reattached
+# Testing Fault Management Agent ZED Logic - Physically detached device is
+# made removed and onlined when reattached
#
# STRATEGY:
# 1. Create a pool
# 2. Simulate physical removal of one device
-# 3. Verify the device is unavailable
+# 3. Verify the device is removed when detached
# 4. Reattach the device
# 5. Verify the device is onlined
# 6. Repeat the same tests with a spare device:
# zed will use the spare to handle the removed data device
# 7. Repeat the same tests again with a faulted spare device:
-# the removed data device should be unavailable
+# the removed data device should be removed
#
# NOTE: the use of 'block_device_wait' throughout the test helps avoid race
# conditions caused by mixing creation/removal events from partitioning the
# disk (zpool create) and events from physically removing it (remove_disk).
#
-# NOTE: the test relies on 'zpool sync' to prompt the kmods to transition a
-# vdev to the unavailable state. The ZED does receive a removal notification
-# but only relies on it to activate a hot spare. Additional work is planned
-# to extend an existing ioctl interface to allow the ZED to transition the
-# vdev in to a removed state.
+# NOTE: the test relies on ZED to transit state to removed on device removed
+# event. The ZED does receive a removal notification but only relies on it to
+# activate a hot spare. Additional work is planned to extend an existing ioctl
+# interface to allow the ZED to transition the vdev in to a removed state.
#
verify_runnable "both"
@@ -104,8 +103,8 @@ do
log_must mkfile 1m $mntpnt/file
log_must zpool sync $TESTPOOL
- # 3. Verify the device is unavailable.
- log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
+ # 3. Verify the device is removed.
+ log_must wait_vdev_state $TESTPOOL $removedev "REMOVED"
# 4. Reattach the device
insert_disk $removedev
@@ -143,7 +142,7 @@ do
# 3. Verify the device is handled by the spare.
log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE"
- log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
+ log_must wait_vdev_state $TESTPOOL $removedev "REMOVED"
# 4. Reattach the device
insert_disk $removedev
@@ -178,8 +177,8 @@ do
log_must mkfile 1m $mntpnt/file
log_must zpool sync $TESTPOOL
- # 4. Verify the device is unavailable
- log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
+ # 4. Verify the device is removed
+ log_must wait_vdev_state $TESTPOOL $removedev "REMOVED"
# 5. Reattach the device
insert_disk $removedev