diff options
Diffstat (limited to 'sys/contrib/openzfs/cmd/zed/zed_disk_event.c')
-rw-r--r-- | sys/contrib/openzfs/cmd/zed/zed_disk_event.c | 416 |
1 files changed, 416 insertions, 0 deletions
diff --git a/sys/contrib/openzfs/cmd/zed/zed_disk_event.c b/sys/contrib/openzfs/cmd/zed/zed_disk_event.c new file mode 100644 index 000000000000..174d24523253 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_disk_event.c @@ -0,0 +1,416 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, 2017, Intel Corporation. + */ + +#ifdef HAVE_LIBUDEV + +#include <errno.h> +#include <fcntl.h> +#include <libnvpair.h> +#include <libudev.h> +#include <libzfs.h> +#include <libzutil.h> +#include <pthread.h> +#include <stdlib.h> +#include <string.h> + +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> + +#include "zed_log.h" +#include "zed_disk_event.h" +#include "agents/zfs_agents.h" + +/* + * Portions of ZED need to see disk events for disks belonging to ZFS pools. + * A libudev monitor is established to monitor block device actions and pass + * them on to internal ZED logic modules. Initially, zfs_mod.c is the only + * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM + * module responsible for handling disk events for ZFS. + */ + +pthread_t g_mon_tid; +struct udev *g_udev; +struct udev_monitor *g_mon; + + +#define DEV_BYID_PATH "/dev/disk/by-id/" + +/* 64MB is minimum usable disk for ZFS */ +#define MINIMUM_SECTORS 131072 + + +/* + * Post disk event to SLM module + * + * occurs in the context of monitor thread + */ +static void +zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + char *strval; + uint64_t numval; + + zed_log_msg(LOG_INFO, "zed_disk_event:"); + zed_log_msg(LOG_INFO, "\tclass: %s", class); + zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass); + if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval); + if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval); + if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval); + if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); + if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval); + if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval); + if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval); + + (void) zfs_agent_post_event(class, subclass, nvl); +} + +/* + * dev_event_nvlist: place event schema into an nv pair list + * + * NAME VALUE (example) + * -------------- -------------------------------------------------------- + * DEV_NAME /dev/sdl + * DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/... + * DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC + * DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0 + * DEV_IS_PART --- + * DEV_SIZE 500107862016 + * ZFS_EV_POOL_GUID 17523635698032189180 + * ZFS_EV_VDEV_GUID 14663607734290803088 + */ +static nvlist_t * +dev_event_nvlist(struct udev_device *dev) +{ + nvlist_t *nvl; + char strval[128]; + const char *value, *path; + uint64_t guid; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + + if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0) + (void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval); + if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0) + (void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval); + if ((path = udev_device_get_devnode(dev)) != NULL) + (void) nvlist_add_string(nvl, DEV_NAME, path); + if ((value = udev_device_get_devpath(dev)) != NULL) + (void) nvlist_add_string(nvl, DEV_PATH, value); + value = udev_device_get_devtype(dev); + if ((value != NULL && strcmp("partition", value) == 0) || + (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER") + != NULL)) { + (void) nvlist_add_boolean(nvl, DEV_IS_PART); + } + if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) { + uint64_t numval = DEV_BSIZE; + + numval *= strtoull(value, NULL, 10); + (void) nvlist_add_uint64(nvl, DEV_SIZE, numval); + } + + /* + * Grab the pool and vdev guids from blkid cache + */ + value = udev_device_get_property_value(dev, "ID_FS_UUID"); + if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) + (void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid); + + value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB"); + if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) + (void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid); + + /* + * Either a vdev guid or a devid must be present for matching + */ + if (!nvlist_exists(nvl, DEV_IDENTIFIER) && + !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) { + nvlist_free(nvl); + return (NULL); + } + + return (nvl); +} + +/* + * Listen for block device uevents + */ +static void * +zed_udev_monitor(void *arg) +{ + struct udev_monitor *mon = arg; + char *tmp, *tmp2; + + zed_log_msg(LOG_INFO, "Waiting for new udev disk events..."); + + while (1) { + struct udev_device *dev; + const char *action, *type, *part, *sectors; + const char *bus, *uuid; + const char *class, *subclass; + nvlist_t *nvl; + boolean_t is_zfs = B_FALSE; + + /* allow a cancellation while blocked (recvmsg) */ + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + + /* blocks at recvmsg until an event occurs */ + if ((dev = udev_monitor_receive_device(mon)) == NULL) { + zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive " + "device error %d", errno); + continue; + } + + /* allow all steps to complete before a cancellation */ + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + + /* + * Strongly typed device is the preferred filter + */ + type = udev_device_get_property_value(dev, "ID_FS_TYPE"); + if (type != NULL && type[0] != '\0') { + if (strcmp(type, "zfs_member") == 0) { + is_zfs = B_TRUE; + } else { + /* not ours, so skip */ + zed_log_msg(LOG_INFO, "zed_udev_monitor: skip " + "%s (in use by %s)", + udev_device_get_devnode(dev), type); + udev_device_unref(dev); + continue; + } + } + + /* + * if this is a disk and it is partitioned, then the + * zfs label will reside in a DEVTYPE=partition and + * we can skip passing this event + */ + type = udev_device_get_property_value(dev, "DEVTYPE"); + part = udev_device_get_property_value(dev, + "ID_PART_TABLE_TYPE"); + if (type != NULL && type[0] != '\0' && + strcmp(type, "disk") == 0 && + part != NULL && part[0] != '\0') { + /* skip and wait for partition event */ + udev_device_unref(dev); + continue; + } + + /* + * ignore small partitions + */ + sectors = udev_device_get_property_value(dev, + "ID_PART_ENTRY_SIZE"); + if (sectors == NULL) + sectors = udev_device_get_sysattr_value(dev, "size"); + if (sectors != NULL && + strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) { + udev_device_unref(dev); + continue; + } + + /* + * If the blkid probe didn't find ZFS, then a persistent + * device id string is required in the message schema + * for matching with vdevs. Preflight here for expected + * udev information. + */ + bus = udev_device_get_property_value(dev, "ID_BUS"); + uuid = udev_device_get_property_value(dev, "DM_UUID"); + if (!is_zfs && (bus == NULL && uuid == NULL)) { + zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid " + "source", udev_device_get_devnode(dev)); + udev_device_unref(dev); + continue; + } + + action = udev_device_get_action(dev); + if (strcmp(action, "add") == 0) { + class = EC_DEV_ADD; + subclass = ESC_DISK; + } else if (strcmp(action, "remove") == 0) { + class = EC_DEV_REMOVE; + subclass = ESC_DISK; + } else if (strcmp(action, "change") == 0) { + class = EC_DEV_STATUS; + subclass = ESC_DEV_DLE; + } else { + zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown", + action); + udev_device_unref(dev); + continue; + } + + /* + * Special case an EC_DEV_ADD for multipath devices + * + * When a multipath device is created, udev reports the + * following: + * + * 1. "add" event of the dm device for the multipath device + * (like /dev/dm-3). + * 2. "change" event to create the actual multipath device + * symlink (like /dev/mapper/mpatha). The event also + * passes back the relevant DM vars we care about, like + * DM_UUID. + * 3. Another "change" event identical to #2 (that we ignore). + * + * To get the behavior we want, we treat the "change" event + * in #2 as a "add" event; as if "/dev/mapper/mpatha" was + * a new disk being added. + */ + if (strcmp(class, EC_DEV_STATUS) == 0 && + udev_device_get_property_value(dev, "DM_UUID") && + udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) { + tmp = (char *)udev_device_get_devnode(dev); + tmp2 = zfs_get_underlying_path(tmp); + if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) { + /* + * We have a real underlying device, which + * means that this multipath "change" event is + * an "add" event. + * + * If the multipath device and the underlying + * dev are the same name (i.e. /dev/dm-5), then + * there is no real underlying disk for this + * multipath device, and so this "change" event + * really is a multipath removal. + */ + class = EC_DEV_ADD; + subclass = ESC_DISK; + } else { + tmp = (char *) + udev_device_get_property_value(dev, + "DM_NR_VALID_PATHS"); + /* treat as a multipath remove */ + if (tmp != NULL && strcmp(tmp, "0") == 0) { + class = EC_DEV_REMOVE; + subclass = ESC_DISK; + } + } + free(tmp2); + } + + /* + * Special case an EC_DEV_ADD for scsi_debug devices + * + * These devices require a udevadm trigger command after + * creation in order to register the vdev_id scsidebug alias + * rule (adds a persistent path (phys_path) used for fault + * management automated tests in the ZFS test suite. + * + * After udevadm trigger command, event registers as a "change" + * event but needs to instead be handled as another "add" event + * to allow for disk labeling and partitioning to occur. + */ + if (strcmp(class, EC_DEV_STATUS) == 0 && + udev_device_get_property_value(dev, "ID_VDEV") && + udev_device_get_property_value(dev, "ID_MODEL")) { + const char *id_model, *id_model_sd = "scsi_debug"; + + id_model = udev_device_get_property_value(dev, + "ID_MODEL"); + if (strcmp(id_model, id_model_sd) == 0) { + class = EC_DEV_ADD; + subclass = ESC_DISK; + } + } + + if ((nvl = dev_event_nvlist(dev)) != NULL) { + zed_udev_event(class, subclass, nvl); + nvlist_free(nvl); + } + + udev_device_unref(dev); + } + + return (NULL); +} + +int +zed_disk_event_init() +{ + int fd, fflags; + + if ((g_udev = udev_new()) == NULL) { + zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno); + return (-1); + } + + /* Set up a udev monitor for block devices */ + g_mon = udev_monitor_new_from_netlink(g_udev, "udev"); + udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk"); + udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", + "partition"); + udev_monitor_enable_receiving(g_mon); + + /* Make sure monitoring socket is blocking */ + fd = udev_monitor_get_fd(g_mon); + if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK) + (void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK); + + /* spawn a thread to monitor events */ + if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) { + udev_monitor_unref(g_mon); + udev_unref(g_udev); + zed_log_msg(LOG_WARNING, "pthread_create failed"); + return (-1); + } + + zed_log_msg(LOG_INFO, "zed_disk_event_init"); + + return (0); +} + +void +zed_disk_event_fini() +{ + /* cancel monitor thread at recvmsg() */ + (void) pthread_cancel(g_mon_tid); + (void) pthread_join(g_mon_tid, NULL); + + /* cleanup udev resources */ + udev_monitor_unref(g_mon); + udev_unref(g_udev); + + zed_log_msg(LOG_INFO, "zed_disk_event_fini"); +} + +#else + +#include "zed_disk_event.h" + +int +zed_disk_event_init() +{ + return (0); +} + +void +zed_disk_event_fini() +{ +} + +#endif /* HAVE_LIBUDEV */ |