aboutsummaryrefslogtreecommitdiff
path: root/stand/libsa/zfs/zfsimpl.c
diff options
context:
space:
mode:
Diffstat (limited to 'stand/libsa/zfs/zfsimpl.c')
-rw-r--r--stand/libsa/zfs/zfsimpl.c227
1 files changed, 146 insertions, 81 deletions
diff --git a/stand/libsa/zfs/zfsimpl.c b/stand/libsa/zfs/zfsimpl.c
index 971d71d098d3..f15d9b016068 100644
--- a/stand/libsa/zfs/zfsimpl.c
+++ b/stand/libsa/zfs/zfsimpl.c
@@ -107,11 +107,6 @@ typedef struct indirect_vsd {
} indirect_vsd_t;
/*
- * List of all vdevs, chained through v_alllink.
- */
-static vdev_list_t zfs_vdevs;
-
-/*
* List of supported read-incompatible ZFS features. Do not add here features
* marked as ZFEATURE_FLAG_READONLY_COMPAT, they are irrelevant for read-only!
*/
@@ -167,7 +162,6 @@ vdev_indirect_mapping_entry_phys_t *
static void
zfs_init(void)
{
- STAILQ_INIT(&zfs_vdevs);
STAILQ_INIT(&zfs_pools);
dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
@@ -839,16 +833,27 @@ vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
return (kid->v_read(kid, bp, buf, offset, bytes));
}
+/*
+ * List of vdevs that were fully initialized from their own label, but later a
+ * newer label was found that obsoleted the stale label, freeing its
+ * configuration tree. We keep those vdevs around, since a new configuration
+ * may include them.
+ */
+static vdev_list_t orphans = STAILQ_HEAD_INITIALIZER(orphans);
+
static vdev_t *
-vdev_find(uint64_t guid)
+vdev_find(vdev_list_t *list, uint64_t guid)
{
- vdev_t *vdev;
+ vdev_t *vdev, *safe;
- STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
+ STAILQ_FOREACH_SAFE(vdev, list, v_childlink, safe) {
if (vdev->v_guid == guid)
return (vdev);
+ if ((vdev = vdev_find(&vdev->v_children, guid)) != NULL)
+ return (vdev);
+ }
- return (0);
+ return (NULL);
}
static vdev_t *
@@ -857,6 +862,11 @@ vdev_create(uint64_t guid, vdev_read_t *_read)
vdev_t *vdev;
vdev_indirect_config_t *vic;
+ if ((vdev = vdev_find(&orphans, guid))) {
+ STAILQ_REMOVE(&orphans, vdev, vdev, v_childlink);
+ return (vdev);
+ }
+
vdev = calloc(1, sizeof(vdev_t));
if (vdev != NULL) {
STAILQ_INIT(&vdev->v_children);
@@ -871,7 +881,6 @@ vdev_create(uint64_t guid, vdev_read_t *_read)
if (_read != NULL) {
vic = &vdev->vdev_indirect_config;
vic->vic_prev_indirect_vdev = UINT64_MAX;
- STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
}
}
@@ -1035,22 +1044,19 @@ vdev_init(uint64_t guid, const nvlist_t *nvlist, vdev_t **vdevp)
* STAILQ_INSERT_AFTER.
*/
static vdev_t *
-vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
+vdev_find_previous(vdev_t *top_vdev, uint64_t id)
{
vdev_t *v, *previous;
- if (STAILQ_EMPTY(&top_vdev->v_children))
- return (NULL);
-
previous = NULL;
STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
- if (v->v_id > vdev->v_id)
+ if (v->v_id > id)
return (previous);
- if (v->v_id == vdev->v_id)
+ if (v->v_id == id)
return (v);
- if (v->v_id < vdev->v_id)
+ if (v->v_id < id)
previous = v;
}
return (previous);
@@ -1072,7 +1078,7 @@ vdev_child_count(vdev_t *vdev)
/*
* Insert vdev into top_vdev children list. List is ordered by v_id.
*/
-static void
+static vdev_t *
vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
{
vdev_t *previous;
@@ -1085,7 +1091,7 @@ vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
* so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER
* as STAILQ does not have insert before.
*/
- previous = vdev_find_previous(top_vdev, vdev);
+ previous = vdev_find_previous(top_vdev, vdev->v_id);
if (previous == NULL) {
STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
@@ -1094,7 +1100,8 @@ vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
* This vdev was configured from label config,
* do not insert duplicate.
*/
- return;
+ free(vdev);
+ return (previous);
} else {
STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev,
v_childlink);
@@ -1103,26 +1110,28 @@ vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
count = vdev_child_count(top_vdev);
if (top_vdev->v_nchildren < count)
top_vdev->v_nchildren = count;
+ return (vdev);
}
static int
-vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t txg,
- const nvlist_t *nvlist)
+vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t label_guid,
+ uint64_t txg, const nvlist_t *nvlist)
{
vdev_t *top_vdev, *vdev;
nvlist_t **kids = NULL;
int rc, nkids;
/* Get top vdev. */
- top_vdev = vdev_find(top_guid);
+ top_vdev = vdev_find(&spa->spa_root_vdev->v_children, top_guid);
if (top_vdev == NULL) {
rc = vdev_init(top_guid, nvlist, &top_vdev);
if (rc != 0)
return (rc);
top_vdev->v_spa = spa;
top_vdev->v_top = top_vdev;
+ top_vdev->v_label = label_guid;
top_vdev->v_txg = txg;
- vdev_insert(spa->spa_root_vdev, top_vdev);
+ (void )vdev_insert(spa->spa_root_vdev, top_vdev);
}
/* Add children if there are any. */
@@ -1143,7 +1152,7 @@ vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t txg,
vdev->v_spa = spa;
vdev->v_top = top_vdev;
- vdev_insert(top_vdev, vdev);
+ vdev = vdev_insert(top_vdev, vdev);
}
} else {
/*
@@ -1162,30 +1171,6 @@ done:
return (rc);
}
-static int
-vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
-{
- uint64_t pool_guid, top_guid, txg;
- nvlist_t *vdevs;
- int rc;
-
- if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
- NULL, &pool_guid, NULL) ||
- nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
- NULL, &top_guid, NULL) ||
- nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
- NULL, &txg, NULL) != 0 ||
- nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
- NULL, &vdevs, NULL)) {
- printf("ZFS: can't find vdev details\n");
- return (ENOENT);
- }
-
- rc = vdev_from_nvlist(spa, top_guid, txg, vdevs);
- nvlist_destroy(vdevs);
- return (rc);
-}
-
static void
vdev_set_state(vdev_t *vdev)
{
@@ -1232,14 +1217,14 @@ vdev_set_state(vdev_t *vdev)
}
static int
-vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist)
+vdev_update_from_nvlist(vdev_t *root, uint64_t top_guid, const nvlist_t *nvlist)
{
vdev_t *vdev;
nvlist_t **kids = NULL;
int rc, nkids;
/* Update top vdev. */
- vdev = vdev_find(top_guid);
+ vdev = vdev_find(&root->v_children, top_guid);
if (vdev != NULL)
vdev_set_initial_state(vdev, nvlist);
@@ -1255,7 +1240,7 @@ vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist)
if (rc != 0)
break;
- vdev = vdev_find(guid);
+ vdev = vdev_find(&root->v_children, guid);
if (vdev != NULL)
vdev_set_initial_state(vdev, kids[i]);
}
@@ -1271,10 +1256,6 @@ vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist)
return (rc);
}
-/*
- * Shall not be called on root vdev, that is not linked into zfs_vdevs.
- * See comment in vdev_create().
- */
static void
vdev_free(struct vdev *vdev)
{
@@ -1282,8 +1263,10 @@ vdev_free(struct vdev *vdev)
STAILQ_FOREACH_SAFE(kid, &vdev->v_children, v_childlink, safe)
vdev_free(kid);
- STAILQ_REMOVE(&zfs_vdevs, vdev, vdev, v_alllink);
- free(vdev);
+ if (vdev->v_phys_read != NULL)
+ STAILQ_INSERT_HEAD(&orphans, vdev, v_childlink);
+ else
+ free(vdev);
}
static int
@@ -1329,15 +1312,16 @@ vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
NULL, &guid, NULL);
if (rc != 0)
break;
- vdev = vdev_find(guid);
+ vdev = vdev_find(&spa->spa_root_vdev->v_children, guid);
/*
* Top level vdev is missing, create it.
* XXXGL: how can this happen?
*/
if (vdev == NULL)
- rc = vdev_from_nvlist(spa, guid, 0, kids[i]);
+ rc = vdev_from_nvlist(spa, guid, 0, 0, kids[i]);
else
- rc = vdev_update_from_nvlist(guid, kids[i]);
+ rc = vdev_update_from_nvlist(spa->spa_root_vdev, guid,
+ kids[i]);
if (rc != 0)
break;
}
@@ -1355,6 +1339,53 @@ vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
return (rc);
}
+static bool
+nvlist_find_child_guid(const nvlist_t *nvlist, uint64_t guid)
+{
+ nvlist_t **kids = NULL;
+ int nkids, i;
+ bool rv = false;
+
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
+ &nkids, &kids, NULL) != 0)
+ nkids = 0;
+
+ for (i = 0; i < nkids; i++) {
+ uint64_t kid_guid;
+
+ if (nvlist_find(kids[i], ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
+ NULL, &kid_guid, NULL) != 0)
+ break;
+ if (kid_guid == guid)
+ rv = true;
+ else
+ rv = nvlist_find_child_guid(kids[i], guid);
+ if (rv)
+ break;
+ }
+
+ for (i = 0; i < nkids; i++)
+ nvlist_destroy(kids[i]);
+ free(kids);
+
+ return (rv);
+}
+
+static bool
+nvlist_find_vdev_guid(const nvlist_t *nvlist, uint64_t guid)
+{
+ nvlist_t *vdevs;
+ bool rv;
+
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, NULL,
+ &vdevs, NULL) != 0)
+ return (false);
+ rv = nvlist_find_child_guid(vdevs, guid);
+ nvlist_destroy(vdevs);
+
+ return (rv);
+}
+
static spa_t *
spa_find_by_guid(uint64_t guid)
{
@@ -2023,8 +2054,8 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
{
vdev_t vtmp;
spa_t *spa;
- vdev_t *vdev;
- nvlist_t *nvl;
+ vdev_t *vdev, *top;
+ nvlist_t *nvl, *vdevs;
uint64_t val;
uint64_t guid, pool_guid, top_guid, txg;
const char *pool_name;
@@ -2083,6 +2114,7 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
NULL, &txg, NULL) != 0 ||
+ txg == 0 ||
nvlist_find(nvl, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
NULL, &top_guid, NULL) != 0 ||
nvlist_find(nvl, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
@@ -2092,7 +2124,7 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
nvlist_find(nvl, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
NULL, &guid, NULL) != 0) {
/*
- * Cache and spare devices end up here - just ignore
+ * Cache, spare and replaced devices end up here - just ignore
* them.
*/
nvlist_destroy(nvl);
@@ -2119,22 +2151,47 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
nvlist_destroy(nvl);
return (ENOMEM);
}
- } else {
- struct vdev *kid;
-
- STAILQ_FOREACH(kid, &spa->spa_root_vdev->v_children,
- v_childlink)
- if (kid->v_guid == top_guid && kid->v_txg < txg) {
- printf("ZFS: pool %s vdev %s ignoring stale "
- "label from txg 0x%jx, using 0x%jx@0x%jx\n",
- spa->spa_name, kid->v_name,
- kid->v_txg, guid, txg);
+ }
+
+ /*
+ * Check if configuration is already known. If configuration is known
+ * and txg numbers don't match, we got 2x2 scenarios here. First, is
+ * the label being read right now _newer_ than the one read before.
+ * Second, is the vdev that provided the stale label _present_ in the
+ * newer configuration. If neither is true, we completely ignore the
+ * label.
+ */
+ STAILQ_FOREACH(top, &spa->spa_root_vdev->v_children, v_childlink)
+ if (top->v_guid == top_guid) {
+ bool newer, present;
+
+ if (top->v_txg == txg)
+ break;
+ newer = (top->v_txg < txg);
+ present = newer ?
+ nvlist_find_vdev_guid(nvl, top->v_label) :
+ (vdev_find(&top->v_children, guid) != NULL);
+ printf("ZFS: pool %s vdev %s %s stale label from "
+ "0x%jx@0x%jx, %s 0x%jx@0x%jx\n",
+ spa->spa_name, top->v_name,
+ present ? "using" : "ignoring",
+ newer ? top->v_label : guid,
+ newer ? top->v_txg : txg,
+ present ? "referred by" : "using",
+ newer ? guid : top->v_label,
+ newer ? txg : top->v_txg);
+ if (newer) {
STAILQ_REMOVE(&spa->spa_root_vdev->v_children,
- kid, vdev, v_childlink);
- vdev_free(kid);
+ top, vdev, v_childlink);
+ vdev_free(top);
+ break;
+ } else if (present) {
break;
+ } else {
+ nvlist_destroy(nvl);
+ return (EIO);
}
- }
+ }
/*
* Get the vdev tree and create our in-core copy of it.
@@ -2142,14 +2199,22 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
* be some kind of alias (overlapping slices, dangerously dedicated
* disks etc).
*/
- vdev = vdev_find(guid);
+ vdev = vdev_find(&spa->spa_root_vdev->v_children, guid);
/* Has this vdev already been inited? */
if (vdev && vdev->v_phys_read) {
nvlist_destroy(nvl);
return (EIO);
}
- rc = vdev_init_from_label(spa, nvl);
+ if (nvlist_find(nvl, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, NULL,
+ &vdevs, NULL)) {
+ printf("ZFS: can't find vdev details\n");
+ nvlist_destroy(nvl);
+ return (ENOENT);
+ }
+
+ rc = vdev_from_nvlist(spa, top_guid, guid, txg, vdevs);
+ nvlist_destroy(vdevs);
nvlist_destroy(nvl);
if (rc != 0)
return (rc);
@@ -2158,7 +2223,7 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
* We should already have created an incomplete vdev for this
* vdev. Find it and initialise it with our read proc.
*/
- vdev = vdev_find(guid);
+ vdev = vdev_find(&spa->spa_root_vdev->v_children, guid);
if (vdev != NULL) {
vdev->v_phys_read = _read;
vdev->v_phys_write = _write;