aboutsummaryrefslogtreecommitdiff
path: root/sys/contrib/openzfs/module/zfs/spa_errlog.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/contrib/openzfs/module/zfs/spa_errlog.c')
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_errlog.c910
1 files changed, 845 insertions, 65 deletions
diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c
index c6b28ea7d1b8..9e5d1de63c0b 100644
--- a/sys/contrib/openzfs/module/zfs/spa_errlog.c
+++ b/sys/contrib/openzfs/module/zfs/spa_errlog.c
@@ -20,7 +20,8 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014, Delphix. All rights reserved.
+ * Copyright (c) 2021, George Amanakis. All rights reserved.
*/
/*
@@ -43,6 +44,16 @@
* calculation when the data is requested, storing the result so future queries
* will be faster.
*
+ * If the head_errlog feature is enabled, a different on-disk format is used.
+ * The error log of each head dataset is stored separately in the zap object
+ * and keyed by the head id. This enables listing every dataset affected in
+ * userland. In order to be able to track whether an error block has been
+ * modified or added to snapshots since it was marked as an error, a new tuple
+ * is introduced: zbookmark_err_phys_t. It allows the storage of the birth
+ * transaction group of an error block on-disk. The birth transaction group is
+ * used by check_filesystem() to assess whether this block was freed,
+ * re-written or added to a snapshot since its marking as an error.
+ *
* This log is then shipped into an nvlist where the key is the dataset name and
* the value is the object name. Userland is then responsible for uniquifying
* this list and displaying it to the user.
@@ -53,7 +64,17 @@
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/zio.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_objset.h>
+#include <sys/dbuf.h>
+/*
+ * spa_upgrade_errlog_limit : A zfs module parameter that controls the number
+ * of on-disk error log entries that will be converted to the new
+ * format when enabling head_errlog. Defaults to 0 which converts
+ * all log entries.
+ */
+static uint32_t spa_upgrade_errlog_limit = 0;
/*
* Convert a bookmark to a string.
@@ -67,9 +88,35 @@ bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
}
/*
- * Convert a string to a bookmark
+ * Convert an err_phys to a string.
+ */
+static void
+errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len)
+{
+ (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+ (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level,
+ (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth);
+}
+
+/*
+ * Convert a string to a err_phys.
+ */
+static void
+name_to_errphys(char *buf, zbookmark_err_phys_t *zep)
+{
+ zep->zb_object = zfs_strtonum(buf, &buf);
+ ASSERT(*buf == ':');
+ zep->zb_level = (int)zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zep->zb_blkid = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zep->zb_birth = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == '\0');
+}
+
+/*
+ * Convert a string to a bookmark.
*/
-#ifdef _KERNEL
static void
name_to_bookmark(char *buf, zbookmark_phys_t *zb)
{
@@ -82,8 +129,74 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb)
zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
ASSERT(*buf == '\0');
}
+
+#ifdef _KERNEL
+static void
+zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb)
+{
+ zb->zb_objset = dataset;
+ zb->zb_object = zep->zb_object;
+ zb->zb_level = zep->zb_level;
+ zb->zb_blkid = zep->zb_blkid;
+}
#endif
+static void
+name_to_object(char *buf, uint64_t *obj)
+{
+ *obj = zfs_strtonum(buf, &buf);
+ ASSERT(*buf == '\0');
+}
+
+static int
+get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
+ uint64_t *head_dataset_id)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_dataset_t *ds;
+ objset_t *os;
+
+ dsl_pool_config_enter(dp, FTAG);
+ int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_config_exit(dp, FTAG);
+ return (error);
+ }
+ ASSERT(head_dataset_id);
+ *head_dataset_id = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
+
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ return (error);
+ }
+
+ dnode_t *dn;
+ blkptr_t bp;
+
+ error = dnode_hold(os, zep->zb_object, FTAG, &dn);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ return (error);
+ }
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
+ NULL);
+
+ if (error == 0 && BP_IS_HOLE(&bp))
+ error = SET_ERROR(ENOENT);
+
+ zep->zb_birth = bp.blk_birth;
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ return (error);
+}
+
/*
* Log an uncorrectable error to the persistent error log. We add it to the
* spa's list of pending errors. The changes are actually synced out to disk
@@ -128,6 +241,276 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
mutex_exit(&spa->spa_errlist_lock);
}
+#ifdef _KERNEL
+static int
+find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
+ uint64_t *birth_txg)
+{
+ objset_t *os;
+ int error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ return (error);
+
+ dnode_t *dn;
+ blkptr_t bp;
+
+ error = dnode_hold(os, zep->zb_object, FTAG, &dn);
+ if (error != 0)
+ return (error);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
+ NULL);
+
+ if (error == 0 && BP_IS_HOLE(&bp))
+ error = SET_ERROR(ENOENT);
+
+ *birth_txg = bp.blk_birth;
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ return (error);
+}
+
+/*
+ * This function serves a double role. If only_count is true, it returns
+ * (in *count) how many times an error block belonging to this filesystem is
+ * referenced by snapshots or clones. If only_count is false, each time the
+ * error block is referenced by a snapshot or clone, it fills the userspace
+ * array at uaddr with the bookmarks of the error blocks. The array is filled
+ * from the back and *count is modified to be the number of unused entries at
+ * the beginning of the array.
+ */
+static int
+check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
+ uint64_t *count, void *uaddr, boolean_t only_count)
+{
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ uint64_t latest_txg;
+ uint64_t txg_to_consider = spa->spa_syncing_txg;
+ boolean_t check_snapshot = B_TRUE;
+ error = find_birth_txg(ds, zep, &latest_txg);
+ if (error == 0) {
+ if (zep->zb_birth == latest_txg) {
+ /* Block neither free nor rewritten. */
+ if (!only_count) {
+ zbookmark_phys_t zb;
+ zep_to_zb(head_ds, zep, &zb);
+ if (copyout(&zb, (char *)uaddr + (*count - 1)
+ * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EFAULT));
+ }
+ (*count)--;
+ } else {
+ (*count)++;
+ }
+ check_snapshot = B_FALSE;
+ } else {
+ ASSERT3U(zep->zb_birth, <, latest_txg);
+ txg_to_consider = latest_txg;
+ }
+ }
+
+ /* How many snapshots reference this block. */
+ uint64_t snap_count;
+ error = zap_count(spa->spa_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (snap_count == 0) {
+ /* File system has no snapshot. */
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ uint64_t *snap_obj_array = kmem_alloc(snap_count * sizeof (uint64_t),
+ KM_SLEEP);
+
+ int aff_snap_count = 0;
+ uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+
+ /* Check only snapshots created from this file system. */
+ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
+ snap_obj_txg <= txg_to_consider) {
+
+ dsl_dataset_rele(ds, FTAG);
+ error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds);
+ if (error != 0)
+ goto out;
+
+ if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds)
+ break;
+
+ boolean_t affected = B_TRUE;
+ if (check_snapshot) {
+ uint64_t blk_txg;
+ error = find_birth_txg(ds, zep, &blk_txg);
+ affected = (error == 0 && zep->zb_birth == blk_txg);
+ }
+
+ if (affected) {
+ snap_obj_array[aff_snap_count] = snap_obj;
+ aff_snap_count++;
+
+ if (!only_count) {
+ zbookmark_phys_t zb;
+ zep_to_zb(snap_obj, zep, &zb);
+ if (copyout(&zb, (char *)uaddr + (*count - 1) *
+ sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ (*count)--;
+ } else {
+ (*count)++;
+ }
+
+ /*
+ * Only clones whose origins were affected could also
+ * have affected snapshots.
+ */
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ error = check_filesystem(spa,
+ za.za_first_integer, zep,
+ count, uaddr, only_count);
+
+ if (error != 0) {
+ zap_cursor_fini(&zc);
+ goto out;
+ }
+ }
+ zap_cursor_fini(&zc);
+ }
+ snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ }
+ dsl_dataset_rele(ds, FTAG);
+
+out:
+ kmem_free(snap_obj_array, sizeof (*snap_obj_array));
+ return (error);
+}
+
+static int
+find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
+ uint64_t *top_affected_fs)
+{
+ uint64_t oldest_dsobj;
+ int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth,
+ &oldest_dsobj);
+ if (error != 0)
+ return (error);
+
+ dsl_dataset_t *ds;
+ error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj,
+ FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ *top_affected_fs =
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static int
+process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
+ uint64_t *count, void *uaddr, boolean_t only_count)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_pool_config_enter(dp, FTAG);
+ uint64_t top_affected_fs;
+
+ int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
+ if (error == 0)
+ error = check_filesystem(spa, top_affected_fs, zep, count,
+ uaddr, only_count);
+
+ dsl_pool_config_exit(dp, FTAG);
+ return (error);
+}
+
+static uint64_t
+get_errlog_size(spa_t *spa, uint64_t spa_err_obj)
+{
+ if (spa_err_obj == 0)
+ return (0);
+ uint64_t total = 0;
+
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+
+ zap_cursor_t head_ds_cursor;
+ zap_attribute_t head_ds_attr;
+ zbookmark_err_phys_t head_ds_block;
+
+ uint64_t head_ds;
+ name_to_object(za.za_name, &head_ds);
+
+ for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset,
+ za.za_first_integer); zap_cursor_retrieve(&head_ds_cursor,
+ &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) {
+
+ name_to_errphys(head_ds_attr.za_name, &head_ds_block);
+ (void) process_error_block(spa, head_ds, &head_ds_block,
+ &total, NULL, B_TRUE);
+ }
+ zap_cursor_fini(&head_ds_cursor);
+ }
+ zap_cursor_fini(&zc);
+ return (total);
+}
+
+static uint64_t
+get_errlist_size(spa_t *spa, avl_tree_t *tree)
+{
+ if (avl_numnodes(tree) == 0)
+ return (0);
+ uint64_t total = 0;
+
+ spa_error_entry_t *se;
+ for (se = avl_first(tree); se != NULL; se = AVL_NEXT(tree, se)) {
+ zbookmark_err_phys_t zep;
+ zep.zb_object = se->se_bookmark.zb_object;
+ zep.zb_level = se->se_bookmark.zb_level;
+ zep.zb_blkid = se->se_bookmark.zb_blkid;
+
+ /*
+ * If we cannot find out the head dataset and birth txg of
+ * the present error block, we opt not to error out. In the
+ * next pool sync this information will be retrieved by
+ * sync_error_list() and written to the on-disk error log.
+ */
+ uint64_t head_ds_obj;
+ if (get_head_and_birth_txg(spa, &zep,
+ se->se_bookmark.zb_objset, &head_ds_obj) == 0)
+ (void) process_error_block(spa, head_ds_obj, &zep,
+ &total, NULL, B_TRUE);
+ }
+ return (total);
+}
+#endif
+
/*
* Return the number of errors currently in the error log. This is actually the
* sum of both the last log and the current log, since we don't know the union
@@ -136,83 +519,284 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
uint64_t
spa_get_errlog_size(spa_t *spa)
{
- uint64_t total = 0, count;
+ uint64_t total = 0;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ mutex_enter(&spa->spa_errlog_lock);
+ uint64_t count;
+ if (spa->spa_errlog_scrub != 0 &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+ &count) == 0)
+ total += count;
+
+ if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+ &count) == 0)
+ total += count;
+ mutex_exit(&spa->spa_errlog_lock);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ total += avl_numnodes(&spa->spa_errlist_last);
+ total += avl_numnodes(&spa->spa_errlist_scrub);
+ mutex_exit(&spa->spa_errlist_lock);
+ } else {
+#ifdef _KERNEL
+ mutex_enter(&spa->spa_errlog_lock);
+ total += get_errlog_size(spa, spa->spa_errlog_last);
+ total += get_errlog_size(spa, spa->spa_errlog_scrub);
+ mutex_exit(&spa->spa_errlog_lock);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ total += get_errlist_size(spa, &spa->spa_errlist_last);
+ total += get_errlist_size(spa, &spa->spa_errlist_scrub);
+ mutex_exit(&spa->spa_errlist_lock);
+#endif
+ }
+ return (total);
+}
- mutex_enter(&spa->spa_errlog_lock);
- if (spa->spa_errlog_scrub != 0 &&
- zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
- &count) == 0)
- total += count;
-
- if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
- zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
- &count) == 0)
- total += count;
- mutex_exit(&spa->spa_errlog_lock);
+/*
+ * This function sweeps through an on-disk error log and stores all bookmarks
+ * as error bookmarks in a new ZAP object. At the end we discard the old one,
+ * and spa_update_errlog() will set the spa's on-disk error log to new ZAP
+ * object.
+ */
+static void
+sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
+ dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zbookmark_phys_t zb;
+ uint64_t count;
- mutex_enter(&spa->spa_errlist_lock);
- total += avl_numnodes(&spa->spa_errlist_last);
- total += avl_numnodes(&spa->spa_errlist_scrub);
- mutex_exit(&spa->spa_errlist_lock);
+ *newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
+ DMU_OT_NONE, 0, tx);
- return (total);
+ /*
+ * If we cannnot perform the upgrade we should clear the old on-disk
+ * error logs.
+ */
+ if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) {
+ VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
+ return;
+ }
+
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ if (spa_upgrade_errlog_limit != 0 &&
+ zc.zc_cd == spa_upgrade_errlog_limit)
+ break;
+
+ name_to_bookmark(za.za_name, &zb);
+
+ zbookmark_err_phys_t zep;
+ zep.zb_object = zb.zb_object;
+ zep.zb_level = zb.zb_level;
+ zep.zb_blkid = zb.zb_blkid;
+
+ /*
+ * We cannot use get_head_and_birth_txg() because it will
+ * acquire the pool config lock, which we already have. In case
+ * of an error we simply continue.
+ */
+ uint64_t head_dataset_obj;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_dataset_t *ds;
+ objset_t *os;
+
+ int error = dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds);
+ if (error != 0)
+ continue;
+
+ head_dataset_obj =
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
+
+ /*
+ * The objset and the dnode are required for getting the block
+ * pointer, which is used to determine if BP_IS_HOLE(). If
+ * getting the objset or the dnode fails, do not create a
+ * zap entry (presuming we know the dataset) as this may create
+ * spurious errors that we cannot ever resolve. If an error is
+ * truly persistent, it should re-appear after a scan.
+ */
+ if (dmu_objset_from_ds(ds, &os) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ continue;
+ }
+
+ dnode_t *dn;
+ blkptr_t bp;
+
+ if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ continue;
+ }
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp,
+ NULL, NULL);
+
+ zep.zb_birth = bp.blk_birth;
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+
+ if (error != 0 || BP_IS_HOLE(&bp))
+ continue;
+
+ uint64_t err_obj;
+ error = zap_lookup_int_key(spa->spa_meta_objset, *newobj,
+ head_dataset_obj, &err_obj);
+
+ if (error == ENOENT) {
+ err_obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
+
+ (void) zap_update_int_key(spa->spa_meta_objset,
+ *newobj, head_dataset_obj, err_obj, tx);
+ }
+
+ char buf[64];
+ char *name = "";
+ errphys_to_name(&zep, buf, sizeof (buf));
+
+ (void) zap_update(spa->spa_meta_objset, err_obj,
+ buf, 1, strlen(name) + 1, name, tx);
+ }
+ zap_cursor_fini(&zc);
+
+ VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
+}
+
+void
+spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx)
+{
+ uint64_t newobj = 0;
+
+ mutex_enter(&spa->spa_errlog_lock);
+ if (spa->spa_errlog_last != 0) {
+ sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx);
+ spa->spa_errlog_last = newobj;
+ }
+
+ if (spa->spa_errlog_scrub != 0) {
+ sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx);
+ spa->spa_errlog_scrub = newobj;
+ }
+ mutex_exit(&spa->spa_errlog_lock);
}
#ifdef _KERNEL
+/*
+ * If an error block is shared by two datasets it will be counted twice. For
+ * detailed message see spa_get_errlog_size() above.
+ */
static int
-process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
{
zap_cursor_t zc;
zap_attribute_t za;
- zbookmark_phys_t zb;
if (obj == 0)
return (0);
- for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ if (*count == 0) {
+ zap_cursor_fini(&zc);
+ return (SET_ERROR(ENOMEM));
+ }
+
+ zbookmark_phys_t zb;
+ name_to_bookmark(za.za_name, &zb);
+
+ if (copyout(&zb, (char *)uaddr +
+ (*count - 1) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0) {
+ zap_cursor_fini(&zc);
+ return (SET_ERROR(EFAULT));
+ }
+ *count -= 1;
- if (*count == 0) {
- zap_cursor_fini(&zc);
- return (SET_ERROR(ENOMEM));
}
+ zap_cursor_fini(&zc);
+ return (0);
+ }
- name_to_bookmark(za.za_name, &zb);
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
- if (copyout(&zb, (char *)addr +
- (*count - 1) * sizeof (zbookmark_phys_t),
- sizeof (zbookmark_phys_t)) != 0) {
- zap_cursor_fini(&zc);
- return (SET_ERROR(EFAULT));
+ zap_cursor_t head_ds_cursor;
+ zap_attribute_t head_ds_attr;
+
+ uint64_t head_ds_err_obj = za.za_first_integer;
+ uint64_t head_ds;
+ name_to_object(za.za_name, &head_ds);
+ for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset,
+ head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor,
+ &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) {
+
+ zbookmark_err_phys_t head_ds_block;
+ name_to_errphys(head_ds_attr.za_name, &head_ds_block);
+ int error = process_error_block(spa, head_ds,
+ &head_ds_block, count, uaddr, B_FALSE);
+
+ if (error != 0) {
+ zap_cursor_fini(&head_ds_cursor);
+ zap_cursor_fini(&zc);
+ return (error);
+ }
}
-
- *count -= 1;
+ zap_cursor_fini(&head_ds_cursor);
}
-
zap_cursor_fini(&zc);
-
return (0);
}
static int
-process_error_list(avl_tree_t *list, void *addr, size_t *count)
+process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count)
{
spa_error_entry_t *se;
- for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ for (se = avl_first(list); se != NULL;
+ se = AVL_NEXT(list, se)) {
- if (*count == 0)
- return (SET_ERROR(ENOMEM));
+ if (*count == 0)
+ return (SET_ERROR(ENOMEM));
- if (copyout(&se->se_bookmark, (char *)addr +
- (*count - 1) * sizeof (zbookmark_phys_t),
- sizeof (zbookmark_phys_t)) != 0)
- return (SET_ERROR(EFAULT));
+ if (copyout(&se->se_bookmark, (char *)uaddr +
+ (*count - 1) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0)
+ return (SET_ERROR(EFAULT));
- *count -= 1;
+ *count -= 1;
+ }
+ return (0);
}
+ for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+ zbookmark_err_phys_t zep;
+ zep.zb_object = se->se_bookmark.zb_object;
+ zep.zb_level = se->se_bookmark.zb_level;
+ zep.zb_blkid = se->se_bookmark.zb_blkid;
+
+ uint64_t head_ds_obj;
+ int error = get_head_and_birth_txg(spa, &zep,
+ se->se_bookmark.zb_objset, &head_ds_obj);
+ if (error != 0)
+ return (error);
+
+ error = process_error_block(spa, head_ds_obj, &zep, count,
+ uaddr, B_FALSE);
+ if (error != 0)
+ return (error);
+ }
return (0);
}
#endif
@@ -229,7 +813,7 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count)
* the error list lock when we are finished.
*/
int
-spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
{
int ret = 0;
@@ -244,10 +828,10 @@ spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
mutex_enter(&spa->spa_errlist_lock);
if (!ret)
- ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+ ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr,
count);
if (!ret)
- ret = process_error_list(&spa->spa_errlist_last, uaddr,
+ ret = process_error_list(spa, &spa->spa_errlist_last, uaddr,
count);
mutex_exit(&spa->spa_errlist_lock);
@@ -299,35 +883,91 @@ spa_errlog_drain(spa_t *spa)
/*
* Process a list of errors into the current on-disk log.
*/
-static void
+void
sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
{
spa_error_entry_t *se;
char buf[64];
void *cookie;
- if (avl_numnodes(t) != 0) {
- /* create log if necessary */
- if (*obj == 0)
- *obj = zap_create(spa->spa_meta_objset,
- DMU_OT_ERROR_LOG, DMU_OT_NONE,
- 0, tx);
+ if (avl_numnodes(t) == 0)
+ return;
+
+ /* create log if necessary */
+ if (*obj == 0)
+ *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
+ DMU_OT_NONE, 0, tx);
- /* add errors to the current log */
+ /* add errors to the current log */
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
char *name = se->se_name ? se->se_name : "";
bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
+ (void) zap_update(spa->spa_meta_objset, *obj, buf, 1,
+ strlen(name) + 1, name, tx);
+ }
+ } else {
+ for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
+ char *name = se->se_name ? se->se_name : "";
+
+ zbookmark_err_phys_t zep;
+ zep.zb_object = se->se_bookmark.zb_object;
+ zep.zb_level = se->se_bookmark.zb_level;
+ zep.zb_blkid = se->se_bookmark.zb_blkid;
+
+ /*
+ * If we cannot find out the head dataset and birth txg
+ * of the present error block, we simply continue.
+ * Reinserting that error block to the error lists,
+ * even if we are not syncing the final txg, results
+ * in duplicate posting of errors.
+ */
+ uint64_t head_dataset_obj;
+ int error = get_head_and_birth_txg(spa, &zep,
+ se->se_bookmark.zb_objset, &head_dataset_obj);
+ if (error != 0)
+ continue;
+
+ uint64_t err_obj;
+ error = zap_lookup_int_key(spa->spa_meta_objset,
+ *obj, head_dataset_obj, &err_obj);
+
+ if (error == ENOENT) {
+ err_obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
+
+ (void) zap_update_int_key(spa->spa_meta_objset,
+ *obj, head_dataset_obj, err_obj, tx);
+ }
+ errphys_to_name(&zep, buf, sizeof (buf));
+
(void) zap_update(spa->spa_meta_objset,
- *obj, buf, 1, strlen(name) + 1, name, tx);
+ err_obj, buf, 1, strlen(name) + 1, name, tx);
}
+ }
+ /* purge the error list */
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+}
- /* purge the error list */
- cookie = NULL;
- while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(se, sizeof (spa_error_entry_t));
+static void
+delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ VERIFY0(dmu_object_free(spa->spa_meta_objset,
+ za.za_first_integer, tx));
+ }
+ zap_cursor_fini(&zc);
}
+ VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
}
/*
@@ -378,8 +1018,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
*/
if (scrub_finished) {
if (spa->spa_errlog_last != 0)
- VERIFY(dmu_object_free(spa->spa_meta_objset,
- spa->spa_errlog_last, tx) == 0);
+ delete_errlog(spa, spa->spa_errlog_last, tx);
spa->spa_errlog_last = spa->spa_errlog_scrub;
spa->spa_errlog_scrub = 0;
@@ -406,6 +1045,137 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
mutex_exit(&spa->spa_errlog_lock);
}
+static void
+delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds,
+ dmu_tx_t *tx)
+{
+ if (spa_err_obj == 0)
+ return;
+
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+ uint64_t head_ds;
+ name_to_object(za.za_name, &head_ds);
+ if (head_ds == ds) {
+ (void) zap_remove(spa->spa_meta_objset, spa_err_obj,
+ za.za_name, tx);
+ VERIFY0(dmu_object_free(spa->spa_meta_objset,
+ za.za_first_integer, tx));
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+}
+
+void
+spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx)
+{
+ mutex_enter(&spa->spa_errlog_lock);
+ delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx);
+ delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx);
+ mutex_exit(&spa->spa_errlog_lock);
+}
+
+static int
+find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head,
+ uint64_t *txg)
+{
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ int error = dsl_dataset_hold_obj(dp, old_head, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+
+ while (prev_obj != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ if ((error = dsl_dataset_hold_obj(dp, prev_obj,
+ FTAG, &ds)) == 0 &&
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head)
+ break;
+
+ if (error != 0)
+ return (error);
+
+ prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ }
+ dsl_dataset_rele(ds, FTAG);
+ ASSERT(prev_obj != 0);
+ *txg = prev_obj_txg;
+ return (0);
+}
+
+static void
+swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t
+ old_head, dmu_tx_t *tx)
+{
+ if (spa_err_obj == 0)
+ return;
+
+ uint64_t old_head_errlog;
+ int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj,
+ old_head, &old_head_errlog);
+
+ /* If no error log, then there is nothing to do. */
+ if (error != 0)
+ return;
+
+ uint64_t txg;
+ error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg);
+ if (error != 0)
+ return;
+
+ /*
+ * Create an error log if the file system being promoted does not
+ * already have one.
+ */
+ uint64_t new_head_errlog;
+ error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head,
+ &new_head_errlog);
+
+ if (error != 0) {
+ new_head_errlog = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
+
+ (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj,
+ new_head, new_head_errlog, tx);
+ }
+
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zbookmark_err_phys_t err_block;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+
+ char *name = "";
+ name_to_errphys(za.za_name, &err_block);
+ if (err_block.zb_birth < txg) {
+ (void) zap_update(spa->spa_meta_objset, new_head_errlog,
+ za.za_name, 1, strlen(name) + 1, name, tx);
+
+ (void) zap_remove(spa->spa_meta_objset, old_head_errlog,
+ za.za_name, tx);
+ }
+ }
+ zap_cursor_fini(&zc);
+}
+
+void
+spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds,
+ dmu_tx_t *tx)
+{
+ mutex_enter(&spa->spa_errlog_lock);
+ swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx);
+ swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx);
+ mutex_exit(&spa->spa_errlog_lock);
+}
+
#if defined(_KERNEL)
/* error handling */
EXPORT_SYMBOL(spa_log_error);
@@ -415,4 +1185,14 @@ EXPORT_SYMBOL(spa_errlog_rotate);
EXPORT_SYMBOL(spa_errlog_drain);
EXPORT_SYMBOL(spa_errlog_sync);
EXPORT_SYMBOL(spa_get_errlists);
+EXPORT_SYMBOL(spa_delete_dataset_errlog);
+EXPORT_SYMBOL(spa_swap_errlog);
+EXPORT_SYMBOL(sync_error_list);
+EXPORT_SYMBOL(spa_upgrade_errlog);
#endif
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, INT, ZMOD_RW,
+ "Limit the number of errors which will be upgraded to the new "
+ "on-disk error log when enabling head_errlog");
+/* END CSTYLED */