author     Martin Matuska <mm@FreeBSD.org>  2022-05-18 22:54:40 +0000
committer  Martin Matuska <mm@FreeBSD.org>  2022-05-18 22:55:59 +0000
commit     716fd348e01c5f2ba125f878a634a753436c2994 (patch)
tree       0d738baf7a9ccfd90fa1e622f67e0399f306f024 /sys/contrib/openzfs/module/zfs
parent     4e2d3f26bd12610ef8672eefb02814b882a4c29b (diff)
parent     c0cf6ed6792e545fd614c2a88cb53756db7e03f8 (diff)
download   src-716fd348e01c5f2ba125f878a634a753436c2994.tar.gz
           src-716fd348e01c5f2ba125f878a634a753436c2994.zip
zfs: merge openzfs/zfs@c0cf6ed67
Notable upstream pull request merges:
  #10662 zvol_wait: Ignore locked zvols
  #12789 Improve log spacemap load time
  #12812 Improved zpool status output, list all affected datasets
  #13277 FreeBSD: Use NDFREE_PNBUF if available
  #13302 Make zfs_max_recordsize default to 16M
  #13311 Fix error handling in FreeBSD's get/putpages VOPs
  #13345 FreeBSD: Fix translation from ABD to physical pages
  #13373 zfs: holds: dequadratify
  #13375 Corrected edge case in uncompressed ARC->L2ARC handling
  #13388 Improve mg_aliquot math
  #13405 Reduce dbuf_find() lock contention
  #13406 FreeBSD: use zero_region instead of allocating a dedicated page

Obtained from:  OpenZFS
OpenZFS commit: c0cf6ed6792e545fd614c2a88cb53756db7e03f8
Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
-rw-r--r--  sys/contrib/openzfs/module/zfs/Makefile.in         | 158
-rw-r--r--  sys/contrib/openzfs/module/zfs/arc.c                |  47
-rw-r--r--  sys/contrib/openzfs/module/zfs/dbuf.c               |  28
-rw-r--r--  sys/contrib/openzfs/module/zfs/dbuf_stats.c         |   4
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu.c                |   2
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_dataset.c        |  65
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_destroy.c        |   3
-rw-r--r--  sys/contrib/openzfs/module/zfs/metaslab.c           | 141
-rw-r--r--  sys/contrib/openzfs/module/zfs/sa.c                 |   4
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa.c                |   2
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_errlog.c         | 910
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_log_spacemap.c   | 231
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev.c               |   7
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_removal.c       |   5
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfeature.c           |   7
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_ioctl.c          |   2
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_vnops.c          |   8
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio.c                |   9
-rw-r--r--  sys/contrib/openzfs/module/zfs/zvol.c               |   1
19 files changed, 1202 insertions, 432 deletions
diff --git a/sys/contrib/openzfs/module/zfs/Makefile.in b/sys/contrib/openzfs/module/zfs/Makefile.in
deleted file mode 100644
index 30dc91a7eb59..000000000000
--- a/sys/contrib/openzfs/module/zfs/Makefile.in
+++ /dev/null
@@ -1,158 +0,0 @@
-ifneq ($(KBUILD_EXTMOD),)
-src = @abs_srcdir@
-obj = @abs_builddir@
-mfdir = $(obj)
-else
-mfdir = $(srctree)/$(src)
-endif
-
-MODULE := zfs
-
-obj-$(CONFIG_ZFS) := $(MODULE).o
-
-# Suppress unused-value warnings in sparc64 architecture headers
-ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
-
-$(MODULE)-objs += abd.o
-$(MODULE)-objs += aggsum.o
-$(MODULE)-objs += arc.o
-$(MODULE)-objs += blkptr.o
-$(MODULE)-objs += bplist.o
-$(MODULE)-objs += bpobj.o
-$(MODULE)-objs += bptree.o
-$(MODULE)-objs += btree.o
-$(MODULE)-objs += bqueue.o
-$(MODULE)-objs += dataset_kstats.o
-$(MODULE)-objs += dbuf.o
-$(MODULE)-objs += dbuf_stats.o
-$(MODULE)-objs += ddt.o
-$(MODULE)-objs += ddt_zap.o
-$(MODULE)-objs += dmu.o
-$(MODULE)-objs += dmu_diff.o
-$(MODULE)-objs += dmu_object.o
-$(MODULE)-objs += dmu_objset.o
-$(MODULE)-objs += dmu_recv.o
-$(MODULE)-objs += dmu_redact.o
-$(MODULE)-objs += dmu_send.o
-$(MODULE)-objs += dmu_traverse.o
-$(MODULE)-objs += dmu_tx.o
-$(MODULE)-objs += dmu_zfetch.o
-$(MODULE)-objs += dnode.o
-$(MODULE)-objs += dnode_sync.o
-$(MODULE)-objs += dsl_bookmark.o
-$(MODULE)-objs += dsl_crypt.o
-$(MODULE)-objs += dsl_dataset.o
-$(MODULE)-objs += dsl_deadlist.o
-$(MODULE)-objs += dsl_deleg.o
-$(MODULE)-objs += dsl_destroy.o
-$(MODULE)-objs += dsl_dir.o
-$(MODULE)-objs += dsl_pool.o
-$(MODULE)-objs += dsl_prop.o
-$(MODULE)-objs += dsl_scan.o
-$(MODULE)-objs += dsl_synctask.o
-$(MODULE)-objs += dsl_userhold.o
-$(MODULE)-objs += edonr_zfs.o
-$(MODULE)-objs += fm.o
-$(MODULE)-objs += gzip.o
-$(MODULE)-objs += hkdf.o
-$(MODULE)-objs += lz4.o
-$(MODULE)-objs += lz4_zfs.o
-$(MODULE)-objs += lzjb.o
-$(MODULE)-objs += metaslab.o
-$(MODULE)-objs += mmp.o
-$(MODULE)-objs += multilist.o
-$(MODULE)-objs += objlist.o
-$(MODULE)-objs += pathname.o
-$(MODULE)-objs += range_tree.o
-$(MODULE)-objs += refcount.o
-$(MODULE)-objs += rrwlock.o
-$(MODULE)-objs += sa.o
-$(MODULE)-objs += sha256.o
-$(MODULE)-objs += skein_zfs.o
-$(MODULE)-objs += spa.o
-$(MODULE)-objs += spa_boot.o
-$(MODULE)-objs += spa_checkpoint.o
-$(MODULE)-objs += spa_config.o
-$(MODULE)-objs += spa_errlog.o
-$(MODULE)-objs += spa_history.o
-$(MODULE)-objs += spa_log_spacemap.o
-$(MODULE)-objs += spa_misc.o
-$(MODULE)-objs += spa_stats.o
-$(MODULE)-objs += space_map.o
-$(MODULE)-objs += space_reftree.o
-$(MODULE)-objs += txg.o
-$(MODULE)-objs += uberblock.o
-$(MODULE)-objs += unique.o
-$(MODULE)-objs += vdev.o
-$(MODULE)-objs += vdev_cache.o
-$(MODULE)-objs += vdev_draid.o
-$(MODULE)-objs += vdev_draid_rand.o
-$(MODULE)-objs += vdev_indirect.o
-$(MODULE)-objs += vdev_indirect_births.o
-$(MODULE)-objs += vdev_indirect_mapping.o
-$(MODULE)-objs += vdev_initialize.o
-$(MODULE)-objs += vdev_label.o
-$(MODULE)-objs += vdev_mirror.o
-$(MODULE)-objs += vdev_missing.o
-$(MODULE)-objs += vdev_queue.o
-$(MODULE)-objs += vdev_raidz.o
-$(MODULE)-objs += vdev_raidz_math.o
-$(MODULE)-objs += vdev_raidz_math_scalar.o
-$(MODULE)-objs += vdev_rebuild.o
-$(MODULE)-objs += vdev_removal.o
-$(MODULE)-objs += vdev_root.o
-$(MODULE)-objs += vdev_trim.o
-$(MODULE)-objs += zap.o
-$(MODULE)-objs += zap_leaf.o
-$(MODULE)-objs += zap_micro.o
-$(MODULE)-objs += zcp.o
-$(MODULE)-objs += zcp_get.o
-$(MODULE)-objs += zcp_global.o
-$(MODULE)-objs += zcp_iter.o
-$(MODULE)-objs += zcp_set.o
-$(MODULE)-objs += zcp_synctask.o
-$(MODULE)-objs += zfeature.o
-$(MODULE)-objs += zfs_byteswap.o
-$(MODULE)-objs += zfs_fm.o
-$(MODULE)-objs += zfs_fuid.o
-$(MODULE)-objs += zfs_ioctl.o
-$(MODULE)-objs += zfs_log.o
-$(MODULE)-objs += zfs_onexit.o
-$(MODULE)-objs += zfs_quota.o
-$(MODULE)-objs += zfs_ratelimit.o
-$(MODULE)-objs += zfs_replay.o
-$(MODULE)-objs += zfs_rlock.o
-$(MODULE)-objs += zfs_sa.o
-$(MODULE)-objs += zfs_vnops.o
-$(MODULE)-objs += zil.o
-$(MODULE)-objs += zio.o
-$(MODULE)-objs += zio_checksum.o
-$(MODULE)-objs += zio_compress.o
-$(MODULE)-objs += zio_inject.o
-$(MODULE)-objs += zle.o
-$(MODULE)-objs += zrlock.o
-$(MODULE)-objs += zthr.o
-$(MODULE)-objs += zvol.o
-
-# Suppress incorrect warnings from versions of objtool which are not
-# aware of x86 EVEX prefix instructions used for AVX512.
-OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y
-OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y
-
-$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_sse2.o
-$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_ssse3.o
-$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx2.o
-$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512f.o
-$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o
-
-$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o
-$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o
-
-$(MODULE)-$(CONFIG_PPC) += vdev_raidz_math_powerpc_altivec.o
-$(MODULE)-$(CONFIG_PPC64) += vdev_raidz_math_powerpc_altivec.o
-
-ifeq ($(CONFIG_ALTIVEC),y)
-$(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec
-endif
-
-include $(mfdir)/../os/linux/zfs/Makefile
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
index 79e754c4abcb..af42670cc2c9 100644
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -9337,26 +9337,37 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
}
if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
- cabd = abd_alloc_for_io(asize, ismd);
- tmp = abd_borrow_buf(cabd, asize);
+ /*
+ * In some cases, we can wind up with size > asize, so
+ * we need to opt for the larger allocation option here.
+ *
+ * (We also need abd_return_buf_copy in all cases because
+ * it's an ASSERT() to modify the buffer before returning it
+ * with arc_return_buf(), and all the compressors
+ * write things before deciding to fail compression in nearly
+ * every case.)
+ */
+ cabd = abd_alloc_for_io(size, ismd);
+ tmp = abd_borrow_buf(cabd, size);
psize = zio_compress_data(compress, to_write, tmp, size,
hdr->b_complevel);
- if (psize >= size) {
- abd_return_buf(cabd, tmp, asize);
+ if (psize >= asize) {
+ psize = HDR_GET_PSIZE(hdr);
+ abd_return_buf_copy(cabd, tmp, size);
HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
to_write = cabd;
- abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
- if (size != asize)
- abd_zero_off(to_write, size, asize - size);
+ abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
+ if (psize != asize)
+ abd_zero_off(to_write, psize, asize - psize);
goto encrypt;
}
ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
if (psize < asize)
memset((char *)tmp + psize, 0, asize - psize);
psize = HDR_GET_PSIZE(hdr);
- abd_return_buf_copy(cabd, tmp, asize);
+ abd_return_buf_copy(cabd, tmp, size);
to_write = cabd;
}
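
The hunk above (upstream #13375, "Corrected edge case in uncompressed ARC->L2ARC handling") sizes the temporary compression buffer by the buffer's logical size (size) rather than the device-aligned asize, since size can exceed asize and the compressors may write a full size bytes before deciding compression failed. A minimal sketch of the sizing hazard, with hypothetical names rather than the ZFS ABD/zio_compress APIs:

#include <stdint.h>
#include <stdlib.h>

/*
 * Hypothetical illustration: the scratch buffer handed to a compressor
 * must hold at least "size" bytes, because compressors may write that
 * much before giving up on incompressible input.  Sizing it by the
 * (possibly smaller) device-aligned "asize" risks an overrun, which is
 * what the hunk above avoids.
 */
static void *
alloc_compress_scratch(uint64_t size, uint64_t asize)
{
	(void) asize;		/* not a safe upper bound when size > asize */
	return (calloc(1, size));
}
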
@@ -11045,20 +11056,20 @@ EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min,
- param_get_long, ZMOD_RW, "Min arc size");
+ param_get_long, ZMOD_RW, "Minimum ARC size in bytes");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max,
- param_get_long, ZMOD_RW, "Max arc size");
+ param_get_long, ZMOD_RW, "Maximum ARC size in bytes");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long,
- param_get_long, ZMOD_RW, "Metadata limit for arc size");
+ param_get_long, ZMOD_RW, "Metadata limit for ARC size in bytes");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent,
param_set_arc_long, param_get_long, ZMOD_RW,
- "Percent of arc size for arc meta limit");
+ "Percent of ARC size for ARC meta limit");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long,
- param_get_long, ZMOD_RW, "Min arc metadata");
+ param_get_long, ZMOD_RW, "Minimum ARC metadata size in bytes");
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW,
"Meta objects to scan for prune");
@@ -11070,16 +11081,16 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW,
"Meta reclaim strategy");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
- param_get_int, ZMOD_RW, "Seconds before growing arc size");
+ param_get_int, ZMOD_RW, "Seconds before growing ARC size");
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW,
"Disable arc_p adapt dampener");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
- param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)");
+ param_get_int, ZMOD_RW, "log2(fraction of ARC to reclaim)");
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
- "Percent of pagecache to reclaim arc to");
+ "Percent of pagecache to reclaim ARC to");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int,
param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p");
@@ -11088,7 +11099,7 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD,
"Target average block size");
ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW,
- "Disable compressed arc buffers");
+ "Disable compressed ARC buffers");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int,
param_get_int, ZMOD_RW, "Min life of prefetch block in ms");
@@ -11149,7 +11160,7 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long,
param_get_long, ZMOD_RW, "System free memory target size in bytes");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long,
- param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc");
+ param_get_long, ZMOD_RW, "Minimum bytes of dnodes in ARC");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
param_set_arc_long, param_get_long, ZMOD_RW,
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
index ee2470b38606..9a273b010fb1 100644
--- a/sys/contrib/openzfs/module/zfs/dbuf.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -339,18 +339,18 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
hv = dbuf_hash(os, obj, level, blkid);
idx = hv & h->hash_table_mask;
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_READER);
for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
if (DBUF_EQUAL(db, os, obj, level, blkid)) {
mutex_enter(&db->db_mtx);
if (db->db_state != DB_EVICTING) {
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ rw_exit(DBUF_HASH_RWLOCK(h, idx));
return (db);
}
mutex_exit(&db->db_mtx);
}
}
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ rw_exit(DBUF_HASH_RWLOCK(h, idx));
return (NULL);
}
@@ -393,13 +393,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
hv = dbuf_hash(os, obj, level, blkid);
idx = hv & h->hash_table_mask;
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER);
for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
dbf = dbf->db_hash_next, i++) {
if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
mutex_enter(&dbf->db_mtx);
if (dbf->db_state != DB_EVICTING) {
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ rw_exit(DBUF_HASH_RWLOCK(h, idx));
return (dbf);
}
mutex_exit(&dbf->db_mtx);
@@ -417,7 +417,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
mutex_enter(&db->db_mtx);
db->db_hash_next = h->hash_table[idx];
h->hash_table[idx] = db;
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ rw_exit(DBUF_HASH_RWLOCK(h, idx));
uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64);
DBUF_STAT_MAX(hash_elements_max, he);
@@ -474,13 +474,13 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
/*
* We mustn't hold db_mtx to maintain lock ordering:
- * DBUF_HASH_MUTEX > db_mtx.
+ * DBUF_HASH_RWLOCK > db_mtx.
*/
ASSERT(zfs_refcount_is_zero(&db->db_holds));
ASSERT(db->db_state == DB_EVICTING);
ASSERT(!MUTEX_HELD(&db->db_mtx));
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER);
dbp = &h->hash_table[idx];
while ((dbf = *dbp) != db) {
dbp = &dbf->db_hash_next;
@@ -491,7 +491,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
if (h->hash_table[idx] &&
h->hash_table[idx]->db_hash_next == NULL)
DBUF_STAT_BUMPDOWN(hash_chains);
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ rw_exit(DBUF_HASH_RWLOCK(h, idx));
atomic_dec_64(&dbuf_stats.hash_elements.value.ui64);
}
@@ -914,8 +914,8 @@ retry:
sizeof (dmu_buf_impl_t),
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+ for (i = 0; i < DBUF_RWLOCKS; i++)
+ rw_init(&h->hash_rwlocks[i], NULL, RW_DEFAULT, NULL);
dbuf_stats_init(h);
@@ -981,8 +981,8 @@ dbuf_fini(void)
dbuf_stats_destroy();
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_destroy(&h->hash_mutexes[i]);
+ for (i = 0; i < DBUF_RWLOCKS; i++)
+ rw_destroy(&h->hash_rwlocks[i]);
#if defined(_KERNEL)
/*
* Large allocations which do not require contiguous pages
@@ -3947,7 +3947,7 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
}
void
-dmu_buf_user_evict_wait()
+dmu_buf_user_evict_wait(void)
{
taskq_wait(dbu_evict_taskq);
}
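
The dbuf.c hunks above (and the matching dbuf_stats.c change below) come from upstream #13405, "Reduce dbuf_find() lock contention": the per-bucket hash mutexes become reader/writer locks, so concurrent dbuf_find() lookups on the same bucket no longer serialize, while insert and remove still take the lock exclusively. A minimal userland sketch of the same pattern using POSIX rwlocks (the names below are illustrative, not the ZFS ones, and dbuf lifetime handling is omitted):

#include <pthread.h>
#include <stdint.h>

#define	NBUCKETS	256

struct entry {
	uint64_t	key;
	struct entry	*next;
};

struct bucket {
	pthread_rwlock_t	lock;
	struct entry		*head;
};

static struct bucket table[NBUCKETS];

static void
ht_init(void)
{
	for (int i = 0; i < NBUCKETS; i++)
		(void) pthread_rwlock_init(&table[i].lock, NULL);
}

/*
 * Lookups take the bucket lock as readers, so they can proceed in
 * parallel.  The real dbuf_find() also grabs the dbuf's db_mtx before
 * dropping the bucket lock so the entry cannot be evicted under the
 * caller; that step is omitted here.
 */
static struct entry *
ht_find(uint64_t key)
{
	struct bucket *b = &table[key % NBUCKETS];
	struct entry *e;

	pthread_rwlock_rdlock(&b->lock);
	for (e = b->head; e != NULL; e = e->next) {
		if (e->key == key)
			break;
	}
	pthread_rwlock_unlock(&b->lock);
	return (e);
}

/* Inserts and removals still take the bucket lock exclusively. */
static void
ht_insert(struct entry *e)
{
	struct bucket *b = &table[e->key % NBUCKETS];

	pthread_rwlock_wrlock(&b->lock);
	e->next = b->head;
	b->head = e;
	pthread_rwlock_unlock(&b->lock);
}

The trade-off is the usual read-mostly one: cached lookups vastly outnumber inserts and evictions, so cheaper reader concurrency wins even though an rwlock operation costs slightly more than a mutex.
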
diff --git a/sys/contrib/openzfs/module/zfs/dbuf_stats.c b/sys/contrib/openzfs/module/zfs/dbuf_stats.c
index fa9a5f08060a..a42750ac8e90 100644
--- a/sys/contrib/openzfs/module/zfs/dbuf_stats.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf_stats.c
@@ -137,7 +137,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
if (size)
buf[0] = 0;
- mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
+ rw_enter(DBUF_HASH_RWLOCK(h, dsh->idx), RW_READER);
for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) {
/*
* Returning ENOMEM will cause the data and header functions
@@ -158,7 +158,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
mutex_exit(&db->db_mtx);
}
- mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
+ rw_exit(DBUF_HASH_RWLOCK(h, dsh->idx));
return (error);
}
diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c
index 461feeffb6a3..7d8b2c96bd74 100644
--- a/sys/contrib/openzfs/module/zfs/dmu.c
+++ b/sys/contrib/openzfs/module/zfs/dmu.c
@@ -86,7 +86,7 @@ static int zfs_dmu_offset_next_sync = 1;
* helps to limit the amount of memory that can be used by prefetching.
* Larger objects should be prefetched a bit at a time.
*/
-static int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
+int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
index e836d681e920..ca894c35253c 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
@@ -73,12 +73,19 @@
* The SPA supports block sizes up to 16MB. However, very large blocks
* can have an impact on i/o latency (e.g. tying up a spinning disk for
* ~300ms), and also potentially on the memory allocator. Therefore,
- * we do not allow the recordsize to be set larger than zfs_max_recordsize
- * (default 1MB). Larger blocks can be created by changing this tunable,
- * and pools with larger blocks can always be imported and used, regardless
- * of this setting.
+ * we did not allow the recordsize to be set larger than zfs_max_recordsize
+ * (former default: 1MB). Larger blocks could be created by changing this
+ * tunable, and pools with larger blocks could always be imported and used,
+ * regardless of this setting.
+ *
+ * We do, however, still limit it by default to 1M on x86_32, because Linux's
+ * 3/1 memory split doesn't leave much room for 16M chunks.
*/
-int zfs_max_recordsize = 1 * 1024 * 1024;
+#ifdef _ILP32
+int zfs_max_recordsize = 1 * 1024 * 1024;
+#else
+int zfs_max_recordsize = 16 * 1024 * 1024;
+#endif
static int zfs_allow_redacted_dataset_mount = 0;
#define SWITCH64(x, y) \
@@ -3708,6 +3715,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
dsl_dir_rele(odd, FTAG);
promote_rele(ddpa, FTAG);
+
+ /*
+ * Transfer common error blocks from old head to new head.
+ */
+ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ uint64_t old_head = origin_head->ds_object;
+ uint64_t new_head = hds->ds_object;
+ spa_swap_errlog(dp->dp_spa, new_head, old_head, tx);
+ }
}
/*
@@ -4924,13 +4940,38 @@ dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps,
ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa;
}
-#if defined(_LP64)
-#define RECORDSIZE_PERM ZMOD_RW
-#else
-/* Limited to 1M on 32-bit platforms due to lack of virtual address space */
-#define RECORDSIZE_PERM ZMOD_RD
-#endif
-ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM,
+/*
+ * Find and return (in *oldest_dsobj) the oldest snapshot of the dsobj
+ * dataset whose birth time is >= min_txg.
+ */
+int
+dsl_dataset_oldest_snapshot(spa_t *spa, uint64_t head_ds, uint64_t min_txg,
+ uint64_t *oldest_dsobj)
+{
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+
+ while (prev_obj != 0 && min_txg < prev_obj_txg) {
+ dsl_dataset_rele(ds, FTAG);
+ if ((error = dsl_dataset_hold_obj(dp, prev_obj,
+ FTAG, &ds)) != 0)
+ return (error);
+ prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ }
+ *oldest_dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, ZMOD_RW,
"Max allowed record size");
ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW,
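
Besides raising the default zfs_max_recordsize to 16M on 64-bit platforms, the dsl_dataset.c hunks above add dsl_dataset_oldest_snapshot(), used by the new head_errlog code: starting from the head dataset it steps to the previous snapshot for as long as that snapshot's birth txg is still strictly newer than min_txg, and returns the object it stops on. A self-contained sketch of that walk over an in-memory chain (illustrative types, not the DSL structures):

#include <stdint.h>
#include <stddef.h>

/* Illustrative stand-in for a dataset and its previous-snapshot link. */
struct snap {
	uint64_t	obj;		/* dataset object number */
	uint64_t	birth_txg;	/* txg this snapshot was created in */
	struct snap	*prev;		/* next-older snapshot, NULL at the end */
};

/*
 * Mirrors the loop in dsl_dataset_oldest_snapshot(): step to the
 * previous snapshot while it is strictly newer than min_txg, and
 * return the node the walk stops on.
 */
static uint64_t
oldest_snapshot_obj(const struct snap *head, uint64_t min_txg)
{
	const struct snap *cur = head;

	while (cur->prev != NULL && min_txg < cur->prev->birth_txg)
		cur = cur->prev;

	return (cur->obj);
}
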
diff --git a/sys/contrib/openzfs/module/zfs/dsl_destroy.c b/sys/contrib/openzfs/module/zfs/dsl_destroy.c
index b32929b3320c..7dddd8eed5e9 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_destroy.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_destroy.c
@@ -1153,6 +1153,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
dsl_dataset_rele(prev, FTAG);
}
+ /* Delete errlog. */
+ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG))
+ spa_delete_dataset_errlog(dp->dp_spa, ds->ds_object, tx);
}
void
diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c
index 7ed83b305db7..ab32bfec1310 100644
--- a/sys/contrib/openzfs/module/zfs/metaslab.c
+++ b/sys/contrib/openzfs/module/zfs/metaslab.c
@@ -48,10 +48,10 @@
/*
* Metaslab granularity, in bytes. This is roughly similar to what would be
* referred to as the "stripe size" in traditional RAID arrays. In normal
- * operation, we will try to write this amount of data to a top-level vdev
- * before moving on to the next one.
+ * operation, we will try to write this amount of data to each disk before
+ * moving on to the next top-level vdev.
*/
-static unsigned long metaslab_aliquot = 512 << 10;
+static unsigned long metaslab_aliquot = 1024 * 1024;
/*
* For testing, make some blocks above a certain size be gang blocks.
@@ -899,7 +899,8 @@ metaslab_group_activate(metaslab_group_t *mg)
if (++mg->mg_activation_count <= 0)
return;
- mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+ mg->mg_aliquot = metaslab_aliquot * MAX(1,
+ vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd));
metaslab_group_alloc_update(mg);
if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
@@ -2750,7 +2751,8 @@ metaslab_fini_flush_data(metaslab_t *msp)
mutex_exit(&spa->spa_flushed_ms_lock);
spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
- spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
+ spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp),
+ metaslab_unflushed_dirty(msp));
}
uint64_t
@@ -3728,50 +3730,45 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
metaslab_flush_update(msp, tx);
}
-/*
- * Called when the metaslab has been flushed (its own spacemap now reflects
- * all the contents of the pool-wide spacemap log). Updates the metaslab's
- * metadata and any pool-wide related log space map data (e.g. summary,
- * obsolete logs, etc..) to reflect that.
- */
static void
-metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
+metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
{
- metaslab_group_t *mg = msp->ms_group;
- spa_t *spa = mg->mg_vd->vdev_spa;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- ASSERT3U(spa_sync_pass(spa), ==, 1);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ ASSERT(spa_syncing_log_sm(spa) != NULL);
+ ASSERT(msp->ms_sm != NULL);
ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
- /*
- * Just because a metaslab got flushed, that doesn't mean that
- * it will pass through metaslab_sync_done(). Thus, make sure to
- * update ms_synced_length here in case it doesn't.
- */
- msp->ms_synced_length = space_map_length(msp->ms_sm);
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+ metaslab_set_unflushed_dirty(msp, B_TRUE);
+ avl_add(&spa->spa_metaslabs_by_flushed, msp);
+ mutex_exit(&spa->spa_flushed_ms_lock);
- /*
- * We may end up here from metaslab_condense() without the
- * feature being active. In that case this is a no-op.
- */
- if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
- return;
+ spa_log_sm_increment_current_mscount(spa);
+ spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
+}
+void
+metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
ASSERT(spa_syncing_log_sm(spa) != NULL);
ASSERT(msp->ms_sm != NULL);
ASSERT(metaslab_unflushed_txg(msp) != 0);
ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
/* update metaslab's position in our flushing tree */
uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
+ boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp);
mutex_enter(&spa->spa_flushed_ms_lock);
avl_remove(&spa->spa_metaslabs_by_flushed, msp);
metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+ metaslab_set_unflushed_dirty(msp, dirty);
avl_add(&spa->spa_metaslabs_by_flushed, msp);
mutex_exit(&spa->spa_flushed_ms_lock);
@@ -3779,17 +3776,47 @@ metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
spa_log_sm_increment_current_mscount(spa);
+ /* update log space map summary */
+ spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg,
+ ms_prev_flushed_dirty);
+ spa_log_summary_add_flushed_metaslab(spa, dirty);
+
/* cleanup obsolete logs if any */
- uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
spa_cleanup_old_sm_logs(spa, tx);
- uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
- VERIFY3U(log_blocks_after, <=, log_blocks_before);
+}
- /* update log space map summary */
- uint64_t blocks_gone = log_blocks_before - log_blocks_after;
- spa_log_summary_add_flushed_metaslab(spa);
- spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
- spa_log_summary_decrement_blkcount(spa, blocks_gone);
+/*
+ * Called when the metaslab has been flushed (its own spacemap now reflects
+ * all the contents of the pool-wide spacemap log). Updates the metaslab's
+ * metadata and any pool-wide related log space map data (e.g. summary,
+ * obsolete logs, etc..) to reflect that.
+ */
+static void
+metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ spa_t *spa = mg->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+
+ /*
+ * Just because a metaslab got flushed, that doesn't mean that
+ * it will pass through metaslab_sync_done(). Thus, make sure to
+ * update ms_synced_length here in case it doesn't.
+ */
+ msp->ms_synced_length = space_map_length(msp->ms_sm);
+
+ /*
+ * We may end up here from metaslab_condense() without the
+ * feature being active. In that case this is a no-op.
+ */
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) ||
+ metaslab_unflushed_txg(msp) == 0)
+ return;
+
+ metaslab_unflushed_bump(msp, tx, B_FALSE);
}
boolean_t
@@ -4005,23 +4032,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
ASSERT0(metaslab_allocated_space(msp));
}
- if (metaslab_unflushed_txg(msp) == 0 &&
- spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
- ASSERT(spa_syncing_log_sm(spa) != NULL);
-
- metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
- spa_log_sm_increment_current_mscount(spa);
- spa_log_summary_add_flushed_metaslab(spa);
-
- ASSERT(msp->ms_sm != NULL);
- mutex_enter(&spa->spa_flushed_ms_lock);
- avl_add(&spa->spa_metaslabs_by_flushed, msp);
- mutex_exit(&spa->spa_flushed_ms_lock);
-
- ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
- ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
- }
-
if (!range_tree_is_empty(msp->ms_checkpointing) &&
vd->vdev_checkpoint_sm == NULL) {
ASSERT(spa_has_checkpoint(spa));
@@ -4069,6 +4079,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_t *log_sm = spa_syncing_log_sm(spa);
if (log_sm != NULL) {
ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
+ if (metaslab_unflushed_txg(msp) == 0)
+ metaslab_unflushed_add(msp, tx);
+ else if (!metaslab_unflushed_dirty(msp))
+ metaslab_unflushed_bump(msp, tx, B_TRUE);
space_map_write(log_sm, alloctree, SM_ALLOC,
vd->vdev_id, tx);
@@ -6131,6 +6145,12 @@ metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
mutex_exit(&mg->mg_ms_disabled_lock);
}
+void
+metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
+{
+ ms->ms_unflushed_dirty = dirty;
+}
+
static void
metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
{
@@ -6167,15 +6187,16 @@ metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
void
metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
{
- spa_t *spa = ms->ms_group->mg_vd->vdev_spa;
-
- if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
- return;
-
ms->ms_unflushed_txg = txg;
metaslab_update_ondisk_flush_data(ms, tx);
}
+boolean_t
+metaslab_unflushed_dirty(metaslab_t *ms)
+{
+ return (ms->ms_unflushed_dirty);
+}
+
uint64_t
metaslab_unflushed_txg(metaslab_t *ms)
{
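
The metaslab.c hunks above fold in two upstream changes. For #13388 ("Improve mg_aliquot math"), metaslab_aliquot is now expressed per disk (1 MiB) and mg_aliquot scales with the number of data disks in the top-level vdev, vdev_get_ndisks() minus vdev_get_nparity(), rather than with the number of child vdevs. A small worked example of the resulting rotor granularity (illustrative helper, not the ZFS function):

#include <stdio.h>
#include <stdint.h>

static const uint64_t metaslab_aliquot = 1024 * 1024;	/* 1 MiB per disk */

/* Roughly how much is written to one top-level vdev before rotoring on. */
static uint64_t
mg_aliquot_for(uint64_t ndisks, uint64_t nparity)
{
	uint64_t data_disks = (ndisks > nparity) ? ndisks - nparity : 1;

	return (metaslab_aliquot * data_disks);
}

int
main(void)
{
	/* a 10-wide raidz2 top-level vdev: 8 data disks -> 8 MiB */
	printf("%llu\n", (unsigned long long)mg_aliquot_for(10, 2));
	/* a 2-way mirror: no parity disks -> 2 MiB */
	printf("%llu\n", (unsigned long long)mg_aliquot_for(2, 0));
	return (0);
}

The remaining metaslab hunks, which introduce the metaslab_unflushed_dirty() tracking, belong to #12789 ("Improve log spacemap load time") and let the log space map summary be maintained per metaslab instead of being recomputed from block counts.
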
diff --git a/sys/contrib/openzfs/module/zfs/sa.c b/sys/contrib/openzfs/module/zfs/sa.c
index 2b6776581a47..db8c2b831f1d 100644
--- a/sys/contrib/openzfs/module/zfs/sa.c
+++ b/sys/contrib/openzfs/module/zfs/sa.c
@@ -1068,8 +1068,8 @@ sa_setup(objset_t *os, uint64_t sa_obj, const sa_attr_reg_t *reg_attrs,
za.za_num_integers);
break;
}
- VERIFY(ddi_strtoull(za.za_name, NULL, 10,
- (unsigned long long *)&lot_num) == 0);
+ VERIFY0(ddi_strtoull(za.za_name, NULL, 10,
+ (unsigned long long *)&lot_num));
(void) sa_add_layout_entry(os, lot_attrs,
za.za_num_integers, lot_num,
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
index e69cb5527be8..01114dedef48 100644
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -4355,7 +4355,7 @@ spa_ld_load_vdev_metadata(spa_t *spa)
error = spa_ld_log_spacemaps(spa);
if (error != 0) {
- spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]",
+ spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]",
error);
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
}
diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c
index c6b28ea7d1b8..9e5d1de63c0b 100644
--- a/sys/contrib/openzfs/module/zfs/spa_errlog.c
+++ b/sys/contrib/openzfs/module/zfs/spa_errlog.c
@@ -20,7 +20,8 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014, Delphix. All rights reserved.
+ * Copyright (c) 2021, George Amanakis. All rights reserved.
*/
/*
@@ -43,6 +44,16 @@
* calculation when the data is requested, storing the result so future queries
* will be faster.
*
+ * If the head_errlog feature is enabled, a different on-disk format is used.
+ * The error log of each head dataset is stored separately in the zap object
+ * and keyed by the head id. This enables listing every dataset affected in
+ * userland. In order to be able to track whether an error block has been
+ * modified or added to snapshots since it was marked as an error, a new tuple
+ * is introduced: zbookmark_err_phys_t. It allows the storage of the birth
+ * transaction group of an error block on-disk. The birth transaction group is
+ * used by check_filesystem() to assess whether this block was freed,
+ * re-written or added to a snapshot since its marking as an error.
+ *
* This log is then shipped into an nvlist where the key is the dataset name and
* the value is the object name. Userland is then responsible for uniquifying
* this list and displaying it to the user.
@@ -53,7 +64,17 @@
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/zio.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_objset.h>
+#include <sys/dbuf.h>
+/*
+ * spa_upgrade_errlog_limit : A zfs module parameter that controls the number
+ * of on-disk error log entries that will be converted to the new
+ * format when enabling head_errlog. Defaults to 0 which converts
+ * all log entries.
+ */
+static uint32_t spa_upgrade_errlog_limit = 0;
/*
* Convert a bookmark to a string.
@@ -67,9 +88,35 @@ bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
}
/*
- * Convert a string to a bookmark
+ * Convert an err_phys to a string.
+ */
+static void
+errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len)
+{
+ (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+ (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level,
+ (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth);
+}
+
+/*
+ * Convert a string to a err_phys.
+ */
+static void
+name_to_errphys(char *buf, zbookmark_err_phys_t *zep)
+{
+ zep->zb_object = zfs_strtonum(buf, &buf);
+ ASSERT(*buf == ':');
+ zep->zb_level = (int)zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zep->zb_blkid = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zep->zb_birth = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == '\0');
+}
+
+/*
+ * Convert a string to a bookmark.
*/
-#ifdef _KERNEL
static void
name_to_bookmark(char *buf, zbookmark_phys_t *zb)
{
@@ -82,8 +129,74 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb)
zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
ASSERT(*buf == '\0');
}
+
+#ifdef _KERNEL
+static void
+zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb)
+{
+ zb->zb_objset = dataset;
+ zb->zb_object = zep->zb_object;
+ zb->zb_level = zep->zb_level;
+ zb->zb_blkid = zep->zb_blkid;
+}
#endif
+static void
+name_to_object(char *buf, uint64_t *obj)
+{
+ *obj = zfs_strtonum(buf, &buf);
+ ASSERT(*buf == '\0');
+}
+
+static int
+get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
+ uint64_t *head_dataset_id)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_dataset_t *ds;
+ objset_t *os;
+
+ dsl_pool_config_enter(dp, FTAG);
+ int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_config_exit(dp, FTAG);
+ return (error);
+ }
+ ASSERT(head_dataset_id);
+ *head_dataset_id = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
+
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ return (error);
+ }
+
+ dnode_t *dn;
+ blkptr_t bp;
+
+ error = dnode_hold(os, zep->zb_object, FTAG, &dn);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ return (error);
+ }
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
+ NULL);
+
+ if (error == 0 && BP_IS_HOLE(&bp))
+ error = SET_ERROR(ENOENT);
+
+ zep->zb_birth = bp.blk_birth;
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ return (error);
+}
+
/*
* Log an uncorrectable error to the persistent error log. We add it to the
* spa's list of pending errors. The changes are actually synced out to disk
@@ -128,6 +241,276 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
mutex_exit(&spa->spa_errlist_lock);
}
+#ifdef _KERNEL
+static int
+find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
+ uint64_t *birth_txg)
+{
+ objset_t *os;
+ int error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ return (error);
+
+ dnode_t *dn;
+ blkptr_t bp;
+
+ error = dnode_hold(os, zep->zb_object, FTAG, &dn);
+ if (error != 0)
+ return (error);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
+ NULL);
+
+ if (error == 0 && BP_IS_HOLE(&bp))
+ error = SET_ERROR(ENOENT);
+
+ *birth_txg = bp.blk_birth;
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ return (error);
+}
+
+/*
+ * This function serves a double role. If only_count is true, it returns
+ * (in *count) how many times an error block belonging to this filesystem is
+ * referenced by snapshots or clones. If only_count is false, each time the
+ * error block is referenced by a snapshot or clone, it fills the userspace
+ * array at uaddr with the bookmarks of the error blocks. The array is filled
+ * from the back and *count is modified to be the number of unused entries at
+ * the beginning of the array.
+ */
+static int
+check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
+ uint64_t *count, void *uaddr, boolean_t only_count)
+{
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ uint64_t latest_txg;
+ uint64_t txg_to_consider = spa->spa_syncing_txg;
+ boolean_t check_snapshot = B_TRUE;
+ error = find_birth_txg(ds, zep, &latest_txg);
+ if (error == 0) {
+ if (zep->zb_birth == latest_txg) {
+ /* Block neither free nor rewritten. */
+ if (!only_count) {
+ zbookmark_phys_t zb;
+ zep_to_zb(head_ds, zep, &zb);
+ if (copyout(&zb, (char *)uaddr + (*count - 1)
+ * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EFAULT));
+ }
+ (*count)--;
+ } else {
+ (*count)++;
+ }
+ check_snapshot = B_FALSE;
+ } else {
+ ASSERT3U(zep->zb_birth, <, latest_txg);
+ txg_to_consider = latest_txg;
+ }
+ }
+
+ /* How many snapshots reference this block. */
+ uint64_t snap_count;
+ error = zap_count(spa->spa_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (snap_count == 0) {
+ /* File system has no snapshot. */
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ uint64_t *snap_obj_array = kmem_alloc(snap_count * sizeof (uint64_t),
+ KM_SLEEP);
+
+ int aff_snap_count = 0;
+ uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+
+ /* Check only snapshots created from this file system. */
+ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
+ snap_obj_txg <= txg_to_consider) {
+
+ dsl_dataset_rele(ds, FTAG);
+ error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds);
+ if (error != 0)
+ goto out;
+
+ if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds)
+ break;
+
+ boolean_t affected = B_TRUE;
+ if (check_snapshot) {
+ uint64_t blk_txg;
+ error = find_birth_txg(ds, zep, &blk_txg);
+ affected = (error == 0 && zep->zb_birth == blk_txg);
+ }
+
+ if (affected) {
+ snap_obj_array[aff_snap_count] = snap_obj;
+ aff_snap_count++;
+
+ if (!only_count) {
+ zbookmark_phys_t zb;
+ zep_to_zb(snap_obj, zep, &zb);
+ if (copyout(&zb, (char *)uaddr + (*count - 1) *
+ sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ (*count)--;
+ } else {
+ (*count)++;
+ }
+
+ /*
+ * Only clones whose origins were affected could also
+ * have affected snapshots.
+ */
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ error = check_filesystem(spa,
+ za.za_first_integer, zep,
+ count, uaddr, only_count);
+
+ if (error != 0) {
+ zap_cursor_fini(&zc);
+ goto out;
+ }
+ }
+ zap_cursor_fini(&zc);
+ }
+ snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ }
+ dsl_dataset_rele(ds, FTAG);
+
+out:
+ kmem_free(snap_obj_array, sizeof (*snap_obj_array));
+ return (error);
+}
+
+static int
+find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
+ uint64_t *top_affected_fs)
+{
+ uint64_t oldest_dsobj;
+ int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth,
+ &oldest_dsobj);
+ if (error != 0)
+ return (error);
+
+ dsl_dataset_t *ds;
+ error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj,
+ FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ *top_affected_fs =
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static int
+process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
+ uint64_t *count, void *uaddr, boolean_t only_count)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_pool_config_enter(dp, FTAG);
+ uint64_t top_affected_fs;
+
+ int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
+ if (error == 0)
+ error = check_filesystem(spa, top_affected_fs, zep, count,
+ uaddr, only_count);
+
+ dsl_pool_config_exit(dp, FTAG);
+ return (error);
+}
+
+static uint64_t
+get_errlog_size(spa_t *spa, uint64_t spa_err_obj)
+{
+ if (spa_err_obj == 0)
+ return (0);
+ uint64_t total = 0;
+
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+
+ zap_cursor_t head_ds_cursor;
+ zap_attribute_t head_ds_attr;
+ zbookmark_err_phys_t head_ds_block;
+
+ uint64_t head_ds;
+ name_to_object(za.za_name, &head_ds);
+
+ for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset,
+ za.za_first_integer); zap_cursor_retrieve(&head_ds_cursor,
+ &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) {
+
+ name_to_errphys(head_ds_attr.za_name, &head_ds_block);
+ (void) process_error_block(spa, head_ds, &head_ds_block,
+ &total, NULL, B_TRUE);
+ }
+ zap_cursor_fini(&head_ds_cursor);
+ }
+ zap_cursor_fini(&zc);
+ return (total);
+}
+
+static uint64_t
+get_errlist_size(spa_t *spa, avl_tree_t *tree)
+{
+ if (avl_numnodes(tree) == 0)
+ return (0);
+ uint64_t total = 0;
+
+ spa_error_entry_t *se;
+ for (se = avl_first(tree); se != NULL; se = AVL_NEXT(tree, se)) {
+ zbookmark_err_phys_t zep;
+ zep.zb_object = se->se_bookmark.zb_object;
+ zep.zb_level = se->se_bookmark.zb_level;
+ zep.zb_blkid = se->se_bookmark.zb_blkid;
+
+ /*
+ * If we cannot find out the head dataset and birth txg of
+ * the present error block, we opt not to error out. In the
+ * next pool sync this information will be retrieved by
+ * sync_error_list() and written to the on-disk error log.
+ */
+ uint64_t head_ds_obj;
+ if (get_head_and_birth_txg(spa, &zep,
+ se->se_bookmark.zb_objset, &head_ds_obj) == 0)
+ (void) process_error_block(spa, head_ds_obj, &zep,
+ &total, NULL, B_TRUE);
+ }
+ return (total);
+}
+#endif
+
/*
* Return the number of errors currently in the error log. This is actually the
* sum of both the last log and the current log, since we don't know the union
@@ -136,83 +519,284 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
uint64_t
spa_get_errlog_size(spa_t *spa)
{
- uint64_t total = 0, count;
+ uint64_t total = 0;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ mutex_enter(&spa->spa_errlog_lock);
+ uint64_t count;
+ if (spa->spa_errlog_scrub != 0 &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+ &count) == 0)
+ total += count;
+
+ if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+ &count) == 0)
+ total += count;
+ mutex_exit(&spa->spa_errlog_lock);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ total += avl_numnodes(&spa->spa_errlist_last);
+ total += avl_numnodes(&spa->spa_errlist_scrub);
+ mutex_exit(&spa->spa_errlist_lock);
+ } else {
+#ifdef _KERNEL
+ mutex_enter(&spa->spa_errlog_lock);
+ total += get_errlog_size(spa, spa->spa_errlog_last);
+ total += get_errlog_size(spa, spa->spa_errlog_scrub);
+ mutex_exit(&spa->spa_errlog_lock);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ total += get_errlist_size(spa, &spa->spa_errlist_last);
+ total += get_errlist_size(spa, &spa->spa_errlist_scrub);
+ mutex_exit(&spa->spa_errlist_lock);
+#endif
+ }
+ return (total);
+}
- mutex_enter(&spa->spa_errlog_lock);
- if (spa->spa_errlog_scrub != 0 &&
- zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
- &count) == 0)
- total += count;
-
- if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
- zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
- &count) == 0)
- total += count;
- mutex_exit(&spa->spa_errlog_lock);
+/*
+ * This function sweeps through an on-disk error log and stores all bookmarks
+ * as error bookmarks in a new ZAP object. At the end we discard the old one,
+ * and spa_update_errlog() will set the spa's on-disk error log to new ZAP
+ * object.
+ */
+static void
+sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
+ dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zbookmark_phys_t zb;
+ uint64_t count;
- mutex_enter(&spa->spa_errlist_lock);
- total += avl_numnodes(&spa->spa_errlist_last);
- total += avl_numnodes(&spa->spa_errlist_scrub);
- mutex_exit(&spa->spa_errlist_lock);
+ *newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
+ DMU_OT_NONE, 0, tx);
- return (total);
+ /*
+ * If we cannot perform the upgrade we should clear the old on-disk
+ * error logs.
+ */
+ if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) {
+ VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
+ return;
+ }
+
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ if (spa_upgrade_errlog_limit != 0 &&
+ zc.zc_cd == spa_upgrade_errlog_limit)
+ break;
+
+ name_to_bookmark(za.za_name, &zb);
+
+ zbookmark_err_phys_t zep;
+ zep.zb_object = zb.zb_object;
+ zep.zb_level = zb.zb_level;
+ zep.zb_blkid = zb.zb_blkid;
+
+ /*
+ * We cannot use get_head_and_birth_txg() because it will
+ * acquire the pool config lock, which we already have. In case
+ * of an error we simply continue.
+ */
+ uint64_t head_dataset_obj;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_dataset_t *ds;
+ objset_t *os;
+
+ int error = dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds);
+ if (error != 0)
+ continue;
+
+ head_dataset_obj =
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
+
+ /*
+ * The objset and the dnode are required for getting the block
+ * pointer, which is used to determine if BP_IS_HOLE(). If
+ * getting the objset or the dnode fails, do not create a
+ * zap entry (presuming we know the dataset) as this may create
+ * spurious errors that we cannot ever resolve. If an error is
+ * truly persistent, it should re-appear after a scan.
+ */
+ if (dmu_objset_from_ds(ds, &os) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ continue;
+ }
+
+ dnode_t *dn;
+ blkptr_t bp;
+
+ if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ continue;
+ }
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp,
+ NULL, NULL);
+
+ zep.zb_birth = bp.blk_birth;
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+
+ if (error != 0 || BP_IS_HOLE(&bp))
+ continue;
+
+ uint64_t err_obj;
+ error = zap_lookup_int_key(spa->spa_meta_objset, *newobj,
+ head_dataset_obj, &err_obj);
+
+ if (error == ENOENT) {
+ err_obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
+
+ (void) zap_update_int_key(spa->spa_meta_objset,
+ *newobj, head_dataset_obj, err_obj, tx);
+ }
+
+ char buf[64];
+ char *name = "";
+ errphys_to_name(&zep, buf, sizeof (buf));
+
+ (void) zap_update(spa->spa_meta_objset, err_obj,
+ buf, 1, strlen(name) + 1, name, tx);
+ }
+ zap_cursor_fini(&zc);
+
+ VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
+}
+
+void
+spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx)
+{
+ uint64_t newobj = 0;
+
+ mutex_enter(&spa->spa_errlog_lock);
+ if (spa->spa_errlog_last != 0) {
+ sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx);
+ spa->spa_errlog_last = newobj;
+ }
+
+ if (spa->spa_errlog_scrub != 0) {
+ sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx);
+ spa->spa_errlog_scrub = newobj;
+ }
+ mutex_exit(&spa->spa_errlog_lock);
}
#ifdef _KERNEL
+/*
+ * If an error block is shared by two datasets it will be counted twice. For
+ * detailed message see spa_get_errlog_size() above.
+ */
static int
-process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
{
zap_cursor_t zc;
zap_attribute_t za;
- zbookmark_phys_t zb;
if (obj == 0)
return (0);
- for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ if (*count == 0) {
+ zap_cursor_fini(&zc);
+ return (SET_ERROR(ENOMEM));
+ }
+
+ zbookmark_phys_t zb;
+ name_to_bookmark(za.za_name, &zb);
+
+ if (copyout(&zb, (char *)uaddr +
+ (*count - 1) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0) {
+ zap_cursor_fini(&zc);
+ return (SET_ERROR(EFAULT));
+ }
+ *count -= 1;
- if (*count == 0) {
- zap_cursor_fini(&zc);
- return (SET_ERROR(ENOMEM));
}
+ zap_cursor_fini(&zc);
+ return (0);
+ }
- name_to_bookmark(za.za_name, &zb);
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
- if (copyout(&zb, (char *)addr +
- (*count - 1) * sizeof (zbookmark_phys_t),
- sizeof (zbookmark_phys_t)) != 0) {
- zap_cursor_fini(&zc);
- return (SET_ERROR(EFAULT));
+ zap_cursor_t head_ds_cursor;
+ zap_attribute_t head_ds_attr;
+
+ uint64_t head_ds_err_obj = za.za_first_integer;
+ uint64_t head_ds;
+ name_to_object(za.za_name, &head_ds);
+ for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset,
+ head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor,
+ &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) {
+
+ zbookmark_err_phys_t head_ds_block;
+ name_to_errphys(head_ds_attr.za_name, &head_ds_block);
+ int error = process_error_block(spa, head_ds,
+ &head_ds_block, count, uaddr, B_FALSE);
+
+ if (error != 0) {
+ zap_cursor_fini(&head_ds_cursor);
+ zap_cursor_fini(&zc);
+ return (error);
+ }
}
-
- *count -= 1;
+ zap_cursor_fini(&head_ds_cursor);
}
-
zap_cursor_fini(&zc);
-
return (0);
}
static int
-process_error_list(avl_tree_t *list, void *addr, size_t *count)
+process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count)
{
spa_error_entry_t *se;
- for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ for (se = avl_first(list); se != NULL;
+ se = AVL_NEXT(list, se)) {
- if (*count == 0)
- return (SET_ERROR(ENOMEM));
+ if (*count == 0)
+ return (SET_ERROR(ENOMEM));
- if (copyout(&se->se_bookmark, (char *)addr +
- (*count - 1) * sizeof (zbookmark_phys_t),
- sizeof (zbookmark_phys_t)) != 0)
- return (SET_ERROR(EFAULT));
+ if (copyout(&se->se_bookmark, (char *)uaddr +
+ (*count - 1) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0)
+ return (SET_ERROR(EFAULT));
- *count -= 1;
+ *count -= 1;
+ }
+ return (0);
}
+ for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+ zbookmark_err_phys_t zep;
+ zep.zb_object = se->se_bookmark.zb_object;
+ zep.zb_level = se->se_bookmark.zb_level;
+ zep.zb_blkid = se->se_bookmark.zb_blkid;
+
+ uint64_t head_ds_obj;
+ int error = get_head_and_birth_txg(spa, &zep,
+ se->se_bookmark.zb_objset, &head_ds_obj);
+ if (error != 0)
+ return (error);
+
+ error = process_error_block(spa, head_ds_obj, &zep, count,
+ uaddr, B_FALSE);
+ if (error != 0)
+ return (error);
+ }
return (0);
}
#endif
@@ -229,7 +813,7 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count)
* the error list lock when we are finished.
*/
int
-spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
{
int ret = 0;
@@ -244,10 +828,10 @@ spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
mutex_enter(&spa->spa_errlist_lock);
if (!ret)
- ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+ ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr,
count);
if (!ret)
- ret = process_error_list(&spa->spa_errlist_last, uaddr,
+ ret = process_error_list(spa, &spa->spa_errlist_last, uaddr,
count);
mutex_exit(&spa->spa_errlist_lock);
@@ -299,35 +883,91 @@ spa_errlog_drain(spa_t *spa)
/*
* Process a list of errors into the current on-disk log.
*/
-static void
+void
sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
{
spa_error_entry_t *se;
char buf[64];
void *cookie;
- if (avl_numnodes(t) != 0) {
- /* create log if necessary */
- if (*obj == 0)
- *obj = zap_create(spa->spa_meta_objset,
- DMU_OT_ERROR_LOG, DMU_OT_NONE,
- 0, tx);
+ if (avl_numnodes(t) == 0)
+ return;
+
+ /* create log if necessary */
+ if (*obj == 0)
+ *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
+ DMU_OT_NONE, 0, tx);
- /* add errors to the current log */
+ /* add errors to the current log */
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
char *name = se->se_name ? se->se_name : "";
bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
+ (void) zap_update(spa->spa_meta_objset, *obj, buf, 1,
+ strlen(name) + 1, name, tx);
+ }
+ } else {
+ for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
+ char *name = se->se_name ? se->se_name : "";
+
+ zbookmark_err_phys_t zep;
+ zep.zb_object = se->se_bookmark.zb_object;
+ zep.zb_level = se->se_bookmark.zb_level;
+ zep.zb_blkid = se->se_bookmark.zb_blkid;
+
+ /*
+ * If we cannot find out the head dataset and birth txg
+ * of the present error block, we simply continue.
+ * Reinserting that error block to the error lists,
+ * even if we are not syncing the final txg, results
+ * in duplicate posting of errors.
+ */
+ uint64_t head_dataset_obj;
+ int error = get_head_and_birth_txg(spa, &zep,
+ se->se_bookmark.zb_objset, &head_dataset_obj);
+ if (error != 0)
+ continue;
+
+ uint64_t err_obj;
+ error = zap_lookup_int_key(spa->spa_meta_objset,
+ *obj, head_dataset_obj, &err_obj);
+
+ if (error == ENOENT) {
+ err_obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
+
+ (void) zap_update_int_key(spa->spa_meta_objset,
+ *obj, head_dataset_obj, err_obj, tx);
+ }
+ errphys_to_name(&zep, buf, sizeof (buf));
+
(void) zap_update(spa->spa_meta_objset,
- *obj, buf, 1, strlen(name) + 1, name, tx);
+ err_obj, buf, 1, strlen(name) + 1, name, tx);
}
+ }
+ /* purge the error list */
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+}
- /* purge the error list */
- cookie = NULL;
- while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(se, sizeof (spa_error_entry_t));
+static void
+delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ VERIFY0(dmu_object_free(spa->spa_meta_objset,
+ za.za_first_integer, tx));
+ }
+ zap_cursor_fini(&zc);
}
+ VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
}
/*
@@ -378,8 +1018,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
*/
if (scrub_finished) {
if (spa->spa_errlog_last != 0)
- VERIFY(dmu_object_free(spa->spa_meta_objset,
- spa->spa_errlog_last, tx) == 0);
+ delete_errlog(spa, spa->spa_errlog_last, tx);
spa->spa_errlog_last = spa->spa_errlog_scrub;
spa->spa_errlog_scrub = 0;
@@ -406,6 +1045,137 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
mutex_exit(&spa->spa_errlog_lock);
}
+static void
+delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds,
+ dmu_tx_t *tx)
+{
+ if (spa_err_obj == 0)
+ return;
+
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+ uint64_t head_ds;
+ name_to_object(za.za_name, &head_ds);
+ if (head_ds == ds) {
+ (void) zap_remove(spa->spa_meta_objset, spa_err_obj,
+ za.za_name, tx);
+ VERIFY0(dmu_object_free(spa->spa_meta_objset,
+ za.za_first_integer, tx));
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+}
+
+void
+spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx)
+{
+ mutex_enter(&spa->spa_errlog_lock);
+ delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx);
+ delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx);
+ mutex_exit(&spa->spa_errlog_lock);
+}
+
+static int
+find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head,
+ uint64_t *txg)
+{
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ int error = dsl_dataset_hold_obj(dp, old_head, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+
+ while (prev_obj != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ if ((error = dsl_dataset_hold_obj(dp, prev_obj,
+ FTAG, &ds)) == 0 &&
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head)
+ break;
+
+ if (error != 0)
+ return (error);
+
+ prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ }
+ dsl_dataset_rele(ds, FTAG);
+ ASSERT(prev_obj != 0);
+ *txg = prev_obj_txg;
+ return (0);
+}
+
+static void
+swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t
+ old_head, dmu_tx_t *tx)
+{
+ if (spa_err_obj == 0)
+ return;
+
+ uint64_t old_head_errlog;
+ int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj,
+ old_head, &old_head_errlog);
+
+ /* If no error log, then there is nothing to do. */
+ if (error != 0)
+ return;
+
+ uint64_t txg;
+ error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg);
+ if (error != 0)
+ return;
+
+ /*
+ * Create an error log if the file system being promoted does not
+ * already have one.
+ */
+ uint64_t new_head_errlog;
+ error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head,
+ &new_head_errlog);
+
+ if (error != 0) {
+ new_head_errlog = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
+
+ (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj,
+ new_head, new_head_errlog, tx);
+ }
+
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zbookmark_err_phys_t err_block;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+
+ char *name = "";
+ name_to_errphys(za.za_name, &err_block);
+ if (err_block.zb_birth < txg) {
+ (void) zap_update(spa->spa_meta_objset, new_head_errlog,
+ za.za_name, 1, strlen(name) + 1, name, tx);
+
+ (void) zap_remove(spa->spa_meta_objset, old_head_errlog,
+ za.za_name, tx);
+ }
+ }
+ zap_cursor_fini(&zc);
+}
+
+void
+spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds,
+ dmu_tx_t *tx)
+{
+ mutex_enter(&spa->spa_errlog_lock);
+ swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx);
+ swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx);
+ mutex_exit(&spa->spa_errlog_lock);
+}
+
#if defined(_KERNEL)
/* error handling */
EXPORT_SYMBOL(spa_log_error);
@@ -415,4 +1185,14 @@ EXPORT_SYMBOL(spa_errlog_rotate);
EXPORT_SYMBOL(spa_errlog_drain);
EXPORT_SYMBOL(spa_errlog_sync);
EXPORT_SYMBOL(spa_get_errlists);
+EXPORT_SYMBOL(spa_delete_dataset_errlog);
+EXPORT_SYMBOL(spa_swap_errlog);
+EXPORT_SYMBOL(sync_error_list);
+EXPORT_SYMBOL(spa_upgrade_errlog);
#endif
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, INT, ZMOD_RW,
+ "Limit the number of errors which will be upgraded to the new "
+ "on-disk error log when enabling head_errlog");
+/* END CSTYLED */
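With head_errlog the error log becomes two-level: the top-level errlog ZAP is keyed by the head dataset's object number, and each value points to a nested ZAP whose entry names encode the faulty block (errphys_to_name()/name_to_errphys() above). A minimal sketch of walking that layout, assuming the caller holds spa_errlog_lock and only wants a count of entries:

/*
 * Illustrative sketch only -- not taken from this change.  Counts the
 * per-dataset error entries under a head_errlog-style errlog object.
 * Assumes the caller holds spa->spa_errlog_lock.
 */
static uint64_t
count_errlog_entries(spa_t *spa, uint64_t spa_err_obj)
{
	zap_cursor_t zc, inner;
	zap_attribute_t za, ia;
	uint64_t count = 0;

	if (spa_err_obj == 0)
		return (0);

	/* Outer ZAP: one entry per head dataset object. */
	for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
	    zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
		/* za_first_integer is the nested per-dataset errlog ZAP. */
		for (zap_cursor_init(&inner, spa->spa_meta_objset,
		    za.za_first_integer);
		    zap_cursor_retrieve(&inner, &ia) == 0;
		    zap_cursor_advance(&inner))
			count++;
		zap_cursor_fini(&inner);
	}
	zap_cursor_fini(&zc);
	return (count);
}

The same outer/inner walk is what delete_errlog() and delete_dataset_errlog() above perform, with removals instead of counting.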
diff --git a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
index 110a4eab99f9..f831509a4247 100644
--- a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
+++ b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
@@ -257,7 +257,12 @@ static unsigned long zfs_unflushed_log_block_min = 1000;
* terms of performance. Thus we have a hard limit in the size of the log in
* terms of blocks.
*/
-static unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
+static unsigned long zfs_unflushed_log_block_max = (1ULL << 17);
+
+/*
+ * We also have a hard limit on the size of the log in terms of dirty TXGs.
+ */
+static unsigned long zfs_unflushed_log_txg_max = 1000;
/*
* Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
@@ -333,9 +338,13 @@ spa_log_sm_set_blocklimit(spa_t *spa)
return;
}
- uint64_t calculated_limit =
- (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
- spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
+ uint64_t msdcount = 0;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e; e = list_next(&spa->spa_log_summary, e))
+ msdcount += e->lse_msdcount;
+
+ uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100;
+ spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit,
zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
}
@@ -380,8 +389,13 @@ spa_log_summary_verify_counts(spa_t *spa)
}
static boolean_t
-summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
+summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg)
{
+ if (e->lse_end == txg)
+ return (0);
+ if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max,
+ zfs_max_logsm_summary_length))
+ return (1);
uint64_t blocks_per_row = MAX(1,
DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
zfs_max_logsm_summary_length));
@@ -401,7 +415,7 @@ summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
* the metaslab.
*/
void
-spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
+spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty)
{
/*
* We don't track summary data for read-only pools and this function
@@ -429,6 +443,8 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
}
target->lse_mscount--;
+ if (dirty)
+ target->lse_msdcount--;
}
/*
@@ -490,8 +506,10 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
void
spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
{
- for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
- e != NULL; e = list_head(&spa->spa_log_summary)) {
+ log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ if (e->lse_txgcount > 0)
+ e->lse_txgcount--;
+ for (; e != NULL; e = list_head(&spa->spa_log_summary)) {
if (e->lse_blkcount > blocks_gone) {
/*
* Assert that we stopped at an entry that is not
@@ -560,31 +578,52 @@ spa_log_sm_increment_current_mscount(spa_t *spa)
static void
summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
- uint64_t nblocks)
+ uint64_t metaslabs_dirty, uint64_t nblocks)
{
log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
- if (e == NULL || summary_entry_is_full(spa, e)) {
+ if (e == NULL || summary_entry_is_full(spa, e, txg)) {
e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
- e->lse_start = txg;
+ e->lse_start = e->lse_end = txg;
+ e->lse_txgcount = 1;
list_insert_tail(&spa->spa_log_summary, e);
}
ASSERT3U(e->lse_start, <=, txg);
+ if (e->lse_end < txg) {
+ e->lse_end = txg;
+ e->lse_txgcount++;
+ }
e->lse_mscount += metaslabs_flushed;
+ e->lse_msdcount += metaslabs_dirty;
e->lse_blkcount += nblocks;
}
static void
spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
{
- summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
+ summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks);
}
void
-spa_log_summary_add_flushed_metaslab(spa_t *spa)
+spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty)
{
- summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
+ summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 1 : 0, 0);
+}
+
+void
+spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg)
+{
+ log_summary_entry_t *target = NULL;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e != NULL; e = list_next(&spa->spa_log_summary, e)) {
+ if (e->lse_start > txg)
+ break;
+ target = e;
+ }
+ ASSERT3P(target, !=, NULL);
+ ASSERT3U(target->lse_mscount, !=, 0);
+ target->lse_msdcount++;
}
/*
@@ -630,6 +669,11 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
int64_t available_blocks =
spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
+ int64_t available_txgs = zfs_unflushed_log_txg_max;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e; e = list_next(&spa->spa_log_summary, e))
+ available_txgs -= e->lse_txgcount;
+
/*
* This variable tells us the total number of flushes needed to
* keep the log size within the limit when we reach txgs_in_future.
@@ -637,9 +681,7 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
uint64_t total_flushes = 0;
/* Holds the current maximum of our estimates so far. */
- uint64_t max_flushes_pertxg =
- MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
- zfs_min_metaslabs_to_flush);
+ uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush;
/*
* For our estimations we only look as far in the future
@@ -653,11 +695,14 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
* then keep skipping TXGs accumulating more blocks
* based on the incoming rate until we exceed it.
*/
- if (available_blocks >= 0) {
- uint64_t skip_txgs = (available_blocks / incoming) + 1;
+ if (available_blocks >= 0 && available_txgs >= 0) {
+ uint64_t skip_txgs = MIN(available_txgs + 1,
+ (available_blocks / incoming) + 1);
available_blocks -= (skip_txgs * incoming);
+ available_txgs -= skip_txgs;
txgs_in_future += skip_txgs;
ASSERT3S(available_blocks, >=, -incoming);
+ ASSERT3S(available_txgs, >=, -1);
}
/*
@@ -666,9 +711,10 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
* based on the current entry in the summary, updating
* our available_blocks.
*/
- ASSERT3S(available_blocks, <, 0);
+ ASSERT(available_blocks < 0 || available_txgs < 0);
available_blocks += e->lse_blkcount;
- total_flushes += e->lse_mscount;
+ available_txgs += e->lse_txgcount;
+ total_flushes += e->lse_msdcount;
/*
* Keep the running maximum of the total_flushes that
@@ -680,8 +726,6 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
*/
max_flushes_pertxg = MAX(max_flushes_pertxg,
DIV_ROUND_UP(total_flushes, txgs_in_future));
- ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
- max_flushes_pertxg);
}
return (max_flushes_pertxg);
}
@@ -771,14 +815,11 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
uint64_t want_to_flush;
if (spa_flush_all_logs_requested(spa)) {
ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
- want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
+ want_to_flush = UINT64_MAX;
} else {
want_to_flush = spa_estimate_metaslabs_to_flush(spa);
}
- ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
- want_to_flush);
-
/* Used purely for verification purposes */
uint64_t visited = 0;
@@ -809,31 +850,22 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
break;
- mutex_enter(&curr->ms_sync_lock);
- mutex_enter(&curr->ms_lock);
- boolean_t flushed = metaslab_flush(curr, tx);
- mutex_exit(&curr->ms_lock);
- mutex_exit(&curr->ms_sync_lock);
-
- /*
- * If we failed to flush a metaslab (because it was loading),
- * then we are done with the block heuristic as it's not
- * possible to destroy any log space maps once you've skipped
- * a metaslab. In that case we just set our counter to 0 but
- * we continue looping in case there is still memory pressure
- * due to unflushed changes. Note that, flushing a metaslab
- * that is not the oldest flushed in the pool, will never
- * destroy any log space maps [see spa_cleanup_old_sm_logs()].
- */
- if (!flushed) {
- want_to_flush = 0;
- } else if (want_to_flush > 0) {
- want_to_flush--;
- }
+ if (metaslab_unflushed_dirty(curr)) {
+ mutex_enter(&curr->ms_sync_lock);
+ mutex_enter(&curr->ms_lock);
+ metaslab_flush(curr, tx);
+ mutex_exit(&curr->ms_lock);
+ mutex_exit(&curr->ms_sync_lock);
+ if (want_to_flush > 0)
+ want_to_flush--;
+ } else
+ metaslab_unflushed_bump(curr, tx, B_FALSE);
visited++;
}
ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
+
+ spa_log_sm_set_blocklimit(spa);
}
/*
@@ -904,6 +936,7 @@ spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
avl_remove(&spa->spa_sm_logs_by_txg, sls);
space_map_free_obj(mos, sls->sls_sm_obj, tx);
VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
+ spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks);
spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
kmem_free(sls, sizeof (spa_log_sm_t));
}
@@ -963,12 +996,7 @@ spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
0, UINT64_MAX, SPA_MINBLOCKSHIFT));
- /*
- * If the log space map feature was just enabled, the blocklimit
- * has not yet been set.
- */
- if (spa_log_sm_blocklimit(spa) == 0)
- spa_log_sm_set_blocklimit(spa);
+ spa_log_sm_set_blocklimit(spa);
}
/*
@@ -1094,12 +1122,18 @@ spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
panic("invalid maptype_t");
break;
}
+ if (!metaslab_unflushed_dirty(ms)) {
+ metaslab_set_unflushed_dirty(ms, B_TRUE);
+ spa_log_summary_dirty_flushed_metaslab(spa,
+ metaslab_unflushed_txg(ms));
+ }
return (0);
}
static int
spa_ld_log_sm_data(spa_t *spa)
{
+ spa_log_sm_t *sls, *psls;
int error = 0;
/*
@@ -1113,41 +1147,71 @@ spa_ld_log_sm_data(spa_t *spa)
ASSERT0(spa->spa_unflushed_stats.sus_memused);
hrtime_t read_logs_starttime = gethrtime();
- /* this is a no-op when we don't have space map logs */
- for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
- sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
- space_map_t *sm = NULL;
- error = space_map_open(&sm, spa_meta_objset(spa),
- sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
- if (error != 0) {
- spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
- "space_map_open(obj=%llu) [error %d]",
- (u_longlong_t)sls->sls_sm_obj, error);
- goto out;
+
+ /* Prefetch log spacemaps dnodes. */
+ for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
+ sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj,
+ 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+ }
+
+ uint_t pn = 0;
+ uint64_t ps = 0;
+ psls = sls = avl_first(&spa->spa_sm_logs_by_txg);
+ while (sls != NULL) {
+ /* Prefetch up to 16 log spacemaps (at most ~2 * dmu_prefetch_max bytes) ahead. */
+ if (psls != NULL && pn < 16 &&
+ (pn < 2 || ps < 2 * dmu_prefetch_max)) {
+ error = space_map_open(&psls->sls_sm,
+ spa_meta_objset(spa), psls->sls_sm_obj, 0,
+ UINT64_MAX, SPA_MINBLOCKSHIFT);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_ld_log_sm_data(): "
+ "failed at space_map_open(obj=%llu) "
+ "[error %d]",
+ (u_longlong_t)sls->sls_sm_obj, error);
+ goto out;
+ }
+ dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj,
+ 0, 0, space_map_length(psls->sls_sm),
+ ZIO_PRIORITY_ASYNC_READ);
+ pn++;
+ ps += space_map_length(psls->sls_sm);
+ psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls);
+ continue;
}
+ /* Load TXG log spacemap into ms_unflushed_allocs/frees. */
+ cond_resched();
+ ASSERT0(sls->sls_nblocks);
+ sls->sls_nblocks = space_map_nblocks(sls->sls_sm);
+ spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
+ summary_add_data(spa, sls->sls_txg,
+ sls->sls_mscount, 0, sls->sls_nblocks);
+
struct spa_ld_log_sm_arg vla = {
.slls_spa = spa,
.slls_txg = sls->sls_txg
};
- error = space_map_iterate(sm, space_map_length(sm),
- spa_ld_log_sm_cb, &vla);
+ error = space_map_iterate(sls->sls_sm,
+ space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla);
if (error != 0) {
- space_map_close(sm);
spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
"at space_map_iterate(obj=%llu) [error %d]",
(u_longlong_t)sls->sls_sm_obj, error);
goto out;
}
- ASSERT0(sls->sls_nblocks);
- sls->sls_nblocks = space_map_nblocks(sm);
- spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
- summary_add_data(spa, sls->sls_txg,
- sls->sls_mscount, sls->sls_nblocks);
+ pn--;
+ ps -= space_map_length(sls->sls_sm);
+ space_map_close(sls->sls_sm);
+ sls->sls_sm = NULL;
+ sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
- space_map_close(sm);
+ /* Update the log block limit to account for the just-loaded log. */
+ spa_log_sm_set_blocklimit(spa);
}
+
hrtime_t read_logs_endtime = gethrtime();
spa_load_note(spa,
"read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
@@ -1157,6 +1221,18 @@ spa_ld_log_sm_data(spa_t *spa)
(longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
out:
+ if (error != 0) {
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ if (sls->sls_sm) {
+ space_map_close(sls->sls_sm);
+ sls->sls_sm = NULL;
+ }
+ }
+ } else {
+ ASSERT0(pn);
+ ASSERT0(ps);
+ }
/*
* Now that the metaslabs contain their unflushed changes:
* [1] recalculate their actual allocated space
@@ -1237,6 +1313,9 @@ spa_ld_unflushed_txgs(vdev_t *vd)
}
ms->ms_unflushed_txg = entry.msp_unflushed_txg;
+ ms->ms_unflushed_dirty = B_FALSE;
+ ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(ms->ms_unflushed_frees));
if (ms->ms_unflushed_txg != 0) {
mutex_enter(&spa->spa_flushed_ms_lock);
avl_add(&spa->spa_metaslabs_by_flushed, ms);
@@ -1300,6 +1379,10 @@ ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW,
"Lower-bound limit for the maximum amount of blocks allowed in "
"log spacemap (see zfs_unflushed_log_block_max)");
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, ULONG, ZMOD_RW,
+ "Hard limit (upper-bound) in the size of the space map log "
+ "in terms of dirty TXGs.");
+
ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW,
"Tunable used to determine the number of blocks that can be used for "
"the spacemap log, expressed as a percentage of the total number of "
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index db2d2c5e44fb..ce7f020a0d86 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -1523,13 +1523,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
if (txg == 0)
spa_config_exit(spa, SCL_ALLOC, FTAG);
- /*
- * Regardless whether this vdev was just added or it is being
- * expanded, the metaslab count has changed. Recalculate the
- * block limit.
- */
- spa_log_sm_set_blocklimit(spa);
-
return (0);
}
diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c
index 17f9d6c90804..5508d273758d 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_removal.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c
@@ -1386,7 +1386,6 @@ vdev_remove_complete(spa_t *spa)
vdev_metaslab_fini(vd);
metaslab_group_destroy(vd->vdev_mg);
vd->vdev_mg = NULL;
- spa_log_sm_set_blocklimit(spa);
}
if (vd->vdev_log_mg != NULL) {
ASSERT0(vd->vdev_ms_count);
@@ -2131,7 +2130,6 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
* metaslab_class_histogram_verify()
*/
vdev_metaslab_fini(vd);
- spa_log_sm_set_blocklimit(spa);
spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
*txg = spa_vdev_config_enter(spa);
@@ -2251,7 +2249,6 @@ spa_vdev_remove_top_check(vdev_t *vd)
* and not be raidz or draid.
*/
vdev_t *rvd = spa->spa_root_vdev;
- int num_indirect = 0;
for (uint64_t id = 0; id < rvd->vdev_children; id++) {
vdev_t *cvd = rvd->vdev_child[id];
@@ -2267,8 +2264,6 @@ spa_vdev_remove_top_check(vdev_t *vd)
if (cvd->vdev_ashift != 0 &&
cvd->vdev_alloc_bias == VDEV_BIAS_NONE)
ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
- if (cvd->vdev_ops == &vdev_indirect_ops)
- num_indirect++;
if (!vdev_is_concrete(cvd))
continue;
if (vdev_get_nparity(cvd) != 0)
diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c
index 9d16fff81d0a..fc9167aa6611 100644
--- a/sys/contrib/openzfs/module/zfs/zfeature.c
+++ b/sys/contrib/openzfs/module/zfs/zfeature.c
@@ -389,6 +389,13 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
!spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION) &&
feature->fi_feature == SPA_FEATURE_BOOKMARK_V2)
spa->spa_errata = 0;
+
+ /*
+ * Convert the old on-disk error log to the new format when activating
+ * the head_errlog feature.
+ */
+ if (feature->fi_feature == SPA_FEATURE_HEAD_ERRLOG)
+ spa_upgrade_errlog(spa, tx);
}
static void
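Operationally, this conversion runs once, in the syncing context of the TXG in which head_errlog is enabled (typically via something like "zpool set feature@head_errlog=enabled <pool>" or "zpool upgrade"), and the spa_upgrade_errlog_limit module parameter added in spa_errlog.c above bounds how many legacy error-log entries are rewritten into the new per-dataset format.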
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
index a2824c5cc804..b3f32d64f3ef 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@@ -5670,7 +5670,7 @@ zfs_ioc_error_log(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
- size_t count = (size_t)zc->zc_nvlist_dst_size;
+ uint64_t count = zc->zc_nvlist_dst_size;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
index 62806e9fe8b1..a039b4da2833 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
@@ -68,7 +68,9 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
+ atomic_inc_32(&zp->z_sync_writes_cnt);
zil_commit(zfsvfs->z_log, zp->z_id);
+ atomic_dec_32(&zp->z_sync_writes_cnt);
ZFS_EXIT(zfsvfs);
}
tsd_set(zfs_fsyncer_key, NULL);
@@ -357,11 +359,11 @@ zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr,
if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) {
vattr_t va = {0};
- va.va_mask = AT_MODE;
+ va.va_mask = ATTR_MODE;
va.va_nodeid = zp->z_id;
va.va_mode = newmode;
- zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, AT_MODE,
- NULL);
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va,
+ ATTR_MODE, NULL);
*clear_setid_bits_txgp = dmu_tx_get_txg(tx);
}
} else {
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index f6adea572418..2a16d5cef2e2 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -166,15 +166,6 @@ zio_init(void)
cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
KMC_NODEBUG : 0;
-#if defined(_ILP32) && defined(_KERNEL)
- /*
- * Cache size limited to 1M on 32-bit platforms until ARC
- * buffers no longer require virtual address space.
- */
- if (size > zfs_max_recordsize)
- break;
-#endif
-
while (!ISP2(p2))
p2 &= p2 - 1;
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
index eb68b05c567b..ac7c3a0c3232 100644
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -513,6 +513,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
zvol_replay_err, /* TX_MKDIR_ATTR */
zvol_replay_err, /* TX_MKDIR_ACL_ATTR */
zvol_replay_err, /* TX_WRITE2 */
+ zvol_replay_err, /* TX_SETSAXATTR */
};
/*